Implement core speech-to-text pipeline

All major components: hotkey listener (rdev), audio capture (cpal), resampling (rubato), VAD (Silero ONNX), Parakeet v3 TDT transcription (ort), overlay window (winit+softbuffer), paste simulation (enigo+arboard), audio feedback (rodio), YAML config, CLI with clap, HuggingFace model download. ~2400 lines of Rust across 16 source files. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 16:47:46 +01:00
parent 6b737f92fe
commit 9b0bf7d9e3
22 changed files with 7750 additions and 0 deletions
@@ -0,0 +1,61 @@
+[package]
+name = "mouth"
+version = "0.1.0"
+edition = "2024"  # NOTE(review): 2024 edition needs Rust 1.85+; consider declaring rust-version
+description = "Offline speech-to-text with global hotkey and paste"
+
+[dependencies]
+# CLI argument parsing
+clap = { version = "4", features = ["derive"] }
+
+# Config (YAML file)
+serde = { version = "1", features = ["derive"] }
+serde_yaml = "0.9"  # NOTE(review): deprecated upstream; plan a move to a maintained YAML crate
+dirs = "6"
+
+# Interactive config TUI
+dialoguer = "0.11"
+
+# Logging
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+
+# Async runtime
+tokio = { version = "1", features = ["full"] }
+
+# Global hotkey listener
+rdev = "0.5"
+
+# Audio capture
+cpal = "0.15"
+
+# Audio resampling
+rubato = "0.16"
+
+# ONNX inference (Parakeet v3 + Silero VAD). ort is a pre-release, so pin it exactly: a bare "2.0.0-rc.12" would also accept later RCs.
+ort = { version = "=2.0.0-rc.12", features = ["download-binaries"] }
+ndarray = "0.17"
+
+# Model download from HuggingFace
+hf-hub = "0.4"
+indicatif = "0.17"
+
+# Clipboard (paste simulation)
+arboard = "3"
+
+# Keyboard simulation (paste simulation)
+enigo = { version = "0.3", features = ["serde"] }
+
+# Overlay window
+winit = "0.30"
+softbuffer = "0.4"
+
+# Audio feedback
+rodio = "0.20"
+
+# System info
+num_cpus = "1"
+
+# Error handling
+anyhow = "1"
+thiserror = "2"