Decided on Parakeet ONNX Runtime. Works pretty great. Realtime voice chat possible now. UX lacking.

2026-01-19 00:29:44 +02:00
parent 0a8910fff8
commit 362108f4b0
34 changed files with 4593 additions and 73 deletions
--- a/stt-parakeet/tools/diagnose.py
+++ b/stt-parakeet/tools/diagnose.py
@@ -0,0 +1,219 @@
+"""
+System diagnostics for ASR setup
+"""
+import sys
+import subprocess
+
+
+def print_section(title):
+    """Print a section header."""
+    print(f"\n{'='*80}")
+    print(f" {title}")
+    print(f"{'='*80}\n")
+
+
+def check_python():
+    """Check Python version."""
+    print_section("Python Version")
+    print(f"Python: {sys.version}")
+    print(f"Executable: {sys.executable}")
+
+
+def check_packages():
+    """Check installed packages."""
+    print_section("Installed Packages")
+    
+    packages = [
+        "onnx-asr",
+        "onnxruntime",
+        "onnxruntime-gpu",
+        "numpy",
+        "websockets",
+        "sounddevice",
+        "soundfile",
+    ]
+    
+    for package in packages:
+        try:
+            if package == "onnx-asr":
+                import onnx_asr
+                version = getattr(onnx_asr, "__version__", "unknown")
+            elif package == "onnxruntime":
+                import onnxruntime
+                version = onnxruntime.__version__
+            elif package == "onnxruntime-gpu":
+                try:
+                    import onnxruntime
+                    version = onnxruntime.__version__
+                    print(f"✓ {package}: {version}")
+                except ImportError:
+                    print(f"✗ {package}: Not installed")
+                continue
+            elif package == "numpy":
+                import numpy
+                version = numpy.__version__
+            elif package == "websockets":
+                import websockets
+                version = websockets.__version__
+            elif package == "sounddevice":
+                import sounddevice
+                version = sounddevice.__version__
+            elif package == "soundfile":
+                import soundfile
+                version = soundfile.__version__
+            
+            print(f"✓ {package}: {version}")
+        except ImportError:
+            print(f"✗ {package}: Not installed")
+
+
+def check_cuda():
+    """Check CUDA availability."""
+    print_section("CUDA Information")
+    
+    # Check nvcc
+    try:
+        result = subprocess.run(
+            ["nvcc", "--version"],
+            capture_output=True,
+            text=True,
+        )
+        print("NVCC (CUDA Compiler):")
+        print(result.stdout)
+    except FileNotFoundError:
+        print("✗ nvcc not found - CUDA may not be installed")
+    
+    # Check nvidia-smi
+    try:
+        result = subprocess.run(
+            ["nvidia-smi"],
+            capture_output=True,
+            text=True,
+        )
+        print("NVIDIA GPU Information:")
+        print(result.stdout)
+    except FileNotFoundError:
+        print("✗ nvidia-smi not found - NVIDIA drivers may not be installed")
+
+
+def check_onnxruntime():
+    """Check ONNX Runtime providers."""
+    print_section("ONNX Runtime Providers")
+    
+    try:
+        import onnxruntime as ort
+        
+        print("Available providers:")
+        for provider in ort.get_available_providers():
+            print(f"  ✓ {provider}")
+        
+        # Check if CUDA is available
+        if "CUDAExecutionProvider" in ort.get_available_providers():
+            print("\n✓ GPU acceleration available via CUDA")
+        else:
+            print("\n✗ GPU acceleration NOT available")
+            print("  Make sure onnxruntime-gpu is installed and CUDA is working")
+        
+        # Get device info
+        print(f"\nONNX Runtime version: {ort.__version__}")
+        
+    except ImportError:
+        print("✗ onnxruntime not installed")
+
+
+def check_audio_devices():
+    """Check audio devices."""
+    print_section("Audio Devices")
+    
+    try:
+        import sounddevice as sd
+        
+        devices = sd.query_devices()
+        
+        print("Input devices:")
+        for i, device in enumerate(devices):
+            if device['max_input_channels'] > 0:
+                default = " [DEFAULT]" if i == sd.default.device[0] else ""
+                print(f"  [{i}] {device['name']}{default}")
+                print(f"      Channels: {device['max_input_channels']}")
+                print(f"      Sample rate: {device['default_samplerate']} Hz")
+        
+    except ImportError:
+        print("✗ sounddevice not installed")
+    except Exception as e:
+        print(f"✗ Error querying audio devices: {e}")
+
+
+def check_model_files():
+    """Check if model files exist."""
+    print_section("Model Files")
+    
+    from pathlib import Path
+    
+    model_dir = Path("models/parakeet")
+    
+    expected_files = [
+        "config.json",
+        "encoder-parakeet-tdt-0.6b-v3.onnx",
+        "decoder_joint-parakeet-tdt-0.6b-v3.onnx",
+        "vocab.txt",
+    ]
+    
+    if not model_dir.exists():
+        print(f"✗ Model directory not found: {model_dir}")
+        print("  Models will be downloaded on first run")
+        return
+    
+    print(f"Model directory: {model_dir.absolute()}")
+    print("\nExpected files:")
+    
+    for filename in expected_files:
+        filepath = model_dir / filename
+        if filepath.exists():
+            size_mb = filepath.stat().st_size / (1024 * 1024)
+            print(f"  ✓ {filename} ({size_mb:.1f} MB)")
+        else:
+            print(f"  ✗ {filename} (missing)")
+
+
+def test_onnx_asr():
+    """Test onnx-asr import and basic functionality."""
+    print_section("onnx-asr Test")
+    
+    try:
+        import onnx_asr
+        
+        print("✓ onnx-asr imported successfully")
+        print(f"  Version: {getattr(onnx_asr, '__version__', 'unknown')}")
+        
+        # Test loading model info (without downloading)
+        print("\n✓ onnx-asr is ready to use")
+        print("  Run test_offline.py to download models and test transcription")
+        
+    except ImportError as e:
+        print(f"✗ Failed to import onnx-asr: {e}")
+    except Exception as e:
+        print(f"✗ Error testing onnx-asr: {e}")
+
+
+def main():
+    """Run all diagnostics."""
+    print("\n" + "="*80)
+    print(" ASR System Diagnostics")
+    print("="*80)
+    
+    check_python()
+    check_packages()
+    check_cuda()
+    check_onnxruntime()
+    check_audio_devices()
+    check_model_files()
+    test_onnx_asr()
+    
+    print("\n" + "="*80)
+    print(" Diagnostics Complete")
+    print("="*80 + "\n")
+
+
+if __name__ == "__main__":
+    main()
--- a/stt-parakeet/tools/test_offline.py
+++ b/stt-parakeet/tools/test_offline.py
@@ -0,0 +1,114 @@
+"""
+Test offline ASR pipeline with onnx-asr
+"""
+import soundfile as sf
+import numpy as np
+import sys
+import argparse
+from pathlib import Path
+from asr.asr_pipeline import ASRPipeline
+
+
+def test_transcription(audio_file: str, use_vad: bool = False, quantization: str = None):
+    """
+    Test ASR transcription on an audio file.
+    
+    Args:
+        audio_file: Path to audio file
+        use_vad: Whether to use VAD
+        quantization: Optional quantization (e.g., "int8")
+    """
+    print(f"\n{'='*80}")
+    print(f"Testing ASR Pipeline with onnx-asr")
+    print(f"{'='*80}")
+    print(f"Audio file: {audio_file}")
+    print(f"Use VAD: {use_vad}")
+    print(f"Quantization: {quantization}")
+    print(f"{'='*80}\n")
+    
+    # Initialize pipeline
+    print("Initializing ASR pipeline...")
+    pipeline = ASRPipeline(
+        model_name="nemo-parakeet-tdt-0.6b-v3",
+        quantization=quantization,
+        use_vad=use_vad,
+    )
+    print("Pipeline initialized successfully!\n")
+    
+    # Read audio file
+    print(f"Reading audio file: {audio_file}")
+    audio, sr = sf.read(audio_file, dtype="float32")
+    print(f"Sample rate: {sr} Hz")
+    print(f"Audio shape: {audio.shape}")
+    print(f"Audio duration: {len(audio) / sr:.2f} seconds")
+    
+    # Ensure mono
+    if audio.ndim > 1:
+        print("Converting stereo to mono...")
+        audio = audio[:, 0]
+    
+    # Verify sample rate
+    if sr != 16000:
+        print(f"WARNING: Sample rate is {sr} Hz, expected 16000 Hz")
+        print("Consider resampling the audio file")
+    
+    print(f"\n{'='*80}")
+    print("Transcribing...")
+    print(f"{'='*80}\n")
+    
+    # Transcribe
+    result = pipeline.transcribe(audio, sample_rate=sr)
+    
+    # Display results
+    if use_vad and isinstance(result, list):
+        print("TRANSCRIPTION (with VAD):")
+        print("-" * 80)
+        for i, segment in enumerate(result, 1):
+            print(f"Segment {i}: {segment}")
+        print("-" * 80)
+    else:
+        print("TRANSCRIPTION:")
+        print("-" * 80)
+        print(result)
+        print("-" * 80)
+    
+    # Audio statistics
+    print(f"\nAUDIO STATISTICS:")
+    print(f"  dtype: {audio.dtype}")
+    print(f"  min: {audio.min():.6f}")
+    print(f"  max: {audio.max():.6f}")
+    print(f"  mean: {audio.mean():.6f}")
+    print(f"  std: {audio.std():.6f}")
+    
+    print(f"\n{'='*80}")
+    print("Test completed successfully!")
+    print(f"{'='*80}\n")
+    
+    return result
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Test offline ASR transcription")
+    parser.add_argument("audio_file", help="Path to audio file (WAV format)")
+    parser.add_argument("--use-vad", action="store_true", help="Enable VAD")
+    parser.add_argument("--quantization", default=None, choices=["int8", "fp16"], 
+                       help="Model quantization")
+    
+    args = parser.parse_args()
+    
+    # Check if file exists
+    if not Path(args.audio_file).exists():
+        print(f"ERROR: Audio file not found: {args.audio_file}")
+        sys.exit(1)
+    
+    try:
+        test_transcription(args.audio_file, args.use_vad, args.quantization)
+    except Exception as e:
+        print(f"\nERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()