Decided on Parakeet running on ONNX Runtime. It works very well, and real-time voice chat is now possible. The UX is still lacking.

This commit is contained in:
2026-01-19 00:29:44 +02:00
parent 0a8910fff8
commit 362108f4b0
34 changed files with 4593 additions and 73 deletions

View File

@@ -0,0 +1,219 @@
"""
System diagnostics for ASR setup
"""
import sys
import subprocess
def print_section(title):
    """Print *title* framed above and below by 80-character ``=`` rules.

    A blank line is emitted before the top rule and after the bottom rule.
    """
    rule = "=" * 80
    # Joining with newlines yields the same bytes as three separate prints:
    # blank line, rule, title, rule, blank line.
    print("", rule, f" {title}", rule, "", sep="\n")
def check_python():
    """Report the interpreter version string and the path of its executable."""
    print_section("Python Version")
    facts = (
        ("Python", sys.version),
        ("Executable", sys.executable),
    )
    for label, value in facts:
        print(f"{label}: {value}")
def check_packages():
    """Report the installed version of each package the ASR setup relies on.

    Versions are read from distribution metadata instead of from the imported
    module. ``onnxruntime`` and ``onnxruntime-gpu`` are separate distributions
    that both provide the same ``onnxruntime`` module, so importing the module
    cannot tell which of the two is installed. Reading metadata also avoids
    depending on every package exposing a ``__version__`` attribute, and
    avoids aborting the diagnostics when a package is installed but fails to
    import for a reason other than ``ImportError``.

    Prints one ``name: version`` line per package, or ``name: Not installed``
    when no distribution with that name is found.
    """
    # Function-scope import: nothing else in the module needs it.
    from importlib.metadata import PackageNotFoundError, version

    print_section("Installed Packages")
    packages = (
        "onnx-asr",
        "onnxruntime",
        "onnxruntime-gpu",
        "numpy",
        "websockets",
        "sounddevice",
        "soundfile",
    )
    for package in packages:
        try:
            installed = version(package)
        except PackageNotFoundError:
            print(f"{package}: Not installed")
        else:
            print(f"{package}: {installed}")
def check_cuda():
    """Print the output of the CUDA compiler and NVIDIA driver command-line tools.

    Runs ``nvcc --version`` and then ``nvidia-smi``. For each tool, prints a
    heading followed by the captured standard output, or a notice when the
    executable cannot be found.
    """
    print_section("CUDA Information")

    # Each probe: (command, heading shown before its output,
    #              message shown when the executable is missing).
    probes = (
        (
            ["nvcc", "--version"],
            "NVCC (CUDA Compiler):",
            "✗ nvcc not found - CUDA may not be installed",
        ),
        (
            ["nvidia-smi"],
            "NVIDIA GPU Information:",
            "✗ nvidia-smi not found - NVIDIA drivers may not be installed",
        ),
    )
    for command, heading, missing_message in probes:
        try:
            completed = subprocess.run(command, capture_output=True, text=True)
        except FileNotFoundError:
            print(missing_message)
            continue
        print(heading)
        print(completed.stdout)
def check_onnxruntime():
    """List the ONNX Runtime execution providers and report CUDA availability.

    Prints every available provider, whether ``CUDAExecutionProvider`` is
    among them, and the ONNX Runtime version. Prints a notice instead when
    ``onnxruntime`` cannot be imported.
    """
    print_section("ONNX Runtime Providers")
    try:
        import onnxruntime as ort

        print("Available providers:")
        providers = ort.get_available_providers()
        for name in providers:
            print(f"{name}")

        # GPU acceleration depends on the CUDA execution provider being listed.
        if "CUDAExecutionProvider" not in providers:
            print("\n✗ GPU acceleration NOT available")
            print(" Make sure onnxruntime-gpu is installed and CUDA is working")
        else:
            print("\n✓ GPU acceleration available via CUDA")

        print(f"\nONNX Runtime version: {ort.__version__}")
    except ImportError:
        print("✗ onnxruntime not installed")
def check_audio_devices():
    """List audio devices that have at least one input channel.

    For each such device, prints its index, name, input channel count and
    default sample rate, and tags the entry whose index equals
    ``sd.default.device[0]`` with ``[DEFAULT]``. Prints a notice when
    ``sounddevice`` is missing or when querying the devices fails.
    """
    print_section("Audio Devices")
    try:
        import sounddevice as sd

        all_devices = sd.query_devices()
        print("Input devices:")
        for index, info in enumerate(all_devices):
            # Skip devices without input channels.
            if info["max_input_channels"] <= 0:
                continue

            if index == sd.default.device[0]:
                marker = " [DEFAULT]"
            else:
                marker = ""

            print(f" [{index}] {info['name']}{marker}")
            print(f" Channels: {info['max_input_channels']}")
            print(f" Sample rate: {info['default_samplerate']} Hz")
    except ImportError:
        print("✗ sounddevice not installed")
    except Exception as exc:
        print(f"✗ Error querying audio devices: {exc}")
def check_model_files():
    """Report which of the expected model files exist under ``models/parakeet``.

    When the directory is missing, prints a notice and returns. Otherwise
    prints the absolute directory path and, for each expected file, either
    its size in MB or a ``(missing)`` marker.
    """
    print_section("Model Files")
    from pathlib import Path

    bytes_per_mb = 1024 * 1024
    model_dir = Path("models/parakeet")
    expected_names = (
        "config.json",
        "encoder-parakeet-tdt-0.6b-v3.onnx",
        "decoder_joint-parakeet-tdt-0.6b-v3.onnx",
        "vocab.txt",
    )

    if not model_dir.exists():
        print(f"✗ Model directory not found: {model_dir}")
        print(" Models will be downloaded on first run")
        return

    print(f"Model directory: {model_dir.absolute()}")
    print("\nExpected files:")
    for name in expected_names:
        candidate = model_dir / name
        if not candidate.exists():
            print(f"{name} (missing)")
            continue
        size_mb = candidate.stat().st_size / bytes_per_mb
        print(f"{name} ({size_mb:.1f} MB)")
def test_onnx_asr():
    """Check that ``onnx_asr`` can be imported and print its version.

    No model is loaded or downloaded; the function only confirms the import
    and points to ``test_offline.py`` for a full transcription test.
    """
    print_section("onnx-asr Test")
    try:
        import onnx_asr

        print("✓ onnx-asr imported successfully")
        # Fall back to "unknown" when the module has no __version__ attribute.
        reported_version = getattr(onnx_asr, "__version__", "unknown")
        print(f" Version: {reported_version}")
        print("\n✓ onnx-asr is ready to use")
        print(" Run test_offline.py to download models and test transcription")
    except ImportError as exc:
        print(f"✗ Failed to import onnx-asr: {exc}")
    except Exception as exc:
        print(f"✗ Error testing onnx-asr: {exc}")
def main():
    """Run every diagnostic check in order, framed by start and end banners."""
    banner = "=" * 80

    print("\n" + banner)
    print(" ASR System Diagnostics")
    print(banner)

    checks = (
        check_python,
        check_packages,
        check_cuda,
        check_onnxruntime,
        check_audio_devices,
        check_model_files,
        test_onnx_asr,
    )
    for run_check in checks:
        run_check()

    print("\n" + banner)
    print(" Diagnostics Complete")
    print(banner + "\n")
# Run the diagnostics only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,114 @@
"""
Test offline ASR pipeline with onnx-asr
"""
import soundfile as sf
import numpy as np
import sys
import argparse
from pathlib import Path
from asr.asr_pipeline import ASRPipeline
def test_transcription(audio_file: str, use_vad: bool = False, quantization: str = None):
    """
    Run the ASR pipeline on one audio file and print the transcription.

    Args:
        audio_file: Path to the audio file, read with ``soundfile``.
        use_vad: Forwarded to ``ASRPipeline``; when true and the result is a
            list, each item is printed as a separate segment.
        quantization: Optional quantization (e.g., "int8"), forwarded to
            ``ASRPipeline``.

    Returns:
        Whatever ``pipeline.transcribe`` returns for the audio.

    Raises:
        ValueError: If the audio file contains no samples.
    """
    rule = "=" * 80

    print(f"\n{rule}")
    print("Testing ASR Pipeline with onnx-asr")
    print(rule)
    print(f"Audio file: {audio_file}")
    print(f"Use VAD: {use_vad}")
    print(f"Quantization: {quantization}")
    print(f"{rule}\n")

    # Initialize pipeline
    print("Initializing ASR pipeline...")
    pipeline = ASRPipeline(
        model_name="nemo-parakeet-tdt-0.6b-v3",
        quantization=quantization,
        use_vad=use_vad,
    )
    print("Pipeline initialized successfully!\n")

    # Read audio file
    print(f"Reading audio file: {audio_file}")
    audio, sr = sf.read(audio_file, dtype="float32")
    print(f"Sample rate: {sr} Hz")
    print(f"Audio shape: {audio.shape}")

    # Fail early with a clear message: an empty array would otherwise be
    # transcribed and then crash in the min()/max() statistics below.
    if audio.size == 0:
        raise ValueError(f"Audio file contains no samples: {audio_file}")

    print(f"Audio duration: {len(audio) / sr:.2f} seconds")

    # Ensure mono: keep only the first channel of multi-channel audio.
    if audio.ndim > 1:
        print("Converting stereo to mono...")
        audio = audio[:, 0]

    # Verify sample rate (warning only; the audio is passed on unchanged)
    if sr != 16000:
        print(f"WARNING: Sample rate is {sr} Hz, expected 16000 Hz")
        print("Consider resampling the audio file")

    print(f"\n{rule}")
    print("Transcribing...")
    print(f"{rule}\n")

    # Transcribe
    result = pipeline.transcribe(audio, sample_rate=sr)

    # Display results
    divider = "-" * 80
    if use_vad and isinstance(result, list):
        print("TRANSCRIPTION (with VAD):")
        print(divider)
        for i, segment in enumerate(result, 1):
            print(f"Segment {i}: {segment}")
        print(divider)
    else:
        print("TRANSCRIPTION:")
        print(divider)
        print(result)
        print(divider)

    # Audio statistics
    print("\nAUDIO STATISTICS:")
    print(f" dtype: {audio.dtype}")
    print(f" min: {audio.min():.6f}")
    print(f" max: {audio.max():.6f}")
    print(f" mean: {audio.mean():.6f}")
    print(f" std: {audio.std():.6f}")

    print(f"\n{rule}")
    print("Test completed successfully!")
    print(f"{rule}\n")
    return result
def main():
    """Parse command-line arguments and run the transcription test.

    Exits with status 1 when the audio file does not exist or when the test
    raises; in the latter case the error and its traceback are printed first.
    """
    cli = argparse.ArgumentParser(description="Test offline ASR transcription")
    cli.add_argument("audio_file", help="Path to audio file (WAV format)")
    cli.add_argument("--use-vad", action="store_true", help="Enable VAD")
    cli.add_argument(
        "--quantization",
        default=None,
        choices=["int8", "fp16"],
        help="Model quantization",
    )
    options = cli.parse_args()

    # Check if file exists
    audio_path = Path(options.audio_file)
    if not audio_path.exists():
        print(f"ERROR: Audio file not found: {options.audio_file}")
        sys.exit(1)

    try:
        test_transcription(
            audio_file=options.audio_file,
            use_vad=options.use_vad,
            quantization=options.quantization,
        )
    except Exception as exc:
        print(f"\nERROR: {exc}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()