115 lines
3.5 KiB
Python
115 lines
3.5 KiB
Python
|
|
"""
|
||
|
|
Silero VAD wrapper using onnx-asr library
|
||
|
|
"""
|
||
|
|
import numpy as np
|
||
|
|
import onnx_asr
|
||
|
|
from typing import Optional, Tuple
|
||
|
|
import logging
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
|
||
|
|
class SileroVAD:
    """
    Thin wrapper that loads the Silero VAD model through onnx-asr.

    The constructor loads the model and records the tuning parameters on
    the instance. The direct-detection methods (``detect_speech`` and
    ``is_speech``) are placeholders: they log a warning and return a fixed
    "no speech" result. Use ``get_vad()`` to obtain the loaded model.
    """

    def __init__(
        self,
        providers: Optional[list] = None,
        threshold: float = 0.5,
        min_speech_duration_ms: int = 250,
        min_silence_duration_ms: int = 100,
        window_size_samples: int = 512,
        speech_pad_ms: int = 30,
    ):
        """
        Load the Silero VAD model and remember the VAD settings.

        Args:
            providers: ONNX runtime providers handed to
                ``onnx_asr.load_vad``. When ``None``, CUDA is listed first
                with CPU after it.
            threshold: Speech probability threshold (0.0-1.0)
            min_speech_duration_ms: Minimum duration of speech segment
            min_silence_duration_ms: Minimum duration of silence to split
                segments
            window_size_samples: Window size for VAD processing
            speech_pad_ms: Padding around speech segments

        Note:
            The numeric settings are only stored as attributes; no method
            of this class reads them back.
        """
        selected_providers = (
            ["CUDAExecutionProvider", "CPUExecutionProvider"]
            if providers is None
            else providers
        )

        logger.info("Loading Silero VAD model...")
        self.vad = onnx_asr.load_vad("silero", providers=selected_providers)

        # Keep the caller's VAD settings available as plain attributes.
        self.threshold = threshold
        self.min_speech_duration_ms = min_speech_duration_ms
        self.min_silence_duration_ms = min_silence_duration_ms
        self.window_size_samples = window_size_samples
        self.speech_pad_ms = speech_pad_ms

        logger.info("Silero VAD initialized successfully")

    def detect_speech(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
    ) -> list:
        """
        Placeholder for direct speech-segment detection.

        Neither argument is inspected. The method logs a warning that
        points to ``model.with_vad()`` and reports no segments.

        Args:
            audio: Audio data as numpy array (float32)
            sample_rate: Sample rate of audio

        Returns:
            Always an empty list. The intended element shape is
            ``(start_sample, end_sample)`` per speech segment.
        """
        # onnx-asr is expected to apply VAD inside the recognition pipeline
        # (model.with_vad()), so this entry point exists mainly for
        # interface compatibility and does no processing of its own.
        logger.warning("Direct VAD detection - consider using model.with_vad() instead")
        no_segments = []
        return no_segments

    def is_speech(
        self,
        audio_chunk: np.ndarray,
        sample_rate: int = 16000,
    ) -> Tuple[bool, float]:
        """
        Placeholder for a per-chunk speech check.

        Neither argument is inspected. The method logs a warning and
        returns a fixed negative result.

        Args:
            audio_chunk: Audio chunk as numpy array (float32)
            sample_rate: Sample rate

        Returns:
            Always ``(False, 0.0)``, i.e. (is_speech, probability).
        """
        # Not implemented: automatic segmentation is left to
        # model.with_vad().
        logger.warning("Direct speech detection not implemented - use model.with_vad()")
        speech_found, probability = False, 0.0
        return speech_found, probability

    def get_vad(self):
        """
        Return the VAD model loaded by the constructor.

        Returns:
            The object produced by ``onnx_asr.load_vad`` and stored on
            ``self.vad``.
        """
        return self.vad
|
||
|
|
|
||
|
|
|
||
|
|
# Convenience function
|
||
|
|
def load_vad(**kwargs):
    """Construct a SileroVAD, forwarding every keyword argument to it."""
    vad_wrapper = SileroVAD(**kwargs)
    return vad_wrapper
|