Phase 4 STT pipeline implemented — Silero VAD + faster-whisper — still not working well at all
This commit is contained in:
193
stt/whisper_transcriber.py
Normal file
193
stt/whisper_transcriber.py
Normal file
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
Faster-Whisper Transcriber
|
||||
|
||||
GPU-accelerated speech-to-text using faster-whisper (CTranslate2).
|
||||
Supports streaming transcription with partial results.
|
||||
"""
|
||||
|
||||
import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
from typing import AsyncIterator, Iterator, List, Optional

import numpy as np

from faster_whisper import WhisperModel
|
||||
|
||||
logger = logging.getLogger('whisper')
|
||||
|
||||
|
||||
class WhisperTranscriber:
    """
    Faster-Whisper based transcription with streaming support.

    Runs on GPU (GTX 1660) with small model for balance of speed/quality.
    Blocking model calls are dispatched to a small thread pool so the
    asyncio event loop is never blocked.
    """

    def __init__(
        self,
        model_size: str = "small",
        device: str = "cuda",
        compute_type: str = "float16",
        language: str = "en",
        beam_size: int = 5
    ):
        """
        Initialize Whisper transcriber.

        Args:
            model_size: Model size (tiny, base, small, medium, large)
            device: Device to run on (cuda or cpu)
            compute_type: Compute precision (float16, int8, int8_float16)
            language: Language code for transcription
            beam_size: Beam search size (higher = better quality, slower)
        """
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.language = language
        self.beam_size = beam_size

        logger.info(f"Loading Faster-Whisper model: {model_size} on {device}...")

        # Load model (weights are cached under /models after first download).
        self.model = WhisperModel(
            model_size,
            device=device,
            compute_type=compute_type,
            download_root="/models"
        )

        # Thread pool for blocking transcription calls; two workers let a
        # streaming job and a one-shot job overlap without queueing.
        self.executor = ThreadPoolExecutor(max_workers=2)

        logger.info(f"Whisper model loaded: {model_size} ({compute_type})")

    async def transcribe_async(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        initial_prompt: Optional[str] = None
    ) -> str:
        """
        Transcribe audio asynchronously (non-blocking).

        Args:
            audio: Audio data as numpy array (float32 in [-1, 1]; int16 is
                rescaled automatically)
            sample_rate: Audio sample rate (faster-whisper expects 16 kHz)
            initial_prompt: Optional prompt to guide transcription

        Returns:
            Transcribed text ("" for empty/silent input)
        """
        # get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()

        # Run the blocking model call in the thread pool so the event loop
        # stays responsive.
        return await loop.run_in_executor(
            self.executor,
            self._transcribe_blocking,
            audio,
            sample_rate,
            initial_prompt
        )

    def _transcribe_blocking(
        self,
        audio: np.ndarray,
        sample_rate: int,
        initial_prompt: Optional[str]
    ) -> str:
        """
        Blocking transcription call (runs in thread pool).

        Returns the concatenated, stripped text of all decoded segments,
        or "" for empty input.
        """
        # Nothing to decode; avoids a pointless (and slow) model call.
        if audio.size == 0:
            return ""

        # faster-whisper expects float32 PCM in [-1, 1]; int16 input is
        # rescaled accordingly.
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32) / 32768.0

        # faster-whisper assumes 16 kHz when given a raw array — warn
        # instead of silently mis-transcribing resampled-wrong audio.
        if sample_rate != 16000:
            logger.warning(
                "Audio sample rate is %d Hz but faster-whisper expects "
                "16000 Hz; transcription quality may suffer", sample_rate
            )

        segments, info = self.model.transcribe(
            audio,
            language=self.language,
            beam_size=self.beam_size,
            initial_prompt=initial_prompt,
            vad_filter=False,  # We handle VAD separately
            word_timestamps=False  # Can enable for word-level timing
        )

        # `segments` is a lazy generator; joining it here performs the
        # actual decode work.
        full_text = " ".join(
            segment.text.strip() for segment in segments
        ).strip()

        logger.debug(f"Transcribed: '{full_text}' (language: {info.language}, "
                     f"probability: {info.language_probability:.2f})")

        return full_text

    async def transcribe_streaming(
        self,
        audio_stream: AsyncIterator[np.ndarray],
        sample_rate: int = 16000,
        chunk_duration_s: float = 2.0
    ) -> AsyncIterator[dict]:
        """
        Transcribe an audio stream with periodic partial results.

        Each partial re-transcribes the full accumulated audio, so later
        partials supersede earlier ones; the final result covers the
        entire stream.

        Args:
            audio_stream: Async iterator yielding audio chunks
            sample_rate: Audio sample rate
            chunk_duration_s: Minimum amount of NEW audio (seconds) that
                must arrive before another partial is emitted

        Yields:
            {"type": "partial", "text": "...", "duration": seconds}
            {"type": "final", "text": "...", "duration": seconds}
        """
        accumulated_audio = []
        # Running sample count — maintained incrementally instead of
        # re-summing every chunk (was O(n^2) over a long utterance).
        total_samples = 0
        # Sample count at the last partial. Without this, once the first
        # threshold was crossed the full buffer was re-transcribed on
        # EVERY incoming chunk, so latency grew quadratically with
        # utterance length.
        last_partial_samples = 0
        chunk_samples = int(chunk_duration_s * sample_rate)

        async for audio_chunk in audio_stream:
            accumulated_audio.append(audio_chunk)
            total_samples += len(audio_chunk)

            # Emit a partial only after chunk_duration_s of new audio.
            if total_samples - last_partial_samples >= chunk_samples:
                audio_data = np.concatenate(accumulated_audio)
                text = await self.transcribe_async(audio_data, sample_rate)
                last_partial_samples = total_samples

                if text:
                    yield {
                        "type": "partial",
                        "text": text,
                        "duration": total_samples / sample_rate
                    }

        # Final transcription over everything received.
        if accumulated_audio:
            audio_data = np.concatenate(accumulated_audio)
            text = await self.transcribe_async(audio_data, sample_rate)

            if text:
                yield {
                    "type": "final",
                    "text": text,
                    "duration": len(audio_data) / sample_rate
                }

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes."""
        return [
            "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr",
            "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi",
            "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no"
        ]

    def cleanup(self):
        """Shut down the worker thread pool and release resources."""
        self.executor.shutdown(wait=True)
        logger.info("Whisper transcriber cleaned up")
|
||||
Reference in New Issue
Block a user