Phase 4 STT pipeline implemented — Silero VAD + faster-whisper — still not working well at all
This commit is contained in:
193
stt/whisper_transcriber.py
Normal file
193
stt/whisper_transcriber.py
Normal file
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
Faster-Whisper Transcriber
|
||||
|
||||
GPU-accelerated speech-to-text using faster-whisper (CTranslate2).
|
||||
Supports streaming transcription with partial results.
|
||||
"""
|
||||
|
||||
import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
from typing import AsyncIterator, Iterator, List, Optional

import numpy as np

from faster_whisper import WhisperModel
|
||||
|
||||
logger = logging.getLogger('whisper')
|
||||
|
||||
|
||||
class WhisperTranscriber:
    """
    Faster-Whisper based transcription with streaming support.

    Runs on GPU (GTX 1660) with small model for balance of speed/quality.
    Blocking model calls are dispatched to a small thread pool so the
    asyncio event loop is never blocked.
    """

    def __init__(
        self,
        model_size: str = "small",
        device: str = "cuda",
        compute_type: str = "float16",
        language: str = "en",
        beam_size: int = 5
    ):
        """
        Initialize Whisper transcriber.

        Args:
            model_size: Model size (tiny, base, small, medium, large)
            device: Device to run on (cuda or cpu)
            compute_type: Compute precision (float16, int8, int8_float16)
            language: Language code for transcription
            beam_size: Beam search size (higher = better quality, slower)
        """
        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.language = language
        self.beam_size = beam_size

        logger.info(f"Loading Faster-Whisper model: {model_size} on {device}...")

        # Load model (weights are cached under /models after first download).
        self.model = WhisperModel(
            model_size,
            device=device,
            compute_type=compute_type,
            download_root="/models"
        )

        # Thread pool for blocking transcription calls; two workers let a
        # streaming job and a one-shot job overlap without queueing.
        self.executor = ThreadPoolExecutor(max_workers=2)

        logger.info(f"Whisper model loaded: {model_size} ({compute_type})")

    async def transcribe_async(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        initial_prompt: Optional[str] = None
    ) -> str:
        """
        Transcribe audio asynchronously (non-blocking).

        Args:
            audio: Audio data as numpy array (float32 in [-1, 1]; int16 is
                rescaled automatically)
            sample_rate: Audio sample rate (faster-whisper expects 16 kHz)
            initial_prompt: Optional prompt to guide transcription

        Returns:
            Transcribed text ("" for empty/silent input)
        """
        # get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()

        # Run the blocking model call in the thread pool so the event loop
        # stays responsive.
        return await loop.run_in_executor(
            self.executor,
            self._transcribe_blocking,
            audio,
            sample_rate,
            initial_prompt
        )

    def _transcribe_blocking(
        self,
        audio: np.ndarray,
        sample_rate: int,
        initial_prompt: Optional[str]
    ) -> str:
        """
        Blocking transcription call (runs in thread pool).

        Returns the concatenated, stripped text of all decoded segments,
        or "" for empty input.
        """
        # Nothing to decode; avoids a pointless (and slow) model call.
        if audio.size == 0:
            return ""

        # faster-whisper expects float32 PCM in [-1, 1]; int16 input is
        # rescaled accordingly.
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32) / 32768.0

        # faster-whisper assumes 16 kHz when given a raw array — warn
        # instead of silently mis-transcribing resampled-wrong audio.
        if sample_rate != 16000:
            logger.warning(
                "Audio sample rate is %d Hz but faster-whisper expects "
                "16000 Hz; transcription quality may suffer", sample_rate
            )

        segments, info = self.model.transcribe(
            audio,
            language=self.language,
            beam_size=self.beam_size,
            initial_prompt=initial_prompt,
            vad_filter=False,  # We handle VAD separately
            word_timestamps=False  # Can enable for word-level timing
        )

        # `segments` is a lazy generator; joining it here performs the
        # actual decode work.
        full_text = " ".join(
            segment.text.strip() for segment in segments
        ).strip()

        logger.debug(f"Transcribed: '{full_text}' (language: {info.language}, "
                     f"probability: {info.language_probability:.2f})")

        return full_text

    async def transcribe_streaming(
        self,
        audio_stream: AsyncIterator[np.ndarray],
        sample_rate: int = 16000,
        chunk_duration_s: float = 2.0
    ) -> AsyncIterator[dict]:
        """
        Transcribe an audio stream with periodic partial results.

        Each partial re-transcribes the full accumulated audio, so later
        partials supersede earlier ones; the final result covers the
        entire stream.

        Args:
            audio_stream: Async iterator yielding audio chunks
            sample_rate: Audio sample rate
            chunk_duration_s: Minimum amount of NEW audio (seconds) that
                must arrive before another partial is emitted

        Yields:
            {"type": "partial", "text": "...", "duration": seconds}
            {"type": "final", "text": "...", "duration": seconds}
        """
        accumulated_audio = []
        # Running sample count — maintained incrementally instead of
        # re-summing every chunk (was O(n^2) over a long utterance).
        total_samples = 0
        # Sample count at the last partial. Without this, once the first
        # threshold was crossed the full buffer was re-transcribed on
        # EVERY incoming chunk, so latency grew quadratically with
        # utterance length.
        last_partial_samples = 0
        chunk_samples = int(chunk_duration_s * sample_rate)

        async for audio_chunk in audio_stream:
            accumulated_audio.append(audio_chunk)
            total_samples += len(audio_chunk)

            # Emit a partial only after chunk_duration_s of new audio.
            if total_samples - last_partial_samples >= chunk_samples:
                audio_data = np.concatenate(accumulated_audio)
                text = await self.transcribe_async(audio_data, sample_rate)
                last_partial_samples = total_samples

                if text:
                    yield {
                        "type": "partial",
                        "text": text,
                        "duration": total_samples / sample_rate
                    }

        # Final transcription over everything received.
        if accumulated_audio:
            audio_data = np.concatenate(accumulated_audio)
            text = await self.transcribe_async(audio_data, sample_rate)

            if text:
                yield {
                    "type": "final",
                    "text": text,
                    "duration": len(audio_data) / sample_rate
                }

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes."""
        return [
            "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr",
            "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi",
            "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no"
        ]

    def cleanup(self):
        """Shut down the worker thread pool and release resources."""
        self.executor.shutdown(wait=True)
        logger.info("Whisper transcriber cleaned up")
|
||||
Reference in New Issue
Block a user