Implemented experimental production-ready voice chat; relegated the old flow to a voice debug mode. Added a Web UI panel for Voice Chat.
@@ -49,6 +49,15 @@ class ParakeetTranscriber:
 
         logger.info(f"Loading Parakeet model: {model_name} on {device}...")
 
+        # Set PyTorch memory allocator settings for better memory management
+        if device == "cuda":
+            # Enable expandable segments to reduce fragmentation
+            import os
+            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+
+            # Clear cache before loading model
+            torch.cuda.empty_cache()
+
         # Load model via NeMo from HuggingFace
         self.model = EncDecRNNTBPEModel.from_pretrained(
             model_name=model_name,
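One subtlety worth noting: PYTORCH_CUDA_ALLOC_CONF is read when PyTorch's CUDA caching allocator first initializes, so setting it inside __init__ as this hunk does only takes effect if no CUDA allocation has happened earlier in the process. A minimal ordering sketch, assuming nothing has touched the GPU yet:

    import os

    # Must be set before the first CUDA allocation in this process,
    # otherwise the allocator has already initialized and ignores it.
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

    import torch  # importing torch is fine; the config is read lazily

    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # drop cached blocks before the large model load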
@@ -58,6 +67,11 @@ class ParakeetTranscriber:
         self.model.eval()
         if device == "cuda":
             self.model = self.model.cuda()
+            # Enable memory efficient attention if available
+            try:
+                self.model.encoder.use_memory_efficient_attention = True
+            except:
+                pass
 
         # Thread pool for blocking transcription calls
         self.executor = ThreadPoolExecutor(max_workers=2)
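The bare except: here swallows everything, including KeyboardInterrupt. A narrower probe of the same flag is sketched below; the attribute names come from the hunk above, and whether a given NeMo encoder actually exposes the flag is exactly what is being tested:

    def enable_memory_efficient_attention(model) -> bool:
        """Best-effort toggle; returns True if the encoder exposed the flag."""
        encoder = getattr(model, 'encoder', None)
        if encoder is not None and hasattr(encoder, 'use_memory_efficient_attention'):
            encoder.use_memory_efficient_attention = True
            return True
        return False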
@@ -119,7 +133,7 @@ class ParakeetTranscriber:
 
         # Transcribe using NeMo model
         with torch.no_grad():
-            # Convert to tensor
+            # Convert to tensor and keep on GPU to avoid CPU/GPU bouncing
             audio_signal = torch.from_numpy(audio).unsqueeze(0)
             audio_signal_len = torch.tensor([len(audio)])
 
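For reference, the tensor prep above produces a batch of one. A self-contained sketch of the shapes involved, using dummy audio and an assumed 16 kHz rate:

    import numpy as np
    import torch

    audio = np.zeros(16000, dtype=np.float32)            # e.g. 1 s of 16 kHz mono
    audio_signal = torch.from_numpy(audio).unsqueeze(0)  # (16000,) -> (1, 16000)
    audio_signal_len = torch.tensor([len(audio)])        # per-item lengths, shape (1,)

    # Moving both to the GPU once, as the next hunk does, avoids the
    # host round-trips the old comment warned about.
    if torch.cuda.is_available():
        audio_signal = audio_signal.cuda()
        audio_signal_len = audio_signal_len.cuda()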
@@ -127,12 +141,14 @@ class ParakeetTranscriber:
                 audio_signal = audio_signal.cuda()
                 audio_signal_len = audio_signal_len.cuda()
 
-            # Get transcription with timestamps
-            # NeMo returns list of Hypothesis objects when timestamps=True
+            # Get transcription
+            # NeMo returns list of Hypothesis objects
+            # Note: timestamps=True causes significant VRAM usage (~1-2GB extra)
+            # Only enable for final transcriptions, not streaming partials
             transcriptions = self.model.transcribe(
-                audio=[audio_signal.squeeze(0).cpu().numpy()],
+                audio=[audio],  # Pass NumPy array directly (NeMo handles it efficiently)
                 batch_size=1,
-                timestamps=True  # Enable timestamps to get word-level data
+                timestamps=return_timestamps  # Only use timestamps when explicitly requested
             )
 
         # Extract text from Hypothesis object
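The practical effect of threading return_timestamps through to the model's transcribe() call is that callers can keep streaming partials cheap and only pay the timestamp VRAM cost on finals. A hypothetical usage sketch; the public wrapper's name and signature are assumptions, not taken from this diff:

    def partial_then_final(transcriber, chunk_audio, full_audio):
        # `transcriber.transcribe` stands in for whatever public method
        # wraps the hunk above (name assumed for illustration).
        partial_text = transcriber.transcribe(chunk_audio, return_timestamps=False)
        final_result = transcriber.transcribe(full_audio, return_timestamps=True)
        return partial_text, final_result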
@@ -144,9 +160,9 @@ class ParakeetTranscriber:
             # Hypothesis object has .text attribute
             text = hypothesis.text.strip() if hasattr(hypothesis, 'text') else str(hypothesis).strip()
 
-            # Extract word-level timestamps if available
+            # Extract word-level timestamps if available and requested
             words = []
-            if hasattr(hypothesis, 'timestamp') and hypothesis.timestamp:
+            if return_timestamps and hasattr(hypothesis, 'timestamp') and hypothesis.timestamp:
                 # timestamp is a dict with 'word' key containing list of word timestamps
                 word_timestamps = hypothesis.timestamp.get('word', [])
                 for word_info in word_timestamps:
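Continuing the loop, each word_info is flattened into a small dict. A sketch of the whole helper; the 'start'/'end' key names inside NeMo's word-timestamp dicts are an assumption here, hedged with .get():

    def words_from_hypothesis(hypothesis, return_timestamps=True):
        """Mirrors the extraction above; key names beyond 'word' are assumed."""
        words = []
        if return_timestamps and getattr(hypothesis, 'timestamp', None):
            for word_info in hypothesis.timestamp.get('word', []):
                words.append({
                    'word': word_info.get('word', ''),
                    'start': word_info.get('start'),
                    'end': word_info.get('end'),
                })
        return words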
@@ -165,6 +181,10 @@ class ParakeetTranscriber:
                 }
             else:
                 return text
 
+        # Note: We do NOT call torch.cuda.empty_cache() here
+        # That breaks PyTorch's memory allocator and causes fragmentation
+        # Let PyTorch manage its own memory pool
+
     async def transcribe_streaming(
         self,
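The closing note is sound: torch.cuda.empty_cache() hands cached blocks back to the driver, so the allocator must re-request them later and its pool fragments. When VRAM needs watching, reading the counters is side-effect-free; a small sketch:

    import torch

    def log_vram(tag: str) -> None:
        # Read the allocator's counters instead of resetting its pool.
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 2**20  # MiB held by live tensors
            reserved = torch.cuda.memory_reserved() / 2**20    # MiB cached by the allocator
            print(f"[{tag}] allocated={allocated:.0f} MiB reserved={reserved:.0f} MiB")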