Decided on Parakeet ONNX Runtime. Works pretty great. Realtime voice chat possible now. UX lacking.
@@ -27,13 +27,13 @@ class VoiceReceiverSink(voice_recv.AudioSink):
     decodes/resamples as needed, and sends to STT clients for transcription.
     """
 
-    def __init__(self, voice_manager, stt_url: str = "ws://miku-stt:8000/ws/stt"):
+    def __init__(self, voice_manager, stt_url: str = "ws://miku-stt:8766/ws/stt"):
         """
-        Initialize voice receiver sink.
+        Initialize Voice Receiver.
 
         Args:
-            voice_manager: Reference to VoiceManager for callbacks
-            stt_url: Base URL for STT WebSocket server with path (port 8000 inside container)
+            voice_manager: The voice manager instance
+            stt_url: Base URL for STT WebSocket server with path (port 8766 inside container)
         """
         super().__init__()
         self.voice_manager = voice_manager
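Note: this hunk only touches the default STT endpoint and the `__init__` docstring. For reference, the sketch below shows the kind of per-user client interface the sink expects at `ws://miku-stt:8766/ws/stt`. The method names `is_connected()` and `send_final()` are taken from how `stt_clients` is used later in this diff; `connect()`, `send_audio()`, and the binary-PCM-plus-JSON-control protocol are assumptions, not the project's actual client.

import json

import websockets  # assumed dependency; the real client may be built on something else


class STTClientSketch:
    """Hypothetical stand-in for the per-user STT client the sink drives."""

    def __init__(self, url: str = "ws://miku-stt:8766/ws/stt"):
        self.url = url
        self._ws = None

    async def connect(self):
        self._ws = await websockets.connect(self.url)

    def is_connected(self) -> bool:
        # The sink checks this before requesting a final transcript.
        return self._ws is not None

    async def send_audio(self, pcm_chunk: bytes):
        # Stream decoded/resampled PCM as binary frames; the real wire format
        # used by miku-stt is not shown in this commit.
        await self._ws.send(pcm_chunk)

    async def send_final(self):
        # Ask the server to flush and return the complete transcription.
        # The {"type": "final"} control message is a guess.
        await self._ws.send(json.dumps({"type": "final"}))

    async def close(self):
        if self._ws is not None:
            await self._ws.close()
            self._ws = None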
@@ -56,6 +56,17 @@ class VoiceReceiverSink(voice_recv.AudioSink):
         # User info (for logging)
         self.users: Dict[int, discord.User] = {}
+
+        # Silence tracking for detecting end of speech
+        self.last_audio_time: Dict[int, float] = {}
+        self.silence_tasks: Dict[int, asyncio.Task] = {}
+        self.silence_timeout = 1.0 # seconds of silence before sending "final"
+
+        # Interruption detection
+        self.interruption_start_time: Dict[int, float] = {}
+        self.interruption_audio_count: Dict[int, int] = {}
+        self.interruption_threshold_time = 0.8 # seconds of speech to count as interruption
+        self.interruption_threshold_chunks = 8 # minimum audio chunks to count as interruption
 
         # Active flag
         self.active = False
 
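The three silence fields added above implement a debounce: every incoming chunk stamps `last_audio_time` and restarts a per-user timer task, and if nothing arrives for `silence_timeout` seconds the timer asks STT for the final transcript (the wiring is in the last hunk of this diff). The interruption thresholds gate a separate path and are unrelated to this timer. A stripped-down, single-user sketch of the debounce pattern, with a `print` standing in for `send_final()`:

import asyncio


class SilenceDebouncer:
    """Illustration of the reset-on-audio / fire-after-silence pattern (single user)."""

    def __init__(self, timeout: float = 1.0):
        self.timeout = timeout
        self._task = None

    def on_audio_chunk(self):
        # Each chunk cancels the pending timer and starts a fresh one, so the
        # timer only fires once audio has stopped arriving for `timeout` seconds.
        if self._task is not None and not self._task.done():
            self._task.cancel()
        self._task = asyncio.create_task(self._wait_for_silence())

    async def _wait_for_silence(self):
        try:
            await asyncio.sleep(self.timeout)
            print("silence detected, would send 'final' to the STT server here")
        except asyncio.CancelledError:
            pass  # new audio arrived; a fresh timer has been started


async def demo():
    deb = SilenceDebouncer(timeout=1.0)
    for _ in range(5):            # simulate a short burst of audio chunks
        deb.on_audio_chunk()
        await asyncio.sleep(0.2)  # chunks arrive faster than the timeout
    await asyncio.sleep(1.5)      # then silence; the timer fires exactly once


asyncio.run(demo())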
@@ -232,6 +243,17 @@ class VoiceReceiverSink(voice_recv.AudioSink):
         if user_id in self.users:
             del self.users[user_id]
+
+        # Cancel silence detection task
+        if user_id in self.silence_tasks and not self.silence_tasks[user_id].done():
+            self.silence_tasks[user_id].cancel()
+            del self.silence_tasks[user_id]
+        if user_id in self.last_audio_time:
+            del self.last_audio_time[user_id]
+
+        # Clear interruption tracking
+        self.interruption_start_time.pop(user_id, None)
+        self.interruption_audio_count.pop(user_id, None)
 
         # Cleanup opus decoder for this user
         if hasattr(self, '_opus_decoders') and user_id in self._opus_decoders:
             del self._opus_decoders[user_id]
@@ -299,10 +321,95 @@ class VoiceReceiverSink(voice_recv.AudioSink):
                 else:
                     # Put remaining partial chunk back in buffer
                     buffer.append(chunk)
 
+            # Track audio time for silence detection
+            import time
+            current_time = time.time()
+            self.last_audio_time[user_id] = current_time
+
+            # ===== INTERRUPTION DETECTION =====
+            # Check if Miku is speaking and user is interrupting
+            # Note: self.voice_manager IS the VoiceSession, not the VoiceManager singleton
+            miku_speaking = self.voice_manager.miku_speaking
+            logger.debug(f"[INTERRUPTION CHECK] user={user_id}, miku_speaking={miku_speaking}")
+
+            if miku_speaking:
+                # Track interruption
+                if user_id not in self.interruption_start_time:
+                    # First chunk during Miku's speech
+                    self.interruption_start_time[user_id] = current_time
+                    self.interruption_audio_count[user_id] = 1
+                else:
+                    # Increment chunk count
+                    self.interruption_audio_count[user_id] += 1
+
+                # Calculate interruption duration
+                interruption_duration = current_time - self.interruption_start_time[user_id]
+                chunk_count = self.interruption_audio_count[user_id]
+
+                # Check if interruption threshold is met
+                if (interruption_duration >= self.interruption_threshold_time and
+                        chunk_count >= self.interruption_threshold_chunks):
+
+                    # Trigger interruption!
+                    logger.info(f"🛑 User {user_id} interrupted Miku (duration={interruption_duration:.2f}s, chunks={chunk_count})")
+                    logger.info(f" → Stopping Miku's TTS and LLM, will process user's speech when finished")
+
+                    # Reset interruption tracking
+                    self.interruption_start_time.pop(user_id, None)
+                    self.interruption_audio_count.pop(user_id, None)
+
+                    # Call interruption handler (this sets miku_speaking=False)
+                    asyncio.create_task(
+                        self.voice_manager.on_user_interruption(user_id)
+                    )
+            else:
+                # Miku not speaking, clear interruption tracking
+                self.interruption_start_time.pop(user_id, None)
+                self.interruption_audio_count.pop(user_id, None)
+
+            # Cancel existing silence task if any
+            if user_id in self.silence_tasks and not self.silence_tasks[user_id].done():
+                self.silence_tasks[user_id].cancel()
+
+            # Start new silence detection task
+            self.silence_tasks[user_id] = asyncio.create_task(
+                self._detect_silence(user_id)
+            )
+
         except Exception as e:
             logger.error(f"Failed to send audio chunk for user {user_id}: {e}")
 
+    async def _detect_silence(self, user_id: int):
+        """
+        Wait for silence timeout and send 'final' command to STT.
+
+        This is called after each audio chunk. If no more audio arrives within
+        the silence_timeout period, we send the 'final' command to get the
+        complete transcription.
+
+        Args:
+            user_id: Discord user ID
+        """
+        try:
+            # Wait for silence timeout
+            await asyncio.sleep(self.silence_timeout)
+
+            # Check if we still have an active STT client
+            stt_client = self.stt_clients.get(user_id)
+            if not stt_client or not stt_client.is_connected():
+                return
+
+            # Send final command to get complete transcription
+            logger.debug(f"Silence detected for user {user_id}, requesting final transcript")
+            await stt_client.send_final()
+
+        except asyncio.CancelledError:
+            # Task was cancelled because new audio arrived
+            pass
+        except Exception as e:
+            logger.error(f"Error in silence detection for user {user_id}: {e}")
+
     async def _on_vad_event(self, user_id: int, event: dict):
         """
         Handle VAD event from STT.
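The interruption path above only schedules `self.voice_manager.on_user_interruption(user_id)`; that handler lives on the session object and is not part of this commit. A rough sketch of what it plausibly does, based on the log line ("Stopping Miku's TTS and LLM") and stock discord.py calls; every attribute here except `miku_speaking` is an assumption.

import logging

logger = logging.getLogger(__name__)


class VoiceSessionSketch:
    """Hypothetical session-side handler; not the code from this commit."""

    def __init__(self, voice_client):
        self.voice_client = voice_client   # discord.VoiceClient for the call
        self.miku_speaking = False
        self.llm_task = None               # assumed handle on the in-flight LLM request

    async def on_user_interruption(self, user_id: int):
        logger.info(f"Handling interruption from user {user_id}")

        # Cut off TTS playback; VoiceClient.stop() is the standard discord.py
        # call for stopping the current audio source.
        if self.voice_client.is_playing():
            self.voice_client.stop()

        # Cancel whatever response is still being generated (assumed task handle).
        if self.llm_task is not None and not self.llm_task.done():
            self.llm_task.cancel()

        # Clearing this flag is what stops the sink from treating further
        # chunks as an interruption (see the miku_speaking check above).
        self.miku_speaking = False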