Implemented experimental production-ready voice chat, relegated the old flow to voice debug mode. New Web UI panel for Voice Chat.
@@ -8,6 +8,8 @@ Uses the discord-ext-voice-recv extension for proper audio receiving support.
 import asyncio
 import audioop
 import logging
+import struct
+import array
 from typing import Dict, Optional
 from collections import deque

@@ -27,13 +29,13 @@ class VoiceReceiverSink(voice_recv.AudioSink):
     decodes/resamples as needed, and sends to STT clients for transcription.
     """

-    def __init__(self, voice_manager, stt_url: str = "ws://miku-stt:8766/ws/stt"):
+    def __init__(self, voice_manager, stt_url: str = "ws://miku-stt:8766"):
         """
         Initialize Voice Receiver.

         Args:
             voice_manager: The voice manager instance
-            stt_url: Base URL for STT WebSocket server with path (port 8766 inside container)
+            stt_url: WebSocket URL for RealtimeSTT server (port 8766 inside container)
         """
         super().__init__()
         self.voice_manager = voice_manager
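The default stt_url now points at the server root rather than the /ws/stt path. For orientation, a minimal client in the same spirit might look like the following. This is a sketch only, assuming a websockets-compatible endpoint that accepts binary PCM frames and replies with JSON events; the real STTClient class and its wire protocol are not shown in this diff.

import asyncio
import json

import websockets  # assumed client library; STTClient's actual transport is not shown

async def stream_pcm(url: str, chunks) -> None:
    # One connection per speaking user; push raw 16-bit / 16 kHz mono PCM.
    async with websockets.connect(url) as ws:
        for chunk in chunks:
            await ws.send(chunk)      # binary frame per audio chunk
        reply = await ws.recv()       # e.g. a JSON transcript event (assumed)
        print(json.loads(reply))

# asyncio.run(stream_pcm("ws://miku-stt:8766", [b"\x00\x00" * 320]))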
@@ -72,6 +74,68 @@ class VoiceReceiverSink(voice_recv.AudioSink):

         logger.info("VoiceReceiverSink initialized")

+    @staticmethod
+    def _preprocess_audio(pcm_data: bytes) -> bytes:
+        """
+        Preprocess audio for better STT accuracy.
+
+        Applies:
+        1. DC offset removal
+        2. High-pass filter (80Hz) to remove rumble
+        3. RMS normalization
+
+        Args:
+            pcm_data: Raw PCM audio (16-bit mono, 16kHz)
+
+        Returns:
+            Preprocessed PCM audio
+        """
+        try:
+            # Convert bytes to array of int16 samples
+            samples = array.array('h', pcm_data)
+
+            # 1. Remove DC offset (mean)
+            mean = sum(samples) / len(samples) if samples else 0
+            samples = array.array('h', [int(s - mean) for s in samples])
+
+            # 2. Simple high-pass filter (80Hz @ 16kHz)
+            # Using a simple first-order HPF: y[n] = x[n] - x[n-1] + 0.95 * y[n-1]
+            alpha = 0.95  # Filter coefficient (roughly 80Hz cutoff at 16kHz)
+            filtered = array.array('h')
+            prev_input = 0
+            prev_output = 0
+
+            for sample in samples:
+                output = sample - prev_input + alpha * prev_output
+                filtered.append(int(max(-32768, min(32767, output))))  # Clamp to int16 range
+                prev_input = sample
+                prev_output = output
+
+            # 3. RMS normalization to target level
+            # Calculate RMS
+            sum_squares = sum(s * s for s in filtered)
+            rms = (sum_squares / len(filtered)) ** 0.5 if filtered else 1.0
+
+            # Target RMS (roughly -20dB)
+            target_rms = 3276.8  # 10% of max int16 range
+
+            # Normalize if RMS is too low or too high
+            if rms > 100:  # Only normalize if there's actual signal
+                gain = target_rms / rms
+                # Limit gain to prevent over-amplification of noise
+                gain = min(gain, 4.0)  # Max 12dB boost
+                normalized = array.array('h', [
+                    int(max(-32768, min(32767, s * gain))) for s in filtered
+                ])
+                return normalized.tobytes()
+            else:
+                # Signal too weak, return filtered without normalization
+                return filtered.tobytes()
+
+        except Exception as e:
+            logger.debug(f"Audio preprocessing failed, using raw audio: {e}")
+            return pcm_data
+
     def wants_opus(self) -> bool:
         """
         Tell discord-ext-voice-recv we want Opus data, NOT decoded PCM.
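Because _preprocess_audio is a pure bytes-in/bytes-out static method, it can be exercised outside Discord entirely. A quick sanity check (a sketch; the voice_receiver import path is an assumption and may differ in the repo layout) feeds in a quiet 440 Hz tone with a deliberate DC offset:

import array
import audioop
import math

from voice_receiver import VoiceReceiverSink  # hypothetical import path

SR = 16000
# One 20 ms frame: 440 Hz tone at low amplitude, riding on a +500 DC offset.
tone = array.array('h', (
    int(500 + 300 * math.sin(2 * math.pi * 440 * i / SR)) for i in range(SR // 50)
)).tobytes()

processed = VoiceReceiverSink._preprocess_audio(tone)
print(audioop.rms(tone, 2), "->", audioop.rms(processed, 2))
# Expected: the DC offset is stripped and gain is pushed toward the ~3277
# target but capped at 4x, so the output RMS lands near 4x the tone's AC level.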
@@ -144,6 +208,10 @@ class VoiceReceiverSink(voice_recv.AudioSink):
             # Discord sends 20ms chunks: 960 samples @ 48kHz → 320 samples @ 16kHz
             pcm_16k, _ = audioop.ratecv(pcm_mono, 2, 1, 48000, 16000, None)

+            # Preprocess audio for better STT accuracy
+            # (DC offset removal, high-pass filter, RMS normalization)
+            pcm_16k = self._preprocess_audio(pcm_16k)
+
             # Send to STT client (schedule on event loop thread-safely)
             asyncio.run_coroutine_threadsafe(
                 self._send_audio_chunk(user_id, pcm_16k),
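The 48 kHz to 16 kHz step uses the stdlib audioop module (deprecated in Python 3.11, removed in 3.13). The 20 ms frame math can be checked standalone:

import audioop

pcm_48k_mono = b"\x00\x01" * 960  # one 20 ms frame: 960 samples, int16 mono
pcm_16k, state = audioop.ratecv(pcm_48k_mono, 2, 1, 48000, 16000, None)
assert len(pcm_16k) == 320 * 2    # 320 samples * 2 bytes per sample

# Note: passing None for state (as the code above does on every call) resets
# the resampler each chunk; feeding the returned state back in would avoid
# discontinuities at the 20 ms boundaries.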
@@ -184,21 +252,16 @@ class VoiceReceiverSink(voice_recv.AudioSink):
         self.audio_buffers[user_id] = deque(maxlen=1000)

         # Create STT client with callbacks
+        # RealtimeSTT handles VAD internally, so we only need partial/final callbacks
         stt_client = STTClient(
             user_id=user_id,
             stt_url=self.stt_url,
-            on_vad_event=lambda event: asyncio.create_task(
-                self._on_vad_event(user_id, event)
-            ),
             on_partial_transcript=lambda text, timestamp: asyncio.create_task(
                 self._on_partial_transcript(user_id, text)
             ),
             on_final_transcript=lambda text, timestamp: asyncio.create_task(
                 self._on_final_transcript(user_id, text, user)
-            ),
-            on_interruption=lambda prob: asyncio.create_task(
-                self._on_interruption(user_id, prob)
             )
         )

         # Connect to STT server
@@ -279,16 +342,16 @@ class VoiceReceiverSink(voice_recv.AudioSink):
         """
         Send audio chunk to STT client.

-        Buffers audio until we have 512 samples (32ms @ 16kHz) which is what
-        Silero VAD expects. Discord sends 320 samples (20ms), so we buffer
-        2 chunks and send 640 samples, then the STT server can split it.
+        RealtimeSTT expects 16kHz mono 16-bit PCM audio.
+        We buffer audio to send larger chunks for efficiency.
+        VAD and silence detection is handled by RealtimeSTT.

         Args:
             user_id: Discord user ID
-            audio_data: PCM audio (int16, 16kHz mono, 320 samples = 640 bytes)
+            audio_data: PCM audio (int16, 16kHz mono)
         """
         stt_client = self.stt_clients.get(user_id)
-        if not stt_client or not stt_client.is_connected():
+        if not stt_client or not stt_client.connected:
             return

         try:
@@ -299,11 +362,9 @@ class VoiceReceiverSink(voice_recv.AudioSink):
             buffer = self.audio_buffers[user_id]
             buffer.append(audio_data)

-            # Silero VAD expects 512 samples @ 16kHz (1024 bytes)
-            # Discord gives us 320 samples (640 bytes) every 20ms
-            # Buffer 2 chunks = 640 samples = 1280 bytes, send as one chunk
-            SAMPLES_NEEDED = 512  # What VAD wants
-            BYTES_NEEDED = SAMPLES_NEEDED * 2  # int16 = 2 bytes per sample
+            # Buffer and send in larger chunks for efficiency
+            # RealtimeSTT will handle VAD internally
+            BYTES_NEEDED = 1024  # 512 samples * 2 bytes

             # Check if we have enough buffered audio
             total_bytes = sum(len(chunk) for chunk in buffer)
@@ -313,16 +374,10 @@ class VoiceReceiverSink(voice_recv.AudioSink):
                 combined = b''.join(buffer)
                 buffer.clear()

-                # Send in 512-sample (1024-byte) chunks
-                for i in range(0, len(combined), BYTES_NEEDED):
-                    chunk = combined[i:i+BYTES_NEEDED]
-                    if len(chunk) == BYTES_NEEDED:
-                        await stt_client.send_audio(chunk)
-                    else:
-                        # Put remaining partial chunk back in buffer
-                        buffer.append(chunk)
+                # Send all audio to STT (RealtimeSTT handles VAD internally)
+                await stt_client.send_audio(combined)

-                # Track audio time for silence detection
+                # Track audio time for interruption detection
                 import time
                 current_time = time.time()
                 self.last_audio_time[user_id] = current_time
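With the 512-sample re-slicing gone, the send path reduces to accumulate-and-flush. As a standalone sketch of the pattern (not the actual class):

from collections import deque

BYTES_NEEDED = 1024  # 512 samples * 2 bytes
buffer: deque = deque(maxlen=1000)

def on_chunk(chunk: bytes, send) -> None:
    # Accumulate 640-byte Discord chunks; flush once >= 1024 bytes are queued.
    buffer.append(chunk)
    if sum(len(c) for c in buffer) >= BYTES_NEEDED:
        send(b''.join(buffer))  # RealtimeSTT does VAD/framing server-side
        buffer.clear()

sent = []
for _ in range(3):  # three 20 ms chunks -> one flush after the second
    on_chunk(b'\x00' * 640, sent.append)
assert [len(p) for p in sent] == [1280]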
@@ -331,103 +386,57 @@ class VoiceReceiverSink(voice_recv.AudioSink):
             # Check if Miku is speaking and user is interrupting
             # Note: self.voice_manager IS the VoiceSession, not the VoiceManager singleton
             miku_speaking = self.voice_manager.miku_speaking
-            logger.debug(f"[INTERRUPTION CHECK] user={user_id}, miku_speaking={miku_speaking}")

             if miku_speaking:
-                # Track interruption
-                if user_id not in self.interruption_start_time:
-                    # First chunk during Miku's speech
-                    self.interruption_start_time[user_id] = current_time
-                    self.interruption_audio_count[user_id] = 1
+                # Calculate RMS to detect if user is actually speaking
+                # (not just silence/background noise)
+                rms = audioop.rms(combined, 2)
+                RMS_THRESHOLD = 500  # Adjust threshold - higher = less sensitive
+
+                if rms > RMS_THRESHOLD:
+                    # User is actually speaking - track as potential interruption
+                    if user_id not in self.interruption_start_time:
+                        # First chunk during Miku's speech with actual audio
+                        self.interruption_start_time[user_id] = current_time
+                        self.interruption_audio_count[user_id] = 1
+                        logger.debug(f"Potential interruption start (rms={rms})")
+                    else:
+                        # Increment chunk count
+                        self.interruption_audio_count[user_id] += 1
+
+                        # Calculate interruption duration
+                        interruption_duration = current_time - self.interruption_start_time[user_id]
+                        chunk_count = self.interruption_audio_count[user_id]
+
+                        # Check if interruption threshold is met
+                        if (interruption_duration >= self.interruption_threshold_time and
+                            chunk_count >= self.interruption_threshold_chunks):
+
+                            # Trigger interruption!
+                            logger.info(f"🛑 User {user_id} interrupted Miku (duration={interruption_duration:.2f}s, chunks={chunk_count}, rms={rms})")
+                            logger.info(f"  → Stopping Miku's TTS and LLM, will process user's speech when finished")
+
+                            # Reset interruption tracking
+                            self.interruption_start_time.pop(user_id, None)
+                            self.interruption_audio_count.pop(user_id, None)
+
+                            # Call interruption handler (this sets miku_speaking=False)
+                            asyncio.create_task(
+                                self.voice_manager.on_user_interruption(user_id)
+                            )
                 else:
-                    # Increment chunk count
-                    self.interruption_audio_count[user_id] += 1
-
-                    # Calculate interruption duration
-                    interruption_duration = current_time - self.interruption_start_time[user_id]
-                    chunk_count = self.interruption_audio_count[user_id]
-
-                    # Check if interruption threshold is met
-                    if (interruption_duration >= self.interruption_threshold_time and
-                        chunk_count >= self.interruption_threshold_chunks):
-
-                        # Trigger interruption!
-                        logger.info(f"🛑 User {user_id} interrupted Miku (duration={interruption_duration:.2f}s, chunks={chunk_count})")
-                        logger.info(f"  → Stopping Miku's TTS and LLM, will process user's speech when finished")
-
-                        # Reset interruption tracking
+                    # Audio below RMS threshold (silence) - reset interruption tracking
+                    # This ensures brief pauses in speech reset the counter
                     self.interruption_start_time.pop(user_id, None)
                     self.interruption_audio_count.pop(user_id, None)
-
-                    # Call interruption handler (this sets miku_speaking=False)
-                    asyncio.create_task(
-                        self.voice_manager.on_user_interruption(user_id)
-                    )
             else:
                 # Miku not speaking, clear interruption tracking
                 self.interruption_start_time.pop(user_id, None)
                 self.interruption_audio_count.pop(user_id, None)

-            # Cancel existing silence task if any
-            if user_id in self.silence_tasks and not self.silence_tasks[user_id].done():
-                self.silence_tasks[user_id].cancel()
-
-            # Start new silence detection task
-            self.silence_tasks[user_id] = asyncio.create_task(
-                self._detect_silence(user_id)
-            )
-
         except Exception as e:
             logger.error(f"Failed to send audio chunk for user {user_id}: {e}")

-    async def _detect_silence(self, user_id: int):
-        """
-        Wait for silence timeout and send 'final' command to STT.
-
-        This is called after each audio chunk. If no more audio arrives within
-        the silence_timeout period, we send the 'final' command to get the
-        complete transcription.
-
-        Args:
-            user_id: Discord user ID
-        """
-        try:
-            # Wait for silence timeout
-            await asyncio.sleep(self.silence_timeout)
-
-            # Check if we still have an active STT client
-            stt_client = self.stt_clients.get(user_id)
-            if not stt_client or not stt_client.is_connected():
-                return
-
-            # Send final command to get complete transcription
-            logger.debug(f"Silence detected for user {user_id}, requesting final transcript")
-            await stt_client.send_final()
-
-        except asyncio.CancelledError:
-            # Task was cancelled because new audio arrived
-            pass
-        except Exception as e:
-            logger.error(f"Error in silence detection for user {user_id}: {e}")
-
-    async def _on_vad_event(self, user_id: int, event: dict):
-        """
-        Handle VAD event from STT.
-
-        Args:
-            user_id: Discord user ID
-            event: VAD event dictionary with 'event' and 'probability' keys
-        """
-        user = self.users.get(user_id)
-        event_type = event.get('event', 'unknown')
-        probability = event.get('probability', 0.0)
-
-        logger.debug(f"VAD [{user.name if user else user_id}]: {event_type} (prob={probability:.3f})")
-
-        # Notify voice manager - pass the full event dict
-        if hasattr(self.voice_manager, 'on_user_vad_event'):
-            await self.voice_manager.on_user_vad_event(user_id, event)
-
     async def _on_partial_transcript(self, user_id: int, text: str):
         """
         Handle partial transcript from STT.
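The RMS gate above is the heart of the new interruption detector: only chunks whose audioop.rms clears the threshold advance the interruption timer, and any quiet chunk resets it. Isolated with the same constant (a sketch only, not the class's actual state handling):

import array
import audioop
import math
import time

RMS_THRESHOLD = 500  # same constant as the diff
start = None         # potential interruption start time

def gate(chunk: bytes) -> None:
    # Loud chunks start/continue a potential interruption; quiet ones reset it.
    global start
    if audioop.rms(chunk, 2) > RMS_THRESHOLD:
        if start is None:
            start = time.time()
    else:
        start = None

loud = array.array('h', (int(2000 * math.sin(i / 5.0)) for i in range(640))).tobytes()
quiet = bytes(1280)  # digital silence -> rms 0
gate(loud)
assert start is not None
gate(quiet)
assert start is None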
@@ -438,7 +447,6 @@ class VoiceReceiverSink(voice_recv.AudioSink):
         """
         user = self.users.get(user_id)
         logger.info(f"[VOICE_RECEIVER] Partial [{user.name if user else user_id}]: {text}")
-        print(f"[DEBUG] PARTIAL TRANSCRIPT RECEIVED: {text}")  # Extra debug

         # Notify voice manager
         if hasattr(self.voice_manager, 'on_partial_transcript'):
@@ -456,29 +464,11 @@ class VoiceReceiverSink(voice_recv.AudioSink):
             user: Discord user object
         """
         logger.info(f"[VOICE_RECEIVER] Final [{user.name if user else user_id}]: {text}")
-        print(f"[DEBUG] FINAL TRANSCRIPT RECEIVED: {text}")  # Extra debug

         # Notify voice manager - THIS TRIGGERS LLM RESPONSE
         if hasattr(self.voice_manager, 'on_final_transcript'):
             await self.voice_manager.on_final_transcript(user_id, text)

-    async def _on_interruption(self, user_id: int, probability: float):
-        """
-        Handle interruption detection from STT.
-
-        This cancels Miku's current speech if user interrupts.
-
-        Args:
-            user_id: Discord user ID
-            probability: Interruption confidence probability
-        """
-        user = self.users.get(user_id)
-        logger.info(f"Interruption from [{user.name if user else user_id}] (prob={probability:.3f})")
-
-        # Notify voice manager - THIS CANCELS MIKU'S SPEECH
-        if hasattr(self.voice_manager, 'on_user_interruption'):
-            await self.voice_manager.on_user_interruption(user_id, probability)
-
     def get_listening_users(self) -> list:
         """
         Get list of users currently being listened to.
@@ -489,30 +479,10 @@ class VoiceReceiverSink(voice_recv.AudioSink):
         return [
             {
                 'user_id': user_id,
-                'username': user.name if user else 'Unknown',
-                'connected': client.is_connected()
+                'username': self.users.get(user_id, {}).name if self.users.get(user_id) else 'Unknown',
+                'connected': self.stt_clients.get(user_id, {}).connected if self.stt_clients.get(user_id) else False
             }
-            for user_id, (user, client) in
-            [(uid, (self.users.get(uid), self.stt_clients.get(uid)))
-             for uid in self.stt_clients.keys()]
+            for user_id in self.stt_clients.keys()
         ]

-    @voice_recv.AudioSink.listener()
-    def on_voice_member_speaking_start(self, member: discord.Member):
-        """
-        Called when a member starts speaking (green circle appears).
-
-        This is a virtual event from discord-ext-voice-recv based on packet activity.
-        """
-        if member.id in self.stt_clients:
-            logger.debug(f"🎤 {member.name} started speaking")
-
-    @voice_recv.AudioSink.listener()
-    def on_voice_member_speaking_stop(self, member: discord.Member):
-        """
-        Called when a member stops speaking (green circle disappears).
-
-        This is a virtual event from discord-ext-voice-recv based on packet activity.
-        """
-        if member.id in self.stt_clients:
-            logger.debug(f"🔇 {member.name} stopped speaking")
+    # Discord VAD events removed - we rely entirely on RealtimeSTT's VAD for speech detection