From 3e59e5d2f643dbe8af2bed07413a39f413c9531c Mon Sep 17 00:00:00 2001 From: koko210Serve Date: Sat, 17 Jan 2026 00:01:17 +0200 Subject: [PATCH] =?UTF-8?q?Phase=203=20implemented=20=E2=80=94=20Text=20LL?= =?UTF-8?q?M=20can=20now=20stream=20to=20the=20TTS=20pipeline=20with=20the?= =?UTF-8?q?=20!miku=20say=20command?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bot/bot.py | 4 +- bot/commands/voice.py | 108 +++++++++++++++++++++++++++++++++++++++ bot/utils/voice_audio.py | 13 +++++ 3 files changed, 123 insertions(+), 2 deletions(-) diff --git a/bot/bot.py b/bot/bot.py index fb580ae..5809b1f 100644 --- a/bot/bot.py +++ b/bot/bot.py @@ -125,7 +125,7 @@ async def on_message(message): if message.author == globals.client.user: return - # Check for voice commands first (!miku join, !miku leave, !miku voice-status, !miku test) + # Check for voice commands first (!miku join, !miku leave, !miku voice-status, !miku test, !miku say) if not isinstance(message.channel, discord.DMChannel) and message.content.strip().lower().startswith('!miku '): from commands.voice import handle_voice_command @@ -134,7 +134,7 @@ async def on_message(message): cmd = parts[1].lower() args = parts[2:] if len(parts) > 2 else [] - if cmd in ['join', 'leave', 'voice-status', 'test']: + if cmd in ['join', 'leave', 'voice-status', 'test', 'say']: await handle_voice_command(message, cmd, args) return diff --git a/bot/commands/voice.py b/bot/commands/voice.py index 9298f13..f027a40 100644 --- a/bot/commands/voice.py +++ b/bot/commands/voice.py @@ -5,8 +5,11 @@ Handles joining, leaving, and status commands for voice chat sessions. 
""" import discord +import aiohttp +import json from utils.voice_manager import voice_manager from utils.logger import get_logger +from utils.llm import get_current_gpu_url logger = get_logger('voice_commands') @@ -33,6 +36,9 @@ async def handle_voice_command(message, cmd, args): elif cmd == 'test': await _handle_test(message, args) + elif cmd == 'say': + await _handle_say(message, args) + else: await message.channel.send(f"❌ Unknown voice command: `{cmd}`") @@ -263,3 +269,105 @@ async def _handle_test(message, args): except Exception as e: logger.error(f"Failed to test voice playback: {e}", exc_info=True) await message.channel.send(f"❌ Error testing voice: {e}") + + +async def _handle_say(message, args): + """ + Handle !miku say command. + Send user message to LLM and speak the response in voice chat. + + Phase 3: Text → LLM → Voice (STT deferred to later phase) + """ + # Validate args + if not args: + await message.channel.send("❌ Usage: `!miku say `") + return + + # Check active voice session + session = voice_manager.active_session + if not session: + await message.channel.send("❌ No active voice session! Use `!miku join` first.") + return + + if not session.audio_source: + await message.channel.send("❌ Audio source not connected!") + return + + # Extract user message + user_message = " ".join(args) + + try: + # Show processing indicator + await message.channel.send(f"💭 Processing: *\"{user_message}\"*") + logger.info(f"Voice say: user={message.author.name}, message={user_message}") + + # Prepare LLM payload (based on query_llama logic) + from utils.llm import get_current_gpu_url + import globals + + # Simple system prompt for voice responses + system_prompt = """You are Hatsune Miku, the virtual singer. +Respond naturally and concisely as Miku would in a voice conversation. 
+Keep responses short (1-3 sentences) since they will be spoken aloud.""" + + payload = { + "model": globals.TEXT_MODEL, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_message} + ], + "stream": True, + "temperature": 0.8, + "max_tokens": 200 # Shorter for voice + } + + headers = {'Content-Type': 'application/json'} + llama_url = get_current_gpu_url() + + logger.info(f"Streaming LLM from {llama_url}") + + # Stream LLM response and send tokens to TTS + async with aiohttp.ClientSession() as http_session: + async with http_session.post( + f"{llama_url}/v1/chat/completions", + json=payload, + headers=headers, + timeout=aiohttp.ClientTimeout(total=60) + ) as response: + if response.status != 200: + error_text = await response.text() + raise Exception(f"LLM error {response.status}: {error_text}") + + # Process streaming response + full_response = "" + async for line in response.content: + line = line.decode('utf-8').strip() + if line.startswith('data: '): + data_str = line[6:] # Remove 'data: ' prefix + if data_str == '[DONE]': + break + + try: + data = json.loads(data_str) + if 'choices' in data and len(data['choices']) > 0: + delta = data['choices'][0].get('delta', {}) + content = delta.get('content', '') + if content: + # Send token to TTS + await session.audio_source.send_token(content) + full_response += content + except json.JSONDecodeError: + continue + + # Send flush command to trigger synthesis of remaining tokens + await session.audio_source.flush() + + # Show what Miku said + await message.channel.send(f"🎤 Miku: *\"{full_response.strip()}\"*") + logger.info(f"✓ Voice say complete: {full_response.strip()}") + await message.add_reaction("✅") + + except Exception as e: + logger.error(f"Voice say failed: {e}", exc_info=True) + await message.channel.send(f"❌ Voice say failed: {str(e)}") + diff --git a/bot/utils/voice_audio.py b/bot/utils/voice_audio.py index 7dc0c0e..2ea9b61 100644 --- a/bot/utils/voice_audio.py 
async def flush(self):
    """Ask the TTS service to synthesize whatever text it has buffered.

    Sends a ``{"flush": true}`` control frame over the WebSocket so any
    tokens still sitting in the TTS buffer are voiced. A no-op when no
    WebSocket is connected; send failures are logged, never raised.
    """
    if not self.websocket:
        return
    try:
        await self.websocket.send_json({"flush": True})
        logger.debug("Sent flush command to TTS")
    except Exception as e:
        logger.error(f"Failed to send flush command: {e}")