From 3e59e5d2f643dbe8af2bed07413a39f413c9531c Mon Sep 17 00:00:00 2001 From: koko210Serve Date: Sat, 17 Jan 2026 00:01:17 +0200 Subject: [PATCH] =?UTF-8?q?Phase=203=20implemented=20=E2=80=94=20Text=20LL?= =?UTF-8?q?M=20can=20now=20stream=20to=20the=20TTS=20pipeline=20with=20the?= =?UTF-8?q?=20!miku=20say=20command?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bot/bot.py | 4 +- bot/commands/voice.py | 108 +++++++++++++++++++++++++++++++++++++++ bot/utils/voice_audio.py | 13 +++++ 3 files changed, 123 insertions(+), 2 deletions(-) diff --git a/bot/bot.py b/bot/bot.py index fb580ae..5809b1f 100644 --- a/bot/bot.py +++ b/bot/bot.py @@ -125,7 +125,7 @@ async def on_message(message): if message.author == globals.client.user: return - # Check for voice commands first (!miku join, !miku leave, !miku voice-status, !miku test) + # Check for voice commands first (!miku join, !miku leave, !miku voice-status, !miku test, !miku say) if not isinstance(message.channel, discord.DMChannel) and message.content.strip().lower().startswith('!miku '): from commands.voice import handle_voice_command @@ -134,7 +134,7 @@ async def on_message(message): cmd = parts[1].lower() args = parts[2:] if len(parts) > 2 else [] - if cmd in ['join', 'leave', 'voice-status', 'test']: + if cmd in ['join', 'leave', 'voice-status', 'test', 'say']: await handle_voice_command(message, cmd, args) return diff --git a/bot/commands/voice.py b/bot/commands/voice.py index 9298f13..f027a40 100644 --- a/bot/commands/voice.py +++ b/bot/commands/voice.py @@ -5,8 +5,11 @@ Handles joining, leaving, and status commands for voice chat sessions. 
""" import discord +import aiohttp +import json from utils.voice_manager import voice_manager from utils.logger import get_logger +from utils.llm import get_current_gpu_url logger = get_logger('voice_commands') @@ -33,6 +36,9 @@ async def handle_voice_command(message, cmd, args): elif cmd == 'test': await _handle_test(message, args) + elif cmd == 'say': + await _handle_say(message, args) + else: await message.channel.send(f"❌ Unknown voice command: `{cmd}`") @@ -263,3 +269,105 @@ async def _handle_test(message, args): except Exception as e: logger.error(f"Failed to test voice playback: {e}", exc_info=True) await message.channel.send(f"❌ Error testing voice: {e}") + + +async def _handle_say(message, args): + """ + Handle !miku say command. + Send user message to LLM and speak the response in voice chat. + + Phase 3: Text → LLM → Voice (STT deferred to later phase) + """ + # Validate args + if not args: + await message.channel.send("❌ Usage: `!miku say `") + return + + # Check active voice session + session = voice_manager.active_session + if not session: + await message.channel.send("❌ No active voice session! Use `!miku join` first.") + return + + if not session.audio_source: + await message.channel.send("❌ Audio source not connected!") + return + + # Extract user message + user_message = " ".join(args) + + try: + # Show processing indicator + await message.channel.send(f"💭 Processing: *\"{user_message}\"*") + logger.info(f"Voice say: user={message.author.name}, message={user_message}") + + # Prepare LLM payload (based on query_llama logic) + from utils.llm import get_current_gpu_url + import globals + + # Simple system prompt for voice responses + system_prompt = """You are Hatsune Miku, the virtual singer. +Respond naturally and concisely as Miku would in a voice conversation. 
+Keep responses short (1-3 sentences) since they will be spoken aloud.""" + + payload = { + "model": globals.TEXT_MODEL, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_message} + ], + "stream": True, + "temperature": 0.8, + "max_tokens": 200 # Shorter for voice + } + + headers = {'Content-Type': 'application/json'} + llama_url = get_current_gpu_url() + + logger.info(f"Streaming LLM from {llama_url}") + + # Stream LLM response and send tokens to TTS + async with aiohttp.ClientSession() as http_session: + async with http_session.post( + f"{llama_url}/v1/chat/completions", + json=payload, + headers=headers, + timeout=aiohttp.ClientTimeout(total=60) + ) as response: + if response.status != 200: + error_text = await response.text() + raise Exception(f"LLM error {response.status}: {error_text}") + + # Process streaming response + full_response = "" + async for line in response.content: + line = line.decode('utf-8').strip() + if line.startswith('data: '): + data_str = line[6:] # Remove 'data: ' prefix + if data_str == '[DONE]': + break + + try: + data = json.loads(data_str) + if 'choices' in data and len(data['choices']) > 0: + delta = data['choices'][0].get('delta', {}) + content = delta.get('content', '') + if content: + # Send token to TTS + await session.audio_source.send_token(content) + full_response += content + except json.JSONDecodeError: + continue + + # Send flush command to trigger synthesis of remaining tokens + await session.audio_source.flush() + + # Show what Miku said + await message.channel.send(f"🎤 Miku: *\"{full_response.strip()}\"*") + logger.info(f"✓ Voice say complete: {full_response.strip()}") + await message.add_reaction("✅") + + except Exception as e: + logger.error(f"Voice say failed: {e}", exc_info=True) + await message.channel.send(f"❌ Voice say failed: {str(e)}") + diff --git a/bot/utils/voice_audio.py b/bot/utils/voice_audio.py index 7dc0c0e..2ea9b61 100644 --- a/bot/utils/voice_audio.py 
async def flush(self):
    """Ask the TTS service to synthesize whatever text it has buffered.

    Sends a ``{"flush": true}`` control frame over the WebSocket so any
    tokens still sitting in the TTS buffer are voiced. A no-op when no
    WebSocket is connected; send failures are logged, never raised.
    """
    if not self.websocket:
        return
    try:
        await self.websocket.send_json({"flush": True})
        logger.debug("Sent flush command to TTS")
    except Exception as e:
        logger.error(f"Failed to send flush command: {e}")