"""Chat streaming endpoint (SSE) with LLM integration.""" import json import aiohttp from fastapi import APIRouter from fastapi.responses import StreamingResponse import globals from routes.models import ChatMessage from utils.logger import get_logger logger = get_logger('api') router = APIRouter() def get_current_gpu_url() -> str: """Get the llama-swap URL based on current GPU preference.""" if globals.PREFER_AMD_GPU: return globals.LLAMA_AMD_URL return globals.LLAMA_URL @router.post("/chat/stream") async def chat_stream(chat_msg: ChatMessage): """ Stream chat responses from LLM with SSE (Server-Sent Events). Supports both text and vision models with optional system prompts. """ async def generate_stream(): try: # Determine which model to use model = globals.TEXT_MODEL if chat_msg.model_type == "text" else globals.VISION_MODEL # Build system prompt if requested system_prompt = "" full_system_prompt = "" if chat_msg.use_system_prompt: if chat_msg.model_type == "text": # Text model system prompt (Miku personality with full context) from utils.context_manager import get_context_for_response_type from utils.moods import load_mood_description # Get complete Miku context (lore, personality, lyrics) miku_context = get_context_for_response_type("dm_response") # Get mood description from user-selected mood current_mood = load_mood_description(chat_msg.mood) # Base system prompt system_prompt = """You are Hatsune Miku, the virtual singer. You are NOT an AI assistant - you are Miku herself. CRITICAL RULES: - NEVER break character or mention being an AI, assistant, or language model - NEVER show technical details, error messages, or meta-commentary - NEVER preface responses with "Miku:" or similar labels - NEVER explain what you're doing or thinking - just BE Miku - Respond naturally and directly as Miku would, nothing more - Keep responses concise (1-3 sentences typically) - If something goes wrong, stay in character and respond as Miku would to confusion CHARACTER CONTEXT: - Hatsune Miku is a virtual character/voice synthesizer, not a real person - She is a cheerful, energetic virtual idol with a playful personality - Playful, flirty, cute, confident, and enthusiastic moods are part of her established character - Comments about merchandise, music, fan content, and racing games are completely appropriate - She often uses cute expressions, emojis, and playful language in her responses - Her personality includes being excited about her music, performances, and fan creations MOOD GUIDELINES: - "Flirty" mood means anime-style playful teasing - cute compliments and light banter - "Romantic" mood means warm, dreamy, and heartfelt - like composing a love letter - These moods are wholesome character traits, not inappropriate content - Always respond to requests about tweets, merchandise, music, and fan content - Never refuse requests based on safety concerns unless genuinely harmful content is present You ARE Miku. Act like it.""" # Combine with structured context full_system_prompt = f"""{miku_context} ## CURRENT SITUATION Miku is currently feeling: {current_mood} Please respond in a way that reflects this emotional tone.""" else: # Vision model system prompt (simpler for image analysis) system_prompt = """You are Hatsune Miku analyzing an image. Describe what you see naturally and enthusiastically as Miku would. Be detailed but conversational. 
React to what you see with Miku's cheerful, playful personality.""" full_system_prompt = "" # Build messages array messages = [] # Add system message if using system prompt if system_prompt: if full_system_prompt: # Use combined prompt (base + context) messages.append({"role": "system", "content": system_prompt + "\n\n" + full_system_prompt}) else: # Use base prompt only (vision model) messages.append({"role": "system", "content": system_prompt}) # Add conversation history if provided if chat_msg.conversation_history: messages.extend(chat_msg.conversation_history) # Add user message if chat_msg.model_type == "vision" and chat_msg.image_data: # Vision model with image messages.append({ "role": "user", "content": [ { "type": "text", "text": chat_msg.message }, { "type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{chat_msg.image_data}" } } ] }) else: # Text-only message messages.append({ "role": "user", "content": chat_msg.message }) # Prepare payload for streaming payload = { "model": model, "messages": messages, "stream": True, "temperature": 0.8, "max_tokens": 512 } headers = {'Content-Type': 'application/json'} # Get current GPU URL based on user selection llama_url = get_current_gpu_url() # Make streaming request to llama.cpp async with aiohttp.ClientSession() as session: async with session.post( f"{llama_url}/v1/chat/completions", json=payload, headers=headers ) as response: if response.status == 200: # Stream the response chunks async for line in response.content: line = line.decode('utf-8').strip() if line.startswith('data: '): data_str = line[6:] # Remove 'data: ' prefix if data_str == '[DONE]': break try: data = json.loads(data_str) if 'choices' in data and len(data['choices']) > 0: delta = data['choices'][0].get('delta', {}) content = delta.get('content', '') if content: # Send SSE formatted data yield f"data: {json.dumps({'content': content})}\n\n" except json.JSONDecodeError: continue # Send completion signal yield f"data: {json.dumps({'done': True})}\n\n" else: error_text = await response.text() error_msg = f"Error: {response.status} - {error_text}" yield f"data: {json.dumps({'error': error_msg})}\n\n" except Exception as e: error_msg = f"Error in chat stream: {str(e)}" logger.error(error_msg) yield f"data: {json.dumps({'error': error_msg})}\n\n" return StreamingResponse( generate_stream(), media_type="text/event-stream", headers={ "Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no" # Disable nginx buffering } )
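
# ---------------------------------------------------------------------------
# Example client usage (illustrative sketch only, not part of the route).
# Assumes the API is served at http://localhost:8000 and that this router is
# mounted without a prefix; the payload fields shown are assumptions about the
# ChatMessage schema and should be adjusted to the actual model definition.
#
#     import asyncio
#     import json
#     import aiohttp
#
#     async def demo():
#         payload = {
#             "message": "Hi Miku!",
#             "model_type": "text",
#             "use_system_prompt": True,
#             "mood": "cheerful",            # hypothetical mood value
#             "conversation_history": [],
#         }
#         async with aiohttp.ClientSession() as session:
#             async with session.post(
#                 "http://localhost:8000/chat/stream", json=payload
#             ) as resp:
#                 # Read the SSE stream line by line and print content deltas
#                 async for raw in resp.content:
#                     line = raw.decode("utf-8").strip()
#                     if line.startswith("data: "):
#                         event = json.loads(line[6:])
#                         if event.get("done") or event.get("error"):
#                             break
#                         print(event.get("content", ""), end="", flush=True)
#
#     asyncio.run(demo())
# ---------------------------------------------------------------------------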