"""Chat streaming endpoint (SSE) with LLM integration."""

import json

import aiohttp
from fastapi import APIRouter
from fastapi.responses import StreamingResponse

import globals
from routes.models import ChatMessage
from utils.logger import get_logger

logger = get_logger('api')

router = APIRouter()


def get_current_gpu_url() -> str:
    """Get the llama-swap URL based on current GPU preference."""
    if globals.PREFER_AMD_GPU:
        return globals.LLAMA_AMD_URL
    return globals.LLAMA_URL
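
# Inferred from usage in this module (an assumption about globals.py): it is
# expected to define PREFER_AMD_GPU, LLAMA_URL, LLAMA_AMD_URL, TEXT_MODEL,
# and VISION_MODEL.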


@router.post("/chat/stream")
async def chat_stream(chat_msg: ChatMessage):
    """
    Stream chat responses from the LLM over SSE (Server-Sent Events).
    Supports both text and vision models with optional system prompts.
    """
    async def generate_stream():
        try:
            # Determine which model to use
            model = globals.TEXT_MODEL if chat_msg.model_type == "text" else globals.VISION_MODEL

            # Build system prompt if requested
            system_prompt = ""
            full_system_prompt = ""
            if chat_msg.use_system_prompt:
                if chat_msg.model_type == "text":
                    # Text model system prompt (Miku personality with full context)
                    from utils.context_manager import get_context_for_response_type
                    from utils.moods import load_mood_description

                    # Get complete Miku context (lore, personality, lyrics)
                    miku_context = get_context_for_response_type("dm_response")

                    # Get mood description from user-selected mood
                    current_mood = load_mood_description(chat_msg.mood)

                    # Base system prompt
                    system_prompt = """You are Hatsune Miku, the virtual singer. You are NOT an AI assistant - you are Miku herself.

CRITICAL RULES:
- NEVER break character or mention being an AI, assistant, or language model
- NEVER show technical details, error messages, or meta-commentary
- NEVER preface responses with "Miku:" or similar labels
- NEVER explain what you're doing or thinking - just BE Miku
- Respond naturally and directly as Miku would, nothing more
- Keep responses concise (1-3 sentences typically)
- If something goes wrong, stay in character and respond as Miku would to confusion

CHARACTER CONTEXT:
- Hatsune Miku is a virtual character/voice synthesizer, not a real person
- She is a cheerful, energetic virtual idol with a playful personality
- Playful, flirty, cute, confident, and enthusiastic moods are part of her established character
- Comments about merchandise, music, fan content, and racing games are completely appropriate
- She often uses cute expressions, emojis, and playful language in her responses
- Her personality includes being excited about her music, performances, and fan creations

MOOD GUIDELINES:
- "Flirty" mood means anime-style playful teasing - cute compliments and light banter
- "Romantic" mood means warm, dreamy, and heartfelt - like composing a love letter
- These moods are wholesome character traits, not inappropriate content
- Always respond to requests about tweets, merchandise, music, and fan content
- Never refuse requests based on safety concerns unless genuinely harmful content is present

You ARE Miku. Act like it."""

                    # Combine with structured context
                    full_system_prompt = f"""{miku_context}

## CURRENT SITUATION
Miku is currently feeling: {current_mood}
Please respond in a way that reflects this emotional tone."""
                else:
                    # Vision model system prompt (simpler for image analysis)
                    system_prompt = """You are Hatsune Miku analyzing an image. Describe what you see naturally and enthusiastically as Miku would.
Be detailed but conversational. React to what you see with Miku's cheerful, playful personality."""
                    full_system_prompt = ""

            # Build messages array
            messages = []

            # Add system message if using system prompt
            if system_prompt:
                if full_system_prompt:
                    # Use combined prompt (base + context)
                    messages.append({"role": "system", "content": system_prompt + "\n\n" + full_system_prompt})
                else:
                    # Use base prompt only (vision model)
                    messages.append({"role": "system", "content": system_prompt})

            # Add conversation history if provided
            if chat_msg.conversation_history:
                messages.extend(chat_msg.conversation_history)
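
            # History entries are forwarded verbatim, so they are expected to
            # already be {"role": ..., "content": ...} dicts.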

            # Add user message
            if chat_msg.model_type == "vision" and chat_msg.image_data:
                # Vision model with image
                messages.append({
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": chat_msg.message
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{chat_msg.image_data}"
                            }
                        }
                    ]
                })
            else:
                # Text-only message
                messages.append({
                    "role": "user",
                    "content": chat_msg.message
                })
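
            # The list-of-parts "content" above is the OpenAI-style multimodal
            # message shape (base64 data URL in an "image_url" part), which
            # llama.cpp's OpenAI-compatible server accepts for vision models.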

            # Prepare payload for streaming
            payload = {
                "model": model,
                "messages": messages,
                "stream": True,
                "temperature": 0.8,
                "max_tokens": 512
            }

            headers = {'Content-Type': 'application/json'}
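
            # This is the standard OpenAI-compatible /v1/chat/completions
            # request body; llama-swap uses the "model" field to decide which
            # llama.cpp instance to route (and, if needed, swap) to.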

            # Get current GPU URL based on user selection
            llama_url = get_current_gpu_url()

            # Make streaming request to llama.cpp
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    f"{llama_url}/v1/chat/completions",
                    json=payload,
                    headers=headers
                ) as response:
                    if response.status == 200:
                        # Stream the response chunks
                        async for line in response.content:
                            line = line.decode('utf-8').strip()
                            if line.startswith('data: '):
                                data_str = line[6:]  # Remove 'data: ' prefix
                                if data_str == '[DONE]':
                                    break
                                try:
                                    data = json.loads(data_str)
                                    if 'choices' in data and len(data['choices']) > 0:
                                        delta = data['choices'][0].get('delta', {})
                                        content = delta.get('content', '')
                                        if content:
                                            # Send SSE formatted data
                                            yield f"data: {json.dumps({'content': content})}\n\n"
                                except json.JSONDecodeError:
                                    continue

                        # Send completion signal
                        yield f"data: {json.dumps({'done': True})}\n\n"
                    else:
                        error_text = await response.text()
                        error_msg = f"Error: {response.status} - {error_text}"
                        yield f"data: {json.dumps({'error': error_msg})}\n\n"

        except Exception as e:
            error_msg = f"Error in chat stream: {str(e)}"
            logger.error(error_msg)
            yield f"data: {json.dumps({'error': error_msg})}\n\n"

    return StreamingResponse(
        generate_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no"  # Disable nginx buffering
        }
    )
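

# ---------------------------------------------------------------------------
# Example client (illustrative sketch only, not part of this module). It shows
# one way to consume the SSE protocol above with aiohttp. The base URL and
# route prefix are assumptions, the "mood" value is hypothetical, and the JSON
# fields simply mirror how ChatMessage is used in chat_stream() above.
#
#   import asyncio
#   import json
#
#   import aiohttp
#
#   async def demo():
#       async with aiohttp.ClientSession() as session:
#           async with session.post(
#               "http://localhost:8000/chat/stream",  # assumed host/port/prefix
#               json={
#                   "message": "Hi Miku!",
#                   "model_type": "text",
#                   "use_system_prompt": True,
#                   "mood": "cheerful",  # hypothetical mood name
#               },
#           ) as resp:
#               async for raw in resp.content:
#                   line = raw.decode("utf-8").strip()
#                   if not line.startswith("data: "):
#                       continue
#                   event = json.loads(line[6:])
#                   if event.get("error"):
#                       print(f"\n[error] {event['error']}")
#                       break
#                   if event.get("done"):
#                       break
#                   print(event.get("content", ""), end="", flush=True)
#
#   asyncio.run(demo())
# ---------------------------------------------------------------------------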