Implemented a new Japanese-only text mode with a WebUI toggle, using a Llama 3.1 Swallow dataset model. Next up is Japanese TTS.
This commit is contained in:
@@ -38,8 +38,47 @@ def get_vision_gpu_url():
|
||||
Strategy: Always use NVIDIA GPU for vision to avoid unloading/reloading.
|
||||
- When NVIDIA is primary: Use NVIDIA for both text and vision
|
||||
- When AMD is primary: Use AMD for text, NVIDIA for vision (keeps vision loaded)
|
||||
|
||||
Important: Vision model (MiniCPM-V) is ONLY configured on NVIDIA GPU.
|
||||
This ensures vision inference is always fast and doesn't interfere with
|
||||
AMD text model inference.
|
||||
"""
|
||||
return globals.LLAMA_URL # Always use NVIDIA for vision
|
||||
current_text_gpu = get_current_gpu_url()
|
||||
nvidia_vision_url = globals.LLAMA_URL
|
||||
|
||||
# Vision ALWAYS uses NVIDIA, regardless of which GPU is primary for text
|
||||
# Log this decision when GPU switching is active (primary text GPU is AMD)
|
||||
if current_text_gpu == globals.LLAMA_AMD_URL:
|
||||
logger.debug(f"Primary GPU is AMD for text, but using NVIDIA for vision model")
|
||||
|
||||
return nvidia_vision_url # Always use NVIDIA for vision
|
||||
|
||||
async def check_vision_endpoint_health():
    """
    Check if NVIDIA GPU vision endpoint is healthy and responsive.
    This is important when AMD is the primary GPU to ensure vision still works.

    Performs a single GET against the endpoint's /health route with a
    5-second overall timeout.

    Returns:
        Tuple of (is_healthy: bool, error_message: Optional[str]) where
        error_message is None on success, "Endpoint timeout" on timeout,
        an HTTP status description, or the stringified exception.
    """
    # Import asyncio locally alongside aiohttp: the except clause below
    # needs asyncio.TimeoutError, and a module-level asyncio import is
    # not guaranteed to exist (bug fix — previously only aiohttp was
    # imported here).
    import asyncio
    import aiohttp

    vision_url = get_vision_gpu_url()

    try:
        async with aiohttp.ClientSession() as session:
            # total=5 bounds the whole request (connect + read); a healthy
            # endpoint should respond well within that budget.
            async with session.get(
                f"{vision_url}/health",
                timeout=aiohttp.ClientTimeout(total=5),
            ) as response:
                is_healthy = response.status == 200
                if is_healthy:
                    logger.info(f"Vision endpoint ({vision_url}) health check: OK")
                else:
                    logger.warning(f"Vision endpoint ({vision_url}) health check failed: status {response.status}")
                return is_healthy, None if is_healthy else f"Status {response.status}"
    except asyncio.TimeoutError:
        logger.error(f"Vision endpoint ({vision_url}) health check: timeout")
        return False, "Endpoint timeout"
    except Exception as e:
        # Covers DNS failures, connection refusal, aiohttp client errors, etc.
        logger.error(f"Vision endpoint ({vision_url}) health check error: {e}")
        return False, str(e)
|
||||
|
||||
def _strip_surrounding_quotes(text):
|
||||
"""
|
||||
@@ -108,8 +147,12 @@ async def query_llama(user_prompt, user_id, guild_id=None, response_type="dm_res
|
||||
if evil_mode:
|
||||
model = globals.EVIL_TEXT_MODEL # Use DarkIdol uncensored model
|
||||
logger.info(f"Using evil model: {model}")
|
||||
elif globals.LANGUAGE_MODE == "japanese":
|
||||
model = globals.JAPANESE_TEXT_MODEL # Use Swallow for Japanese
|
||||
logger.info(f"Using Japanese model: {model}")
|
||||
else:
|
||||
model = globals.TEXT_MODEL
|
||||
logger.info(f"Using default model: {model}")
|
||||
|
||||
# Determine channel_id for conversation history
|
||||
# For servers, use guild_id; for DMs, use user_id
|
||||
|
||||
Reference in New Issue
Block a user