Implemented a new Japanese-only text mode with a WebUI toggle, using a Llama 3.1 Swallow model. Next up is Japanese TTS.

2026-01-23 15:02:36 +02:00
parent eb03dfce4d
commit fe0962118b
8 changed files with 318 additions and 44 deletions
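Note: the Japanese-only toggle itself is not visible in the hunks below (this file's diff covers the vision health-check changes). As a rough sketch only, a WebUI toggle could route text generation to a Swallow model along these lines; the names JAPANESE_ONLY_MODE, JAPANESE_TEXT_MODEL, select_text_model, and build_system_prompt are hypothetical and not taken from this commit.

# Hypothetical sketch: this wiring is not part of the diff shown below.
JAPANESE_TEXT_MODEL = "llama-3.1-swallow-8b-instruct"  # assumed model id

def select_text_model(cfg):
    """Pick the text model based on the WebUI toggle (illustrative only)."""
    if getattr(cfg, "JAPANESE_ONLY_MODE", False):
        return JAPANESE_TEXT_MODEL
    return cfg.TEXT_MODEL  # assumed existing default text model setting

def build_system_prompt(cfg, base_prompt):
    """Append a Japanese-only instruction when the toggle is enabled."""
    if getattr(cfg, "JAPANESE_ONLY_MODE", False):
        return base_prompt + "\nAlways respond in Japanese only."
    return base_prompt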


@@ -239,7 +239,13 @@ async def analyze_image_with_vision(base64_img):
     Uses OpenAI-compatible chat completions API with image_url.
     Always uses NVIDIA GPU for vision model.
     """
-    from utils.llm import get_vision_gpu_url
+    from utils.llm import get_vision_gpu_url, check_vision_endpoint_health
+
+    # Check if vision endpoint is healthy before attempting request
+    is_healthy, error = await check_vision_endpoint_health()
+    if not is_healthy:
+        logger.warning(f"Vision endpoint unhealthy: {error}")
+        return f"Vision service currently unavailable: {error}"
 
     payload = {
         "model": globals.VISION_MODEL,
@@ -269,17 +275,20 @@ async def analyze_image_with_vision(base64_img):
     async with aiohttp.ClientSession() as session:
         try:
             vision_url = get_vision_gpu_url()
-            async with session.post(f"{vision_url}/v1/chat/completions", json=payload, headers=headers) as response:
+            logger.info(f"Sending vision request to {vision_url} using model: {globals.VISION_MODEL}")
+            async with session.post(f"{vision_url}/v1/chat/completions", json=payload, headers=headers, timeout=aiohttp.ClientTimeout(total=60)) as response:
                 if response.status == 200:
                     data = await response.json()
-                    return data.get("choices", [{}])[0].get("message", {}).get("content", "No description.")
+                    result = data.get("choices", [{}])[0].get("message", {}).get("content", "No description.")
+                    logger.info(f"Vision analysis completed successfully")
+                    return result
                 else:
                     error_text = await response.text()
-                    logger.error(f"Vision API error: {response.status} - {error_text}")
+                    logger.error(f"Vision API error: {response.status} - {error_text} (endpoint: {vision_url})")
                     return f"Error analyzing image: {response.status}"
         except Exception as e:
-            logger.error(f"Error in analyze_image_with_vision: {e}")
+            logger.error(f"Error in analyze_image_with_vision: {e}", exc_info=True)
             return f"Error analyzing image: {str(e)}"
 
 
 async def analyze_video_with_vision(video_frames, media_type="video"):
@@ -288,6 +297,13 @@ async def analyze_video_with_vision(video_frames, media_type="video"):
         video_frames: list of base64-encoded frames
         media_type: "video", "gif", or "tenor_gif" to customize the analysis prompt
     """
+    from utils.llm import get_vision_gpu_url, check_vision_endpoint_health
+
+    # Check if vision endpoint is healthy before attempting request
+    is_healthy, error = await check_vision_endpoint_health()
+    if not is_healthy:
+        logger.warning(f"Vision endpoint unhealthy: {error}")
+        return f"Vision service currently unavailable: {error}"
 
     # Customize prompt based on media type
     if media_type == "gif":
@@ -331,16 +347,20 @@ async def analyze_video_with_vision(video_frames, media_type="video"):
     async with aiohttp.ClientSession() as session:
         try:
             vision_url = get_vision_gpu_url()
-            async with session.post(f"{vision_url}/v1/chat/completions", json=payload, headers=headers) as response:
+            logger.info(f"Sending video analysis request to {vision_url} using model: {globals.VISION_MODEL} (media_type: {media_type}, frames: {len(video_frames)})")
+            async with session.post(f"{vision_url}/v1/chat/completions", json=payload, headers=headers, timeout=aiohttp.ClientTimeout(total=120)) as response:
                 if response.status == 200:
                     data = await response.json()
-                    return data.get("choices", [{}])[0].get("message", {}).get("content", "No description.")
+                    result = data.get("choices", [{}])[0].get("message", {}).get("content", "No description.")
+                    logger.info(f"Video analysis completed successfully")
+                    return result
                 else:
                     error_text = await response.text()
-                    logger.error(f"Vision API error: {response.status} - {error_text}")
+                    logger.error(f"Vision API error: {response.status} - {error_text} (endpoint: {vision_url})")
                     return f"Error analyzing video: {response.status}"
         except Exception as e:
-            logger.error(f"Error in analyze_video_with_vision: {e}")
+            logger.error(f"Error in analyze_video_with_vision: {e}", exc_info=True)
             return f"Error analyzing video: {str(e)}"