Fix vision pipeline: route images through Cat, pass user question to vision model

- Fix silent None return in analyze_image_with_vision exception handler
- Add None/empty guards after vision analysis in bot.py (image, video, GIF, Tenor)
- Route all image/video/GIF responses through Cheshire Cat pipeline (was calling query_llama directly), enabling episodic memory storage for media interactions and correct Last Prompt display in Web UI
- Add media_type parameter to cat_adapter.query() and forward as discord_media_type in WebSocket payload
- Update discord_bridge plugin to read media_type from payload and inject MEDIA NOTE into system prefix in before_agent_starts hook
- Add _extract_vision_question() helper to strip Discord mentions and bot-name triggers from user message; pass cleaned question to vision model so specific questions (e.g. 'what is the person wearing?') go directly to the vision model instead of the generic 'Describe this image in detail.' fallback
- Pass user_prompt to all analyze_image_with_qwen / analyze_video_with_vision call sites in bot.py (image, video, GIF, Tenor, embed paths)
- Fix autonomous reaction loops: skip messages that @mention the bot or have media attachments in DMs, preventing duplicate vision model calls for images already being processed by the main message handler
- Increase vision max_tokens: images 300->800, video/GIF 400->1000 (no VRAM impact; KV cache is pre-allocated at model load time)
2026-03-05 21:59:27 +02:00
parent ae1e0aa144
commit d5b9964ce7
5 changed files with 144 additions and 20 deletions
--- a/bot/bot.py
+++ b/bot/bot.py
@@ -277,7 +277,10 @@ async def on_message(message):
                            return
                        
                        # Analyze image (objective description)
-                        qwen_description = await analyze_image_with_qwen(base64_img)
+                        qwen_description = await analyze_image_with_qwen(base64_img, user_prompt=prompt)
+                        if not qwen_description or not qwen_description.strip():
+                            await message.channel.send("I couldn't see that image clearly, sorry! Try sending it again.")
+                            return
                        # For DMs, pass None as guild_id to use DM mood
                        guild_id = message.guild.id if message.guild else None
                        miku_reply = await rephrase_as_miku(
@@ -349,7 +352,10 @@ async def on_message(message):
                        logger.debug(f"📹 Extracted {len(frames)} frames from {attachment.filename}")
                        
                        # Analyze the video/GIF with appropriate media type
-                        video_description = await analyze_video_with_vision(frames, media_type=media_type)
+                        video_description = await analyze_video_with_vision(frames, media_type=media_type, user_prompt=prompt)
+                        if not video_description or not video_description.strip():
+                            await message.channel.send(f"I couldn't analyze that {media_type} clearly, sorry! Try sending it again.")
+                            return
                        # For DMs, pass None as guild_id to use DM mood
                        guild_id = message.guild.id if message.guild else None
                        miku_reply = await rephrase_as_miku(
@@ -432,7 +438,10 @@ async def on_message(message):
                        logger.info(f"📹 Extracted {len(frames)} frames from Tenor GIF")
                        
                        # Analyze the GIF with tenor_gif media type
-                        video_description = await analyze_video_with_vision(frames, media_type="tenor_gif")
+                        video_description = await analyze_video_with_vision(frames, media_type="tenor_gif", user_prompt=prompt)
+                        if not video_description or not video_description.strip():
+                            await message.channel.send("I couldn't analyze that GIF clearly, sorry! Try sending it again.")
+                            return
                        guild_id = message.guild.id if message.guild else None
                        miku_reply = await rephrase_as_miku(
                            video_description, 
@@ -490,7 +499,7 @@ async def on_message(message):
                                    if base64_img:
                                        logger.info(f"Image downloaded, analyzing with vision model...")
                                        # Analyze image
-                                        qwen_description = await analyze_image_with_qwen(base64_img)
+                                        qwen_description = await analyze_image_with_qwen(base64_img, user_prompt=prompt)
                                        truncated = (qwen_description[:50] + "...") if len(qwen_description) > 50 else qwen_description
                                        logger.error(f"Vision analysis result: {truncated}")
                                        if qwen_description and qwen_description.strip():
@@ -514,7 +523,7 @@ async def on_message(message):
                                        frames = await extract_video_frames(media_bytes, num_frames=6)
                                        if frames:
                                            logger.info(f"📹 Extracted {len(frames)} frames, analyzing with vision model...")
-                                            video_description = await analyze_video_with_vision(frames, media_type="video")
+                                            video_description = await analyze_video_with_vision(frames, media_type="video", user_prompt=prompt)
                                            logger.info(f"Video analysis result: {video_description[:100]}...")
                                            if video_description and video_description.strip():
                                                embed_context_parts.append(f"[Embedded video shows: {video_description}]")
--- a/bot/utils/autonomous_v1_legacy.py
+++ b/bot/utils/autonomous_v1_legacy.py
@@ -814,6 +814,9 @@ async def miku_autonomous_reaction_for_server(guild_id: int, force_message=None,
                # Skip bot's own messages
                if message.author == globals.client.user:
                    continue
+                # Skip messages that directly addressed Miku (handled by main handler)
+                if globals.client.user and globals.client.user in message.mentions:
+                    continue
                # Skip messages we've already reacted to
                if message.id in _reacted_message_ids:
                    continue
@@ -979,6 +982,11 @@ async def miku_autonomous_reaction_for_dm(user_id: int, force_message=None):
                # Skip bot's own messages
                if message.author == globals.client.user:
                    continue
+                # Skip messages with media attachments in DMs — these are always directed at
+                # Miku and already processed by the main on_message handler, so analyzing them
+                # again here would trigger a redundant vision model call
+                if message.attachments:
+                    continue
                # Skip messages we've already reacted to
                if message.id in _reacted_message_ids:
                    continue
--- a/bot/utils/cat_client.py
+++ b/bot/utils/cat_client.py
@@ -107,6 +107,7 @@ class CatAdapter:
        author_name: Optional[str] = None,
        mood: Optional[str] = None,
        response_type: str = "dm_response",
+        media_type: Optional[str] = None,
    ) -> Optional[tuple]:
        """
        Send a message through the Cat pipeline via WebSocket and get a response.
@@ -123,6 +124,7 @@ class CatAdapter:
            author_name: Display name of the user
            mood: Current mood name (passed as metadata for Cat hooks)
            response_type: Type of response context
+            media_type: Type of media attachment ("image", "video", "gif", "tenor_gif")
            
        Returns:
            Tuple of (response_text, full_prompt) on success, or None if Cat
@@ -156,6 +158,9 @@ class CatAdapter:
            payload["discord_response_type"] = response_type
        # Pass evil mode flag so discord_bridge stores it in working_memory
        payload["discord_evil_mode"] = globals.EVIL_MODE
+        # Pass media type so discord_bridge can add MEDIA NOTE to the prompt
+        if media_type:
+            payload["discord_media_type"] = media_type

        try:
            # Build WebSocket URL from HTTP base URL
--- a/bot/utils/image_handling.py
+++ b/bot/utils/image_handling.py
@@ -17,6 +17,26 @@ logger = get_logger('vision')
 # No need for switch_model anymore - llama-swap handles this automatically


+def _extract_vision_question(prompt: str):
+    """
+    Strip Discord mentions and bot-name triggers from the user's message to
+    produce a clean question suitable for passing directly to the vision model.
+
+    Returns the cleaned question string, or None if nothing meaningful remains
+    (e.g. the message was just "@Miku" or "miku," with no actual question).
+    """
+    if not prompt:
+        return None
+    # Remove Discord user/role mentions: <@123456789>, <@!123456789>
+    text = re.sub(r'<@[!&]?\d+>', '', prompt).strip()
+    # Strip common bot-name invocation prefixes at the very start (case-insensitive)
+    # e.g. "miku,", "hey miku,", "miku!", "Miku: "
+    text = re.sub(r'^(?:hey\s+)?miku[,!:\s]+', '', text, flags=re.IGNORECASE).strip()
+    # Drop any residual leading punctuation/whitespace
+    text = text.lstrip(',.!? ')
+    return text if text else None
+
+
 async def download_and_encode_image(url):
    """Download and encode an image to base64."""
    async with aiohttp.ClientSession() as session:
@@ -233,11 +253,15 @@ async def extract_video_frames(video_bytes, num_frames=4):
    return None


-async def analyze_image_with_vision(base64_img):
+async def analyze_image_with_vision(base64_img, user_prompt=None):
    """
    Analyze an image using llama.cpp multimodal capabilities.
    Uses OpenAI-compatible chat completions API with image_url.
    Always uses NVIDIA GPU for vision model.
+
+    If user_prompt is provided (and contains a meaningful question after stripping
+    mentions/triggers), that question is sent to the vision model instead of the
+    generic "Describe this image in detail." prompt.
    """
    from utils.llm import get_vision_gpu_url, check_vision_endpoint_health
    
@@ -247,6 +271,10 @@ async def analyze_image_with_vision(base64_img):
        logger.warning(f"Vision endpoint unhealthy: {error}")
        return f"Vision service currently unavailable: {error}"
    
+    question = _extract_vision_question(user_prompt)
+    vision_prompt_text = question if question else "Describe this image in detail."
+    logger.info(f"Vision prompt for image: {vision_prompt_text!r}")
+
    payload = {
        "model": globals.VISION_MODEL,
        "messages": [
@@ -255,7 +283,7 @@ async def analyze_image_with_vision(base64_img):
                "content": [
                    {
                        "type": "text",
-                        "text": "Describe this image in detail."
+                        "text": vision_prompt_text
                    },
                    {
                        "type": "image_url",
@@ -267,7 +295,7 @@ async def analyze_image_with_vision(base64_img):
            }
        ],
        "stream": False,
-        "max_tokens": 300
+        "max_tokens": 800
    }

    headers = {"Content-Type": "application/json"}
@@ -289,13 +317,16 @@ async def analyze_image_with_vision(base64_img):
                    return f"Error analyzing image: {response.status}"
        except Exception as e:
            logger.error(f"Error in analyze_image_with_vision: {e}", exc_info=True)
+            return f"Error analyzing image: {str(e)}"


-async def analyze_video_with_vision(video_frames, media_type="video"):
+async def analyze_video_with_vision(video_frames, media_type="video", user_prompt=None):
    """
    Analyze a video or GIF by analyzing multiple frames.
    video_frames: list of base64-encoded frames
    media_type: "video", "gif", or "tenor_gif" to customize the analysis prompt
+    user_prompt: optional raw user message; the vision model will be asked to answer
+                 the specific question instead of giving a generic description.
    """
    from utils.llm import get_vision_gpu_url, check_vision_endpoint_health
    
@@ -305,8 +336,12 @@ async def analyze_video_with_vision(video_frames, media_type="video"):
        logger.warning(f"Vision endpoint unhealthy: {error}")
        return f"Vision service currently unavailable: {error}"
    
-    # Customize prompt based on media type
-    if media_type == "gif":
+    # Customize prompt based on media type, overridden by user question if present
+    question = _extract_vision_question(user_prompt)
+    if question:
+        prompt_text = question
+        logger.info(f"Vision prompt for {media_type}: {prompt_text!r}")
+    elif media_type == "gif":
        prompt_text = "Describe what's happening in this GIF animation. Analyze the sequence of frames and describe the action, motion, and any repeating patterns."
    elif media_type == "tenor_gif":
        prompt_text = "Describe what's happening in this animated GIF. Analyze the sequence of frames and describe the action, emotion, or reaction being shown."
@@ -339,7 +374,7 @@ async def analyze_video_with_vision(video_frames, media_type="video"):
            }
        ],
        "stream": False,
-        "max_tokens": 400
+        "max_tokens": 1000
    }

    headers = {"Content-Type": "application/json"}
@@ -368,6 +403,9 @@ async def rephrase_as_miku(vision_output, user_prompt, guild_id=None, user_id=No
    """
    Rephrase vision model's image analysis as Miku would respond to it.
    
+    Routes through Cheshire Cat pipeline for memory-augmented responses,
+    falling back to direct query_llama() if Cat is unavailable.
+    
    Args:
        vision_output: Description from vision model
        user_prompt: User's original message
@@ -402,14 +440,64 @@ async def rephrase_as_miku(vision_output, user_prompt, guild_id=None, user_id=No
    # Use the actual user_id for history tracking, fall back to "image_analysis" for backward compatibility
    history_user_id = user_id if user_id else "image_analysis"
    
-    return await query_llama(
-        formatted_prompt, 
-        user_id=history_user_id, 
-        guild_id=guild_id, 
-        response_type=response_type,
-        author_name=author_name,
-        media_type=media_type  # Pass media type to Miku's LLM
-    )
+    # Determine current mood for Cat pipeline
+    current_mood = globals.DM_MOOD
+    if guild_id:
+        try:
+            from server_manager import server_manager
+            sc = server_manager.get_server_config(guild_id)
+            if sc:
+                current_mood = sc.current_mood_name
+        except Exception:
+            pass
+
+    # Phase 3: Try Cheshire Cat pipeline first (memory-augmented response)
+    # This allows image interactions to be stored in episodic memory and
+    # benefit from declarative memory recall, just like text messages.
+    response = None
+    if globals.USE_CHESHIRE_CAT:
+        try:
+            from utils.cat_client import cat_adapter
+            cat_result = await cat_adapter.query(
+                text=formatted_prompt,
+                user_id=history_user_id,
+                guild_id=str(guild_id) if guild_id else None,
+                author_name=author_name,
+                mood=current_mood,
+                response_type=response_type,
+                media_type=media_type,
+            )
+            if cat_result:
+                response, cat_full_prompt = cat_result
+                effective_mood = current_mood
+                if globals.EVIL_MODE:
+                    effective_mood = f"EVIL:{getattr(globals, 'EVIL_DM_MOOD', 'evil_neutral')}"
+                logger.info(f"🐱 Cat {media_type} response for {author_name} (mood: {effective_mood})")
+                # Track Cat interaction for Web UI Last Prompt view
+                import datetime
+                globals.LAST_CAT_INTERACTION = {
+                    "full_prompt": cat_full_prompt,
+                    "response": response[:500] if response else "",
+                    "user": author_name or history_user_id,
+                    "mood": effective_mood,
+                    "timestamp": datetime.datetime.now().isoformat(),
+                }
+        except Exception as e:
+            logger.warning(f"🐱 Cat {media_type} pipeline error, falling back to query_llama: {e}")
+            response = None
+
+    # Fallback to direct LLM query if Cat didn't respond
+    if not response:
+        response = await query_llama(
+            formatted_prompt, 
+            user_id=history_user_id, 
+            guild_id=guild_id, 
+            response_type=response_type,
+            author_name=author_name,
+            media_type=media_type  # Pass media type to Miku's LLM
+        )
+    
+    return response

 # Backward compatibility aliases
 analyze_image_with_qwen = analyze_image_with_vision
--- a/cat-plugins/discord_bridge/discord_bridge.py
+++ b/cat-plugins/discord_bridge/discord_bridge.py
@@ -42,6 +42,7 @@ def before_cat_reads_message(user_message_json: dict, cat) -> dict:
    mood = user_message_json.get('discord_mood', None)
    response_type = user_message_json.get('discord_response_type', None)
    evil_mode = user_message_json.get('discord_evil_mode', False)
+    media_type = user_message_json.get('discord_media_type', None)

    # Also check working memory for backward compatibility
    if not guild_id:
@@ -53,6 +54,7 @@ def before_cat_reads_message(user_message_json: dict, cat) -> dict:
    cat.working_memory['mood'] = mood
    cat.working_memory['response_type'] = response_type
    cat.working_memory['evil_mode'] = evil_mode
+    cat.working_memory['media_type'] = media_type
    
    return user_message_json

@@ -263,6 +265,18 @@ CRITICAL RULES:
 Miku is currently feeling: {mood_description}
 Please respond in a way that reflects this emotional tone."""

+        # Add media type awareness if provided (image/video/gif analysis)
+        media_type = cat.working_memory.get('media_type', None)
+        if media_type:
+            media_descriptions = {
+                "image": "The user has sent you an image.",
+                "video": "The user has sent you a video clip.",
+                "gif": "The user has sent you an animated GIF.",
+                "tenor_gif": "The user has sent you an animated GIF (from Tenor - likely a reaction GIF or meme)."
+            }
+            media_note = media_descriptions.get(media_type, f"The user has sent you {media_type}.")
+            system_prefix += f"\n\n📎 MEDIA NOTE: {media_note}\nYour vision analysis of this {media_type} is included in the user's message with the [Looking at...] prefix."
+
    except Exception as e:
        print(f"   [Discord Bridge] Error building system prefix: {e}")
        system_prefix = cat.working_memory.get('full_system_prefix', '[system prefix not available]')