diff --git a/bot/bot.py b/bot/bot.py index 71def0d..c3459bd 100644 --- a/bot/bot.py +++ b/bot/bot.py @@ -277,7 +277,10 @@ async def on_message(message): return # Analyze image (objective description) - qwen_description = await analyze_image_with_qwen(base64_img) + qwen_description = await analyze_image_with_qwen(base64_img, user_prompt=prompt) + if not qwen_description or not qwen_description.strip(): + await message.channel.send("I couldn't see that image clearly, sorry! Try sending it again.") + return # For DMs, pass None as guild_id to use DM mood guild_id = message.guild.id if message.guild else None miku_reply = await rephrase_as_miku( @@ -349,7 +352,10 @@ async def on_message(message): logger.debug(f"šŸ“¹ Extracted {len(frames)} frames from {attachment.filename}") # Analyze the video/GIF with appropriate media type - video_description = await analyze_video_with_vision(frames, media_type=media_type) + video_description = await analyze_video_with_vision(frames, media_type=media_type, user_prompt=prompt) + if not video_description or not video_description.strip(): + await message.channel.send(f"I couldn't analyze that {media_type} clearly, sorry! Try sending it again.") + return # For DMs, pass None as guild_id to use DM mood guild_id = message.guild.id if message.guild else None miku_reply = await rephrase_as_miku( @@ -432,7 +438,10 @@ async def on_message(message): logger.info(f"šŸ“¹ Extracted {len(frames)} frames from Tenor GIF") # Analyze the GIF with tenor_gif media type - video_description = await analyze_video_with_vision(frames, media_type="tenor_gif") + video_description = await analyze_video_with_vision(frames, media_type="tenor_gif", user_prompt=prompt) + if not video_description or not video_description.strip(): + await message.channel.send("I couldn't analyze that GIF clearly, sorry! Try sending it again.") + return guild_id = message.guild.id if message.guild else None miku_reply = await rephrase_as_miku( video_description, @@ -490,7 +499,7 @@ async def on_message(message): if base64_img: logger.info(f"Image downloaded, analyzing with vision model...") # Analyze image - qwen_description = await analyze_image_with_qwen(base64_img) + qwen_description = await analyze_image_with_qwen(base64_img, user_prompt=prompt) truncated = (qwen_description[:50] + "...") if len(qwen_description) > 50 else qwen_description logger.error(f"Vision analysis result: {truncated}") if qwen_description and qwen_description.strip(): @@ -514,7 +523,7 @@ async def on_message(message): frames = await extract_video_frames(media_bytes, num_frames=6) if frames: logger.info(f"šŸ“¹ Extracted {len(frames)} frames, analyzing with vision model...") - video_description = await analyze_video_with_vision(frames, media_type="video") + video_description = await analyze_video_with_vision(frames, media_type="video", user_prompt=prompt) logger.info(f"Video analysis result: {video_description[:100]}...") if video_description and video_description.strip(): embed_context_parts.append(f"[Embedded video shows: {video_description}]") diff --git a/bot/utils/autonomous_v1_legacy.py b/bot/utils/autonomous_v1_legacy.py index 49847d9..9a5c5e1 100644 --- a/bot/utils/autonomous_v1_legacy.py +++ b/bot/utils/autonomous_v1_legacy.py @@ -814,6 +814,9 @@ async def miku_autonomous_reaction_for_server(guild_id: int, force_message=None, # Skip bot's own messages if message.author == globals.client.user: continue + # Skip messages that directly addressed Miku (handled by main handler) + if globals.client.user and globals.client.user in message.mentions: + continue # Skip messages we've already reacted to if message.id in _reacted_message_ids: continue @@ -979,6 +982,11 @@ async def miku_autonomous_reaction_for_dm(user_id: int, force_message=None): # Skip bot's own messages if message.author == globals.client.user: continue + # Skip messages with media attachments in DMs — these are always directed at + # Miku and already processed by the main on_message handler, so analyzing them + # again here would trigger a redundant vision model call + if message.attachments: + continue # Skip messages we've already reacted to if message.id in _reacted_message_ids: continue diff --git a/bot/utils/cat_client.py b/bot/utils/cat_client.py index feeb9a5..b0d297b 100644 --- a/bot/utils/cat_client.py +++ b/bot/utils/cat_client.py @@ -107,6 +107,7 @@ class CatAdapter: author_name: Optional[str] = None, mood: Optional[str] = None, response_type: str = "dm_response", + media_type: Optional[str] = None, ) -> Optional[tuple]: """ Send a message through the Cat pipeline via WebSocket and get a response. @@ -123,6 +124,7 @@ class CatAdapter: author_name: Display name of the user mood: Current mood name (passed as metadata for Cat hooks) response_type: Type of response context + media_type: Type of media attachment ("image", "video", "gif", "tenor_gif") Returns: Tuple of (response_text, full_prompt) on success, or None if Cat @@ -156,6 +158,9 @@ class CatAdapter: payload["discord_response_type"] = response_type # Pass evil mode flag so discord_bridge stores it in working_memory payload["discord_evil_mode"] = globals.EVIL_MODE + # Pass media type so discord_bridge can add MEDIA NOTE to the prompt + if media_type: + payload["discord_media_type"] = media_type try: # Build WebSocket URL from HTTP base URL diff --git a/bot/utils/image_handling.py b/bot/utils/image_handling.py index b080d5f..cde61dc 100644 --- a/bot/utils/image_handling.py +++ b/bot/utils/image_handling.py @@ -17,6 +17,26 @@ logger = get_logger('vision') # No need for switch_model anymore - llama-swap handles this automatically +def _extract_vision_question(prompt: str): + """ + Strip Discord mentions and bot-name triggers from the user's message to + produce a clean question suitable for passing directly to the vision model. + + Returns the cleaned question string, or None if nothing meaningful remains + (e.g. the message was just "@Miku" or "miku," with no actual question). + """ + if not prompt: + return None + # Remove Discord user/role mentions: <@123456789>, <@!123456789> + text = re.sub(r'<@[!&]?\d+>', '', prompt).strip() + # Strip common bot-name invocation prefixes at the very start (case-insensitive) + # e.g. "miku,", "hey miku,", "miku!", "Miku: " + text = re.sub(r'^(?:hey\s+)?miku[,!:\s]+', '', text, flags=re.IGNORECASE).strip() + # Drop any residual leading punctuation/whitespace + text = text.lstrip(',.!? ') + return text if text else None + + async def download_and_encode_image(url): """Download and encode an image to base64.""" async with aiohttp.ClientSession() as session: @@ -233,11 +253,15 @@ async def extract_video_frames(video_bytes, num_frames=4): return None -async def analyze_image_with_vision(base64_img): +async def analyze_image_with_vision(base64_img, user_prompt=None): """ Analyze an image using llama.cpp multimodal capabilities. Uses OpenAI-compatible chat completions API with image_url. Always uses NVIDIA GPU for vision model. + + If user_prompt is provided (and contains a meaningful question after stripping + mentions/triggers), that question is sent to the vision model instead of the + generic "Describe this image in detail." prompt. """ from utils.llm import get_vision_gpu_url, check_vision_endpoint_health @@ -247,6 +271,10 @@ async def analyze_image_with_vision(base64_img): logger.warning(f"Vision endpoint unhealthy: {error}") return f"Vision service currently unavailable: {error}" + question = _extract_vision_question(user_prompt) + vision_prompt_text = question if question else "Describe this image in detail." + logger.info(f"Vision prompt for image: {vision_prompt_text!r}") + payload = { "model": globals.VISION_MODEL, "messages": [ @@ -255,7 +283,7 @@ async def analyze_image_with_vision(base64_img): "content": [ { "type": "text", - "text": "Describe this image in detail." + "text": vision_prompt_text }, { "type": "image_url", @@ -267,7 +295,7 @@ async def analyze_image_with_vision(base64_img): } ], "stream": False, - "max_tokens": 300 + "max_tokens": 800 } headers = {"Content-Type": "application/json"} @@ -289,13 +317,16 @@ async def analyze_image_with_vision(base64_img): return f"Error analyzing image: {response.status}" except Exception as e: logger.error(f"Error in analyze_image_with_vision: {e}", exc_info=True) + return f"Error analyzing image: {str(e)}" -async def analyze_video_with_vision(video_frames, media_type="video"): +async def analyze_video_with_vision(video_frames, media_type="video", user_prompt=None): """ Analyze a video or GIF by analyzing multiple frames. video_frames: list of base64-encoded frames media_type: "video", "gif", or "tenor_gif" to customize the analysis prompt + user_prompt: optional raw user message; the vision model will be asked to answer + the specific question instead of giving a generic description. """ from utils.llm import get_vision_gpu_url, check_vision_endpoint_health @@ -305,8 +336,12 @@ async def analyze_video_with_vision(video_frames, media_type="video"): logger.warning(f"Vision endpoint unhealthy: {error}") return f"Vision service currently unavailable: {error}" - # Customize prompt based on media type - if media_type == "gif": + # Customize prompt based on media type, overridden by user question if present + question = _extract_vision_question(user_prompt) + if question: + prompt_text = question + logger.info(f"Vision prompt for {media_type}: {prompt_text!r}") + elif media_type == "gif": prompt_text = "Describe what's happening in this GIF animation. Analyze the sequence of frames and describe the action, motion, and any repeating patterns." elif media_type == "tenor_gif": prompt_text = "Describe what's happening in this animated GIF. Analyze the sequence of frames and describe the action, emotion, or reaction being shown." @@ -339,7 +374,7 @@ async def analyze_video_with_vision(video_frames, media_type="video"): } ], "stream": False, - "max_tokens": 400 + "max_tokens": 1000 } headers = {"Content-Type": "application/json"} @@ -368,6 +403,9 @@ async def rephrase_as_miku(vision_output, user_prompt, guild_id=None, user_id=No """ Rephrase vision model's image analysis as Miku would respond to it. + Routes through Cheshire Cat pipeline for memory-augmented responses, + falling back to direct query_llama() if Cat is unavailable. + Args: vision_output: Description from vision model user_prompt: User's original message @@ -402,14 +440,64 @@ async def rephrase_as_miku(vision_output, user_prompt, guild_id=None, user_id=No # Use the actual user_id for history tracking, fall back to "image_analysis" for backward compatibility history_user_id = user_id if user_id else "image_analysis" - return await query_llama( - formatted_prompt, - user_id=history_user_id, - guild_id=guild_id, - response_type=response_type, - author_name=author_name, - media_type=media_type # Pass media type to Miku's LLM - ) + # Determine current mood for Cat pipeline + current_mood = globals.DM_MOOD + if guild_id: + try: + from server_manager import server_manager + sc = server_manager.get_server_config(guild_id) + if sc: + current_mood = sc.current_mood_name + except Exception: + pass + + # Phase 3: Try Cheshire Cat pipeline first (memory-augmented response) + # This allows image interactions to be stored in episodic memory and + # benefit from declarative memory recall, just like text messages. + response = None + if globals.USE_CHESHIRE_CAT: + try: + from utils.cat_client import cat_adapter + cat_result = await cat_adapter.query( + text=formatted_prompt, + user_id=history_user_id, + guild_id=str(guild_id) if guild_id else None, + author_name=author_name, + mood=current_mood, + response_type=response_type, + media_type=media_type, + ) + if cat_result: + response, cat_full_prompt = cat_result + effective_mood = current_mood + if globals.EVIL_MODE: + effective_mood = f"EVIL:{getattr(globals, 'EVIL_DM_MOOD', 'evil_neutral')}" + logger.info(f"🐱 Cat {media_type} response for {author_name} (mood: {effective_mood})") + # Track Cat interaction for Web UI Last Prompt view + import datetime + globals.LAST_CAT_INTERACTION = { + "full_prompt": cat_full_prompt, + "response": response[:500] if response else "", + "user": author_name or history_user_id, + "mood": effective_mood, + "timestamp": datetime.datetime.now().isoformat(), + } + except Exception as e: + logger.warning(f"🐱 Cat {media_type} pipeline error, falling back to query_llama: {e}") + response = None + + # Fallback to direct LLM query if Cat didn't respond + if not response: + response = await query_llama( + formatted_prompt, + user_id=history_user_id, + guild_id=guild_id, + response_type=response_type, + author_name=author_name, + media_type=media_type # Pass media type to Miku's LLM + ) + + return response # Backward compatibility aliases analyze_image_with_qwen = analyze_image_with_vision diff --git a/cat-plugins/discord_bridge/discord_bridge.py b/cat-plugins/discord_bridge/discord_bridge.py index 2a0c62c..d77ae8f 100644 --- a/cat-plugins/discord_bridge/discord_bridge.py +++ b/cat-plugins/discord_bridge/discord_bridge.py @@ -42,6 +42,7 @@ def before_cat_reads_message(user_message_json: dict, cat) -> dict: mood = user_message_json.get('discord_mood', None) response_type = user_message_json.get('discord_response_type', None) evil_mode = user_message_json.get('discord_evil_mode', False) + media_type = user_message_json.get('discord_media_type', None) # Also check working memory for backward compatibility if not guild_id: @@ -53,6 +54,7 @@ def before_cat_reads_message(user_message_json: dict, cat) -> dict: cat.working_memory['mood'] = mood cat.working_memory['response_type'] = response_type cat.working_memory['evil_mode'] = evil_mode + cat.working_memory['media_type'] = media_type return user_message_json @@ -263,6 +265,18 @@ CRITICAL RULES: Miku is currently feeling: {mood_description} Please respond in a way that reflects this emotional tone.""" + # Add media type awareness if provided (image/video/gif analysis) + media_type = cat.working_memory.get('media_type', None) + if media_type: + media_descriptions = { + "image": "The user has sent you an image.", + "video": "The user has sent you a video clip.", + "gif": "The user has sent you an animated GIF.", + "tenor_gif": "The user has sent you an animated GIF (from Tenor - likely a reaction GIF or meme)." + } + media_note = media_descriptions.get(media_type, f"The user has sent you {media_type}.") + system_prefix += f"\n\nšŸ“Ž MEDIA NOTE: {media_note}\nYour vision analysis of this {media_type} is included in the user's message with the [Looking at...] prefix." + except Exception as e: print(f" [Discord Bridge] Error building system prefix: {e}") system_prefix = cat.working_memory.get('full_system_prefix', '[system prefix not available]')