diff --git a/bot/bot.py b/bot/bot.py
index 71def0d..c3459bd 100644
--- a/bot/bot.py
+++ b/bot/bot.py
@@ -277,7 +277,10 @@ async def on_message(message):
                             return
                         
                         # Analyze image (objective description)
-                        qwen_description = await analyze_image_with_qwen(base64_img)
+                        qwen_description = await analyze_image_with_qwen(base64_img, user_prompt=prompt)
+                        if not qwen_description or not qwen_description.strip():
+                            await message.channel.send("I couldn't see that image clearly, sorry! Try sending it again.")
+                            return
                         # For DMs, pass None as guild_id to use DM mood
                         guild_id = message.guild.id if message.guild else None
                         miku_reply = await rephrase_as_miku(
@@ -349,7 +352,10 @@ async def on_message(message):
                         logger.debug(f"📹 Extracted {len(frames)} frames from {attachment.filename}")
                         
                         # Analyze the video/GIF with appropriate media type
-                        video_description = await analyze_video_with_vision(frames, media_type=media_type)
+                        video_description = await analyze_video_with_vision(frames, media_type=media_type, user_prompt=prompt)
+                        if not video_description or not video_description.strip():
+                            await message.channel.send(f"I couldn't analyze that {media_type} clearly, sorry! Try sending it again.")
+                            return
                         # For DMs, pass None as guild_id to use DM mood
                         guild_id = message.guild.id if message.guild else None
                         miku_reply = await rephrase_as_miku(
@@ -432,7 +438,10 @@ async def on_message(message):
                         logger.info(f"📹 Extracted {len(frames)} frames from Tenor GIF")
                         
                         # Analyze the GIF with tenor_gif media type
-                        video_description = await analyze_video_with_vision(frames, media_type="tenor_gif")
+                        video_description = await analyze_video_with_vision(frames, media_type="tenor_gif", user_prompt=prompt)
+                        if not video_description or not video_description.strip():
+                            await message.channel.send("I couldn't analyze that GIF clearly, sorry! Try sending it again.")
+                            return
                         guild_id = message.guild.id if message.guild else None
                         miku_reply = await rephrase_as_miku(
                             video_description, 
@@ -490,7 +499,7 @@ async def on_message(message):
                                     if base64_img:
                                         logger.info(f"Image downloaded, analyzing with vision model...")
                                         # Analyze image
-                                        qwen_description = await analyze_image_with_qwen(base64_img)
+                                        qwen_description = await analyze_image_with_qwen(base64_img, user_prompt=prompt)
                                         truncated = (qwen_description[:50] + "...") if len(qwen_description) > 50 else qwen_description
                                         logger.error(f"Vision analysis result: {truncated}")
                                         if qwen_description and qwen_description.strip():
@@ -514,7 +523,7 @@ async def on_message(message):
                                         frames = await extract_video_frames(media_bytes, num_frames=6)
                                         if frames:
                                             logger.info(f"📹 Extracted {len(frames)} frames, analyzing with vision model...")
-                                            video_description = await analyze_video_with_vision(frames, media_type="video")
+                                            video_description = await analyze_video_with_vision(frames, media_type="video", user_prompt=prompt)
                                             logger.info(f"Video analysis result: {video_description[:100]}...")
                                             if video_description and video_description.strip():
                                                 embed_context_parts.append(f"[Embedded video shows: {video_description}]")
diff --git a/bot/utils/autonomous_v1_legacy.py b/bot/utils/autonomous_v1_legacy.py
index 49847d9..9a5c5e1 100644
--- a/bot/utils/autonomous_v1_legacy.py
+++ b/bot/utils/autonomous_v1_legacy.py
@@ -814,6 +814,9 @@ async def miku_autonomous_reaction_for_server(guild_id: int, force_message=None,
                 # Skip bot's own messages
                 if message.author == globals.client.user:
                     continue
+                # Skip messages that directly addressed Miku (handled by main handler)
+                if globals.client.user and globals.client.user in message.mentions:
+                    continue
                 # Skip messages we've already reacted to
                 if message.id in _reacted_message_ids:
                     continue
@@ -979,6 +982,11 @@ async def miku_autonomous_reaction_for_dm(user_id: int, force_message=None):
                 # Skip bot's own messages
                 if message.author == globals.client.user:
                     continue
+                # Skip messages with media attachments in DMs — these are always directed at
+                # Miku and already processed by the main on_message handler, so analyzing them
+                # again here would trigger a redundant vision model call
+                if message.attachments:
+                    continue
                 # Skip messages we've already reacted to
                 if message.id in _reacted_message_ids:
                     continue
diff --git a/bot/utils/cat_client.py b/bot/utils/cat_client.py
index feeb9a5..b0d297b 100644
--- a/bot/utils/cat_client.py
+++ b/bot/utils/cat_client.py
@@ -107,6 +107,7 @@ class CatAdapter:
         author_name: Optional[str] = None,
         mood: Optional[str] = None,
         response_type: str = "dm_response",
+        media_type: Optional[str] = None,
     ) -> Optional[tuple]:
         """
         Send a message through the Cat pipeline via WebSocket and get a response.
@@ -123,6 +124,7 @@ class CatAdapter:
             author_name: Display name of the user
             mood: Current mood name (passed as metadata for Cat hooks)
             response_type: Type of response context
+            media_type: Type of media attachment ("image", "video", "gif", "tenor_gif")
             
         Returns:
             Tuple of (response_text, full_prompt) on success, or None if Cat
@@ -156,6 +158,9 @@ class CatAdapter:
             payload["discord_response_type"] = response_type
         # Pass evil mode flag so discord_bridge stores it in working_memory
         payload["discord_evil_mode"] = globals.EVIL_MODE
+        # Pass media type so discord_bridge can add MEDIA NOTE to the prompt
+        if media_type:
+            payload["discord_media_type"] = media_type
 
         try:
             # Build WebSocket URL from HTTP base URL
diff --git a/bot/utils/image_handling.py b/bot/utils/image_handling.py
index b080d5f..cde61dc 100644
--- a/bot/utils/image_handling.py
+++ b/bot/utils/image_handling.py
@@ -17,6 +17,26 @@ logger = get_logger('vision')
 # No need for switch_model anymore - llama-swap handles this automatically
 
 
+def _extract_vision_question(prompt: str):
+    """
+    Strip Discord mentions and bot-name triggers from the user's message to
+    produce a clean question suitable for passing directly to the vision model.
+
+    Returns the cleaned question string, or None if nothing meaningful remains
+    (e.g. the message was just "@Miku", "miku," or "hey miku" with no question).
+    """
+    if not prompt:
+        return None
+    # Remove Discord user/role mentions: <@123456789>, <@!123456789>, <@&123456789>
+    text = re.sub(r'<@[!&]?\d+>', '', prompt).strip()
+    # Strip a leading bot-name invocation (case-insensitive), e.g. "miku,", "hey miku,",
+    # "Miku: "; the `|$` branch also catches a bare "miku" / "hey miku" with nothing after
+    text = re.sub(r'^(?:hey\s+)?miku(?:[,!:\s]+|$)', '', text, flags=re.IGNORECASE).strip()
+    # Drop any residual leading punctuation/whitespace
+    text = text.lstrip(',.!? ')
+    return text if text else None
+
+
 async def download_and_encode_image(url):
     """Download and encode an image to base64."""
     async with aiohttp.ClientSession() as session:
@@ -233,11 +253,15 @@ async def extract_video_frames(video_bytes, num_frames=4):
     return None
 
 
-async def analyze_image_with_vision(base64_img):
+async def analyze_image_with_vision(base64_img, user_prompt=None):
     """
     Analyze an image using llama.cpp multimodal capabilities.
     Uses OpenAI-compatible chat completions API with image_url.
     Always uses NVIDIA GPU for vision model.
+
+    If user_prompt is provided (and contains a meaningful question after stripping
+    mentions/triggers), that question is sent to the vision model instead of the
+    generic "Describe this image in detail." prompt.
     """
     from utils.llm import get_vision_gpu_url, check_vision_endpoint_health
     
@@ -247,6 +271,10 @@ async def analyze_image_with_vision(base64_img):
         logger.warning(f"Vision endpoint unhealthy: {error}")
         return f"Vision service currently unavailable: {error}"
     
+    question = _extract_vision_question(user_prompt)
+    vision_prompt_text = question if question else "Describe this image in detail."
+    logger.info(f"Vision prompt for image: {vision_prompt_text!r}")
+
     payload = {
         "model": globals.VISION_MODEL,
         "messages": [
@@ -255,7 +283,7 @@ async def analyze_image_with_vision(base64_img):
                 "content": [
                     {
                         "type": "text",
-                        "text": "Describe this image in detail."
+                        "text": vision_prompt_text
                     },
                     {
                         "type": "image_url",
@@ -267,7 +295,7 @@ async def analyze_image_with_vision(base64_img):
             }
         ],
         "stream": False,
-        "max_tokens": 300
+        "max_tokens": 800
     }
 
     headers = {"Content-Type": "application/json"}
@@ -289,13 +317,16 @@ async def analyze_image_with_vision(base64_img):
                     return f"Error analyzing image: {response.status}"
         except Exception as e:
             logger.error(f"Error in analyze_image_with_vision: {e}", exc_info=True)
+            return f"Error analyzing image: {str(e)}"
 
 
-async def analyze_video_with_vision(video_frames, media_type="video"):
+async def analyze_video_with_vision(video_frames, media_type="video", user_prompt=None):
     """
     Analyze a video or GIF by analyzing multiple frames.
     video_frames: list of base64-encoded frames
     media_type: "video", "gif", or "tenor_gif" to customize the analysis prompt
+    user_prompt: optional raw user message; the vision model will be asked to answer
+                 the specific question instead of giving a generic description.
     """
     from utils.llm import get_vision_gpu_url, check_vision_endpoint_health
     
@@ -305,8 +336,12 @@ async def analyze_video_with_vision(video_frames, media_type="video"):
         logger.warning(f"Vision endpoint unhealthy: {error}")
         return f"Vision service currently unavailable: {error}"
     
-    # Customize prompt based on media type
-    if media_type == "gif":
+    # Customize prompt based on media type, overridden by user question if present
+    question = _extract_vision_question(user_prompt)
+    if question:
+        prompt_text = question
+        logger.info(f"Vision prompt for {media_type}: {prompt_text!r}")
+    elif media_type == "gif":
         prompt_text = "Describe what's happening in this GIF animation. Analyze the sequence of frames and describe the action, motion, and any repeating patterns."
     elif media_type == "tenor_gif":
         prompt_text = "Describe what's happening in this animated GIF. Analyze the sequence of frames and describe the action, emotion, or reaction being shown."
@@ -339,7 +374,7 @@ async def analyze_video_with_vision(video_frames, media_type="video"):
             }
         ],
         "stream": False,
-        "max_tokens": 400
+        "max_tokens": 1000
     }
 
     headers = {"Content-Type": "application/json"}
@@ -368,6 +403,9 @@ async def rephrase_as_miku(vision_output, user_prompt, guild_id=None, user_id=No
     """
     Rephrase vision model's image analysis as Miku would respond to it.
     
+    Routes through Cheshire Cat pipeline for memory-augmented responses,
+    falling back to direct query_llama() if Cat is unavailable.
+    
     Args:
         vision_output: Description from vision model
         user_prompt: User's original message
@@ -402,14 +440,64 @@ async def rephrase_as_miku(vision_output, user_prompt, guild_id=None, user_id=No
     # Use the actual user_id for history tracking, fall back to "image_analysis" for backward compatibility
     history_user_id = user_id if user_id else "image_analysis"
     
-    return await query_llama(
-        formatted_prompt, 
-        user_id=history_user_id, 
-        guild_id=guild_id, 
-        response_type=response_type,
-        author_name=author_name,
-        media_type=media_type  # Pass media type to Miku's LLM
-    )
+    # Determine current mood for Cat pipeline
+    current_mood = globals.DM_MOOD
+    if guild_id:
+        try:
+            from server_manager import server_manager
+            sc = server_manager.get_server_config(guild_id)
+            if sc:
+                current_mood = sc.current_mood_name
+        except Exception:
+            pass
+
+    # Phase 3: Try Cheshire Cat pipeline first (memory-augmented response)
+    # This allows image interactions to be stored in episodic memory and
+    # benefit from declarative memory recall, just like text messages.
+    response = None
+    if globals.USE_CHESHIRE_CAT:
+        try:
+            from utils.cat_client import cat_adapter
+            cat_result = await cat_adapter.query(
+                text=formatted_prompt,
+                user_id=history_user_id,
+                guild_id=str(guild_id) if guild_id else None,
+                author_name=author_name,
+                mood=current_mood,
+                response_type=response_type,
+                media_type=media_type,
+            )
+            if cat_result:
+                response, cat_full_prompt = cat_result
+                effective_mood = current_mood
+                if globals.EVIL_MODE:
+                    effective_mood = f"EVIL:{getattr(globals, 'EVIL_DM_MOOD', 'evil_neutral')}"
+                logger.info(f"🐱 Cat {media_type} response for {author_name} (mood: {effective_mood})")
+                # Track Cat interaction for Web UI Last Prompt view
+                import datetime
+                globals.LAST_CAT_INTERACTION = {
+                    "full_prompt": cat_full_prompt,
+                    "response": response[:500] if response else "",
+                    "user": author_name or history_user_id,
+                    "mood": effective_mood,
+                    "timestamp": datetime.datetime.now().isoformat(),
+                }
+        except Exception as e:
+            logger.warning(f"🐱 Cat {media_type} pipeline error, falling back to query_llama: {e}")
+            response = None
+
+    # Fallback to direct LLM query if Cat didn't respond
+    if not response:
+        response = await query_llama(
+            formatted_prompt, 
+            user_id=history_user_id, 
+            guild_id=guild_id, 
+            response_type=response_type,
+            author_name=author_name,
+            media_type=media_type  # Pass media type to Miku's LLM
+        )
+    
+    return response
 
 # Backward compatibility aliases
 analyze_image_with_qwen = analyze_image_with_vision
diff --git a/cat-plugins/discord_bridge/discord_bridge.py b/cat-plugins/discord_bridge/discord_bridge.py
index 2a0c62c..d77ae8f 100644
--- a/cat-plugins/discord_bridge/discord_bridge.py
+++ b/cat-plugins/discord_bridge/discord_bridge.py
@@ -42,6 +42,7 @@ def before_cat_reads_message(user_message_json: dict, cat) -> dict:
     mood = user_message_json.get('discord_mood', None)
     response_type = user_message_json.get('discord_response_type', None)
     evil_mode = user_message_json.get('discord_evil_mode', False)
+    media_type = user_message_json.get('discord_media_type', None)
 
     # Also check working memory for backward compatibility
     if not guild_id:
@@ -53,6 +54,7 @@ def before_cat_reads_message(user_message_json: dict, cat) -> dict:
     cat.working_memory['mood'] = mood
     cat.working_memory['response_type'] = response_type
     cat.working_memory['evil_mode'] = evil_mode
+    cat.working_memory['media_type'] = media_type
     
     return user_message_json
 
@@ -263,6 +265,18 @@ CRITICAL RULES:
 Miku is currently feeling: {mood_description}
 Please respond in a way that reflects this emotional tone."""
 
+        # Add media type awareness if provided (image/video/gif analysis)
+        media_type = cat.working_memory.get('media_type', None)
+        if media_type:
+            media_descriptions = {
+                "image": "The user has sent you an image.",
+                "video": "The user has sent you a video clip.",
+                "gif": "The user has sent you an animated GIF.",
+                "tenor_gif": "The user has sent you an animated GIF (from Tenor - likely a reaction GIF or meme)."
+            }
+            media_note = media_descriptions.get(media_type, f"The user has sent you {media_type}.")
+            system_prefix += f"\n\n📎 MEDIA NOTE: {media_note}\nYour vision analysis of this {media_type} is included in the user's message with the [Looking at...] prefix."
+
     except Exception as e:
         print(f"   [Discord Bridge] Error building system prefix: {e}")
         system_prefix = cat.working_memory.get('full_system_prefix', '[system prefix not available]')