fix: resolve Cat personality startup race condition

The bot called restore_evil_cat_state() in on_ready() before Cheshire Cat had finished booting (~25s), so every plugin-toggle API call failed silently. The Evil Miku plugin was left disabled and the bot used Cat's default personality instead.

Changes:
- cat_client.py: add wait_for_ready(), which polls Cat's base URL (GET /) every 5s for up to 120s before any admin API call is attempted
- evil_mode.py: rewrite restore_evil_cat_state() with:
  - a wait_for_ready() gate before any plugin/model switching
  - a 3-second extra delay after Cat is up (to let the plugin registry finish loading)
  - up to 3 attempts in total, with a 5s pause between attempts
  - post-switch verification that the correct plugins are actually active

Also fixes the helcyon model references that leaked into the container image (cat_client.py was switching Cat's LLM to 'helcyon', which has no llama-swap handler; reverted to the correct 'darkidol' / 'llama3.1').
2026-03-01 00:57:13 +02:00
parent f0b5d71097
commit a0a16e6784
2 changed files with 278 additions and 10 deletions
--- a/bot/utils/cat_client.py
+++ b/bot/utils/cat_client.py
@@ -146,10 +146,15 @@ class CatAdapter:
            payload["discord_guild_id"] = str(guild_id)
        if author_name:
            payload["discord_author_name"] = author_name
-        if mood:
+        # When evil mode is active, send the evil mood name instead of the normal mood
+        if globals.EVIL_MODE:
+            payload["discord_mood"] = getattr(globals, 'EVIL_DM_MOOD', 'evil_neutral')
+        elif mood:
            payload["discord_mood"] = mood
        if response_type:
            payload["discord_response_type"] = response_type
+        # Pass evil mode flag so discord_bridge stores it in working_memory
+        payload["discord_evil_mode"] = globals.EVIL_MODE

        try:
            # Build WebSocket URL from HTTP base URL
@@ -634,6 +639,222 @@ class CatAdapter:
            logger.error(f"Consolidation error: {e}")
            return None

+    # ====================================================================
+    # Admin API helpers – plugin toggling & LLM model switching
+    # ====================================================================
+
+    async def wait_for_ready(self, max_wait: int = 120, interval: int = 5) -> bool:
+        """Wait for Cat to become reachable, polling every `interval` seconds.
+        
+        Used on startup to avoid a race condition when the bot starts before Cat.
+        Returns True once Cat answers GET / with HTTP 200, False if `max_wait` seconds elapse first.
+        """
+        start = time.time()
+        attempt = 0
+        while time.time() - start < max_wait:
+            attempt += 1
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(
+                        f"{self._base_url}/",
+                        timeout=aiohttp.ClientTimeout(total=5),
+                    ) as resp:
+                        if resp.status == 200:
+                            elapsed = time.time() - start
+                            logger.info(f"🐱 Cat is ready (took {elapsed:.1f}s, {attempt} attempts)")
+                            self._healthy = True
+                            self._last_health_check = time.time()
+                            return True
+            except Exception:
+                pass
+            if attempt == 1:
+                logger.info(f"⏳ Waiting for Cat to become ready (up to {max_wait}s)...")
+            await asyncio.sleep(interval)
+        logger.error(f"Cat did not become ready within {max_wait}s ({attempt} attempts)")
+        return False
+
+    async def toggle_plugin(self, plugin_id: str) -> bool:
+        """Toggle a Cat plugin on/off via the admin API.
+        
+        PUT /plugins/toggle/{plugin_id}
+        Returns True on success, False on failure.
+        """
+        url = f"{self._base_url}/plugins/toggle/{plugin_id}"
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.put(
+                    url,
+                    headers=self._get_headers(),
+                    timeout=aiohttp.ClientTimeout(total=15),
+                ) as resp:
+                    if resp.status == 200:
+                        logger.info(f"🐱 Toggled Cat plugin: {plugin_id}")
+                        return True
+                    else:
+                        body = await resp.text()
+                        logger.error(f"Cat plugin toggle failed ({resp.status}): {body}")
+                        return False
+        except Exception as e:
+            logger.error(f"Cat plugin toggle error for {plugin_id}: {e}")
+            return False
+
+    async def set_llm_model(self, model_name: str) -> bool:
+        """Switch the Cheshire Cat's active LLM model via settings API.
+        
+        The Cat settings API uses UUIDs: we must first GET /settings/ to find
+        the setting_id for LLMOpenAIChatConfig, then PUT /settings/{setting_id}.
+        llama-swap handles the actual model loading based on model_name.
+        Returns True on success, False on failure.
+        """
+        try:
+            # Step 1: Find the setting_id for LLMOpenAIChatConfig
+            setting_id = None
+            async with aiohttp.ClientSession() as session:
+                async with session.get(
+                    f"{self._base_url}/settings/",
+                    headers=self._get_headers(),
+                    timeout=aiohttp.ClientTimeout(total=10),
+                ) as resp:
+                    if resp.status != 200:
+                        logger.error(f"Cat settings GET failed ({resp.status})")
+                        return False
+                    data = await resp.json()
+                    for s in data.get("settings", []):
+                        if s.get("name") == "LLMOpenAIChatConfig":
+                            setting_id = s["setting_id"]
+                            break
+
+            if not setting_id:
+                logger.error("Could not find LLMOpenAIChatConfig setting_id in Cat settings")
+                return False
+
+            # Step 2: PUT updated config to /settings/{setting_id}
+            payload = {
+                "name": "LLMOpenAIChatConfig",
+                "value": {
+                    "openai_api_key": "sk-dummy",
+                    "model_name": model_name,
+                    "temperature": 0.8,
+                    "streaming": False,
+                },
+                "category": "llm_factory",
+            }
+            async with aiohttp.ClientSession() as session:
+                async with session.put(
+                    f"{self._base_url}/settings/{setting_id}",
+                    json=payload,
+                    headers=self._get_headers(),
+                    timeout=aiohttp.ClientTimeout(total=15),
+                ) as resp:
+                    if resp.status == 200:
+                        logger.info(f"🐱 Set Cat LLM model to: {model_name}")
+                        return True
+                    else:
+                        body = await resp.text()
+                        logger.error(f"Cat LLM model switch failed ({resp.status}): {body}")
+                        return False
+        except Exception as e:
+            logger.error(f"Cat LLM model switch error: {e}")
+            return False
+
+    async def get_active_plugins(self) -> list:
+        """Get list of active Cat plugin IDs.
+        
+        GET /plugins  → returns {"installed": [...], "filters": {...}}
+        Each plugin has "id" and "active" fields.
+        """
+        url = f"{self._base_url}/plugins"
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(
+                    url,
+                    headers=self._get_headers(),
+                    timeout=aiohttp.ClientTimeout(total=10),
+                ) as resp:
+                    if resp.status == 200:
+                        data = await resp.json()
+                        installed = data.get("installed", [])
+                        return [p["id"] for p in installed if p.get("active")]
+                    else:
+                        logger.error(f"Cat get_active_plugins failed ({resp.status})")
+                        return []
+        except Exception as e:
+            logger.error(f"Cat get_active_plugins error: {e}")
+            return []
+
+    async def switch_to_evil_personality(self) -> bool:
+        """Disable miku_personality, enable evil_miku_personality, switch LLM to darkidol.
+        
+        Checks current plugin state first to avoid double-toggling
+        (the Cat API is a toggle, not enable/disable).
+        Returns True if all operations succeed, False if any fail.
+        """
+        logger.info("🐱 Switching Cat to Evil Miku personality...")
+        success = True
+
+        # Check current plugin state
+        active = await self.get_active_plugins()
+
+        # Step 1: Disable normal personality (only if currently active)
+        if "miku_personality" in active:
+            if not await self.toggle_plugin("miku_personality"):
+                logger.error("Failed to disable miku_personality plugin")
+                success = False
+            await asyncio.sleep(1)
+        else:
+            logger.debug("miku_personality already disabled, skipping toggle")
+
+        # Step 2: Enable evil personality (only if currently inactive)
+        if "evil_miku_personality" not in active:
+            if not await self.toggle_plugin("evil_miku_personality"):
+                logger.error("Failed to enable evil_miku_personality plugin")
+                success = False
+        else:
+            logger.debug("evil_miku_personality already active, skipping toggle")
+
+        # Step 3: Switch LLM model to darkidol (the uncensored evil model)
+        if not await self.set_llm_model("darkidol"):
+            logger.error("Failed to switch Cat LLM to darkidol")
+            success = False
+
+        return success
+
+    async def switch_to_normal_personality(self) -> bool:
+        """Disable evil_miku_personality, enable miku_personality, switch LLM to llama3.1.
+        
+        Checks current plugin state first to avoid double-toggling.
+        Returns True if all operations succeed, False if any fail.
+        """
+        logger.info("🐱 Switching Cat to normal Miku personality...")
+        success = True
+
+        # Check current plugin state
+        active = await self.get_active_plugins()
+
+        # Step 1: Disable evil personality (only if currently active)
+        if "evil_miku_personality" in active:
+            if not await self.toggle_plugin("evil_miku_personality"):
+                logger.error("Failed to disable evil_miku_personality plugin")
+                success = False
+            await asyncio.sleep(1)
+        else:
+            logger.debug("evil_miku_personality already disabled, skipping toggle")
+
+        # Step 2: Enable normal personality (only if currently inactive)
+        if "miku_personality" not in active:
+            if not await self.toggle_plugin("miku_personality"):
+                logger.error("Failed to enable miku_personality plugin")
+                success = False
+        else:
+            logger.debug("miku_personality already active, skipping toggle")
+
+        # Step 3: Switch LLM model back to llama3.1 (normal model)
+        if not await self.set_llm_model("llama3.1"):
+            logger.error("Failed to switch Cat LLM to llama3.1")
+            success = False
+
+        return success
+

 # Singleton instance
 cat_adapter = CatAdapter()
--- a/bot/utils/evil_mode.py
+++ b/bot/utils/evil_mode.py
@@ -109,21 +109,68 @@ async def restore_evil_cat_state():
    """Switch Cat to the correct personality plugin + LLM model based on evil mode state.
    
    Must be called after the event loop is running (e.g., in on_ready).
+    Waits for Cat to become reachable, then retries plugin switching with
+    verification to handle the common race condition where bot starts before Cat.
    """
    try:
        from utils.cat_client import cat_adapter
        if not globals.USE_CHESHIRE_CAT:
            return
        
+        # Wait for Cat to actually be reachable before attempting any API calls
+        if not await cat_adapter.wait_for_ready(max_wait=120, interval=5):
+            logger.error("Cat never became ready — cannot restore personality state")
+            return
+        
+        # Small extra delay to let Cat fully initialize plugins after health endpoint is up
+        await asyncio.sleep(3)
+        
+        max_retries = 3
+        retry_delay = 5
+        
+        for attempt in range(1, max_retries + 1):
+            try:
                if globals.EVIL_MODE:
+                    if attempt == 1:
                        logger.info("Restoring Cat evil personality state on startup...")
+                    else:
+                        logger.info(f"Retry {attempt}/{max_retries}: restoring Cat evil personality...")
                    await cat_adapter.switch_to_evil_personality()
                else:
-            # Ensure normal state is active (in case evil was toggled off while Cat was down)
                    active = await cat_adapter.get_active_plugins()
                    if "evil_miku_personality" in active:
                        logger.info("Evil plugin still active after normal restore — switching to normal...")
                        await cat_adapter.switch_to_normal_personality()
+                    else:
+                        # Normal mode, normal plugins — nothing to do
+                        return
+                
+                # Verify the switch actually worked
+                await asyncio.sleep(2)
+                active = await cat_adapter.get_active_plugins()
+                
+                if globals.EVIL_MODE:
+                    if "evil_miku_personality" in active and "miku_personality" not in active:
+                        logger.info("✅ Cat evil personality verified active")
+                        return
+                    else:
+                        logger.warning(f"Cat plugin verification failed (attempt {attempt}): "
+                                       f"evil_active={'evil_miku_personality' in active}, "
+                                       f"normal_active={'miku_personality' in active}")
+                else:
+                    if "miku_personality" in active and "evil_miku_personality" not in active:
+                        logger.info("✅ Cat normal personality verified active")
+                        return
+                    else:
+                        logger.warning(f"Cat plugin verification failed (attempt {attempt})")
+                
+            except Exception as e:
+                logger.error(f"Cat personality restore attempt {attempt} error: {e}")
+            
+            if attempt < max_retries:
+                await asyncio.sleep(retry_delay)
+        
+        logger.error(f"Failed to restore Cat personality after {max_retries} attempts")
    except Exception as e:
        logger.error(f"Failed to restore Cat personality state on startup: {e}")