fix: resolve Cat personality startup race condition

Bot was calling restore_evil_cat_state() in on_ready() before Cheshire Cat finished booting (~25s), causing all plugin toggle API calls to fail silently. Evil Miku plugin was left disabled and the bot used Cat's default personality instead.

Changes:
- cat_client.py: add wait_for_ready() that polls Cat health endpoint every 5s for up to 120s before attempting any admin API calls
- evil_mode.py: rewrite restore_evil_cat_state() with:
  - wait_for_ready() gate before any plugin/model switching
  - 3-second extra delay after Cat is up (plugin registry fully loaded)
  - up to 3 retries on failure
  - post-switch verification that the correct plugins are actually active

Also fixes helcyon model references that leaked into the container image (cat_client.py was switching Cat's LLM to 'helcyon' which has no llama-swap handler; reverted to correct 'darkidol' / 'llama3.1').
2026-03-01 00:57:13 +02:00
parent f0b5d71097
commit a0a16e6784
2 changed files with 278 additions and 10 deletions
--- a/bot/utils/cat_client.py
+++ b/bot/utils/cat_client.py
@@ -146,10 +146,15 @@ class CatAdapter:
            payload["discord_guild_id"] = str(guild_id)
        if author_name:
            payload["discord_author_name"] = author_name
-        if mood:
+        # When evil mode is active, send the evil mood name instead of the normal mood
        if globals.EVIL_MODE:
            payload["discord_mood"] = getattr(globals, 'EVIL_DM_MOOD', 'evil_neutral')
        elif mood:
            payload["discord_mood"] = mood
        if response_type:
            payload["discord_response_type"] = response_type
        # Pass evil mode flag so discord_bridge stores it in working_memory
        payload["discord_evil_mode"] = globals.EVIL_MODE
        try:
            # Build WebSocket URL from HTTP base URL
@@ -634,6 +639,222 @@ class CatAdapter:
            logger.error(f"Consolidation error: {e}")
            return None
    # ====================================================================
    # Admin API helpers – plugin toggling & LLM model switching
    # ====================================================================
    async def wait_for_ready(self, max_wait: int = 120, interval: int = 5) -> bool:
        """Poll Cat's base URL until it answers HTTP 200 or ``max_wait`` elapses.

        Used on startup to avoid the race where the bot starts before Cat and
        its first admin API calls fail.

        Args:
            max_wait: Total time budget in seconds for the whole wait.
            interval: Pause in seconds between consecutive probes.

        Returns:
            True as soon as a probe gets HTTP 200 (``_healthy`` and
            ``_last_health_check`` are updated as a side effect); False if the
            budget runs out first. Probe failures never raise out of this
            method.
        """
        # A monotonic clock keeps the deadline immune to wall-clock adjustments
        # (NTP sync, manual changes) that can happen while a host is booting.
        started = time.monotonic()
        deadline = started + max_wait
        attempt = 0
        while time.monotonic() < deadline:
            attempt += 1
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(
                        f"{self._base_url}/",
                        timeout=aiohttp.ClientTimeout(total=5),
                    ) as resp:
                        if resp.status == 200:
                            elapsed = time.monotonic() - started
                            logger.info(f"🐱 Cat is ready (took {elapsed:.1f}s, {attempt} attempts)")
                            self._healthy = True
                            # Wall-clock on purpose: presumably compared against
                            # time.time() elsewhere in the class — TODO confirm.
                            self._last_health_check = time.time()
                            return True
                        logger.debug(f"Cat readiness probe {attempt} returned HTTP {resp.status}")
            except Exception as e:
                # Failures are expected while Cat is still booting, so this stays
                # best-effort; log at debug level so a persistent problem (bad
                # URL, DNS, TLS) can still be diagnosed instead of vanishing.
                logger.debug(f"Cat readiness probe {attempt} failed: {e!r}")
            if attempt == 1:
                logger.info(f"⏳ Waiting for Cat to become ready (up to {max_wait}s)...")
            # Never sleep past the deadline: the caller asked for at most max_wait.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            await asyncio.sleep(min(interval, remaining))
        logger.error(f"Cat did not become ready within {max_wait}s ({attempt} attempts)")
        return False
    async def toggle_plugin(self, plugin_id: str) -> bool:
        """Flip a Cat plugin's enabled state through the admin API.

        Sends ``PUT /plugins/toggle/{plugin_id}``. The endpoint toggles, so the
        call inverts whatever state the plugin is currently in.

        Args:
            plugin_id: ID of the plugin to toggle.

        Returns:
            True when Cat answers HTTP 200; False on any other status or on
            any exception (both are logged, nothing is raised).
        """
        endpoint = f"{self._base_url}/plugins/toggle/{plugin_id}"
        try:
            async with aiohttp.ClientSession() as http:
                async with http.put(
                    endpoint,
                    headers=self._get_headers(),
                    timeout=aiohttp.ClientTimeout(total=15),
                ) as response:
                    # Handle the failure case first so the success path reads flat.
                    if response.status != 200:
                        detail = await response.text()
                        logger.error(f"Cat plugin toggle failed ({response.status}): {detail}")
                        return False
                    logger.info(f"🐱 Toggled Cat plugin: {plugin_id}")
                    return True
        except Exception as err:
            logger.error(f"Cat plugin toggle error for {plugin_id}: {err}")
            return False
    async def set_llm_model(self, model_name: str) -> bool:
        """Point Cat's LLMOpenAIChatConfig setting at ``model_name``.

        Cat's settings API addresses settings by UUID, so this is a two-step
        operation: ``GET /settings/`` to find the ``setting_id`` of the
        ``LLMOpenAIChatConfig`` entry, then ``PUT /settings/{setting_id}`` with
        the new configuration. Loading the model itself is left to llama-swap,
        keyed on ``model_name``.

        Args:
            model_name: Model identifier written into the config.

        Returns:
            True when the PUT answers HTTP 200; False if either request
            returns another status, the setting cannot be found, or any
            exception occurs (all cases are logged, nothing is raised).
        """
        try:
            # Phase 1: resolve the UUID of the LLMOpenAIChatConfig setting.
            async with aiohttp.ClientSession() as http:
                async with http.get(
                    f"{self._base_url}/settings/",
                    headers=self._get_headers(),
                    timeout=aiohttp.ClientTimeout(total=10),
                ) as listing:
                    if listing.status != 200:
                        logger.error(f"Cat settings GET failed ({listing.status})")
                        return False
                    body = await listing.json()
                    # First entry with the matching name wins; None if absent.
                    setting_id = next(
                        (
                            entry["setting_id"]
                            for entry in body.get("settings", [])
                            if entry.get("name") == "LLMOpenAIChatConfig"
                        ),
                        None,
                    )

            if not setting_id:
                logger.error("Could not find LLMOpenAIChatConfig setting_id in Cat settings")
                return False

            # Phase 2: overwrite the setting with the new model name.
            llm_config = {
                "openai_api_key": "sk-dummy",
                "model_name": model_name,
                "temperature": 0.8,
                "streaming": False,
            }
            payload = {
                "name": "LLMOpenAIChatConfig",
                "value": llm_config,
                "category": "llm_factory",
            }
            async with aiohttp.ClientSession() as http:
                async with http.put(
                    f"{self._base_url}/settings/{setting_id}",
                    json=payload,
                    headers=self._get_headers(),
                    timeout=aiohttp.ClientTimeout(total=15),
                ) as response:
                    if response.status != 200:
                        detail = await response.text()
                        logger.error(f"Cat LLM model switch failed ({response.status}): {detail}")
                        return False
                    logger.info(f"🐱 Set Cat LLM model to: {model_name}")
                    return True
        except Exception as err:
            logger.error(f"Cat LLM model switch error: {err}")
            return False
    async def get_active_plugins(self) -> list:
        """Return the IDs of the plugins Cat reports as active.

        Calls ``GET /plugins``; the response is expected to look like
        ``{"installed": [...], "filters": {...}}`` where each installed entry
        carries ``"id"`` and ``"active"`` fields.

        Returns:
            List of ``"id"`` values for installed plugins whose ``"active"``
            field is truthy. A non-200 response or any exception is logged and
            also yields an empty list, so callers cannot distinguish "lookup
            failed" from "nothing is active".
        """
        endpoint = f"{self._base_url}/plugins"
        try:
            async with aiohttp.ClientSession() as http:
                async with http.get(
                    endpoint,
                    headers=self._get_headers(),
                    timeout=aiohttp.ClientTimeout(total=10),
                ) as response:
                    if response.status != 200:
                        logger.error(f"Cat get_active_plugins failed ({response.status})")
                        return []
                    listing = await response.json()
                    return [
                        plugin["id"]
                        for plugin in listing.get("installed", [])
                        if plugin.get("active")
                    ]
        except Exception as err:
            logger.error(f"Cat get_active_plugins error: {err}")
            return []
    async def switch_to_evil_personality(self) -> bool:
        """Put Cat into the Evil Miku configuration.

        Turns ``miku_personality`` off, turns ``evil_miku_personality`` on and
        sets the LLM model to ``darkidol``. Because the plugin endpoint is a
        toggle rather than an explicit enable/disable, the current plugin
        state is read first and a plugin is only toggled when it is not
        already in the wanted state.

        Returns:
            True if every step that ran succeeded, False if any of them
            failed. Later steps are still attempted after an earlier failure.
        """
        logger.info("🐱 Switching Cat to Evil Miku personality...")
        failures = 0

        # Snapshot plugin state once; both decisions below are based on it.
        enabled_ids = await self.get_active_plugins()
        normal_is_on = "miku_personality" in enabled_ids
        evil_is_on = "evil_miku_personality" in enabled_ids

        # 1) Normal personality must end up off.
        if not normal_is_on:
            logger.debug("miku_personality already disabled, skipping toggle")
        else:
            toggled = await self.toggle_plugin("miku_personality")
            if not toggled:
                logger.error("Failed to disable miku_personality plugin")
                failures += 1
            # Short pause between the two toggles, whether or not the first worked.
            await asyncio.sleep(1)

        # 2) Evil personality must end up on.
        if evil_is_on:
            logger.debug("evil_miku_personality already active, skipping toggle")
        else:
            toggled = await self.toggle_plugin("evil_miku_personality")
            if not toggled:
                logger.error("Failed to enable evil_miku_personality plugin")
                failures += 1

        # 3) Switch the LLM to darkidol (the uncensored evil model).
        model_switched = await self.set_llm_model("darkidol")
        if not model_switched:
            logger.error("Failed to switch Cat LLM to darkidol")
            failures += 1

        return failures == 0
    async def switch_to_normal_personality(self) -> bool:
        """Put Cat back into the normal Miku configuration.

        Turns ``evil_miku_personality`` off, turns ``miku_personality`` on and
        sets the LLM model to ``llama3.1``. The current plugin state is read
        first and a plugin is only toggled when it is not already in the
        wanted state, which avoids double-toggling.

        Returns:
            True if every step that ran succeeded, False if any of them
            failed. Later steps are still attempted after an earlier failure.
        """
        logger.info("🐱 Switching Cat to normal Miku personality...")
        failures = 0

        # Snapshot plugin state once; both decisions below are based on it.
        enabled_ids = await self.get_active_plugins()
        evil_is_on = "evil_miku_personality" in enabled_ids
        normal_is_on = "miku_personality" in enabled_ids

        # 1) Evil personality must end up off.
        if not evil_is_on:
            logger.debug("evil_miku_personality already disabled, skipping toggle")
        else:
            toggled = await self.toggle_plugin("evil_miku_personality")
            if not toggled:
                logger.error("Failed to disable evil_miku_personality plugin")
                failures += 1
            # Short pause between the two toggles, whether or not the first worked.
            await asyncio.sleep(1)

        # 2) Normal personality must end up on.
        if normal_is_on:
            logger.debug("miku_personality already active, skipping toggle")
        else:
            toggled = await self.toggle_plugin("miku_personality")
            if not toggled:
                logger.error("Failed to enable miku_personality plugin")
                failures += 1

        # 3) Switch the LLM back to llama3.1 (the normal model).
        model_switched = await self.set_llm_model("llama3.1")
        if not model_switched:
            logger.error("Failed to switch Cat LLM to llama3.1")
            failures += 1

        return failures == 0
 # Module-level singleton: one shared CatAdapter instance that other modules
 # import (e.g. `from utils.cat_client import cat_adapter`).
 cat_adapter = CatAdapter()
--- a/bot/utils/evil_mode.py
+++ b/bot/utils/evil_mode.py
@@ -109,21 +109,68 @@ async def restore_evil_cat_state():
    """Switch Cat to the correct personality plugin + LLM model based on evil mode state.
    Must be called after the event loop is running (e.g., in on_ready).
    Waits for Cat to become reachable, then retries plugin switching with
    verification to handle the common race condition where bot starts before Cat.
    """
    try:
        from utils.cat_client import cat_adapter
        if not globals.USE_CHESHIRE_CAT:
            return
-        if globals.EVIL_MODE:
+        # Wait for Cat to actually be reachable before attempting any API calls
-            logger.info("Restoring Cat evil personality state on startup...")
+        if not await cat_adapter.wait_for_ready(max_wait=120, interval=5):
-            await cat_adapter.switch_to_evil_personality()
+            logger.error("Cat never became ready — cannot restore personality state")
-        else:
+            return
-            # Ensure normal state is active (in case evil was toggled off while Cat was down)
+        
-            active = await cat_adapter.get_active_plugins()
+        # Small extra delay to let Cat fully initialize plugins after health endpoint is up
-            if "evil_miku_personality" in active:
+        await asyncio.sleep(3)
-                logger.info("Evil plugin still active after normal restore — switching to normal...")
+        
-                await cat_adapter.switch_to_normal_personality()
+        max_retries = 3
        retry_delay = 5
        for attempt in range(1, max_retries + 1):
            try:
                if globals.EVIL_MODE:
                    if attempt == 1:
                        logger.info("Restoring Cat evil personality state on startup...")
                    else:
                        logger.info(f"Retry {attempt}/{max_retries}: restoring Cat evil personality...")
                    await cat_adapter.switch_to_evil_personality()
                else:
                    active = await cat_adapter.get_active_plugins()
                    if "evil_miku_personality" in active:
                        logger.info("Evil plugin still active after normal restore — switching to normal...")
                        await cat_adapter.switch_to_normal_personality()
                    else:
                        # Normal mode, normal plugins — nothing to do
                        return
                # Verify the switch actually worked
                await asyncio.sleep(2)
                active = await cat_adapter.get_active_plugins()
                if globals.EVIL_MODE:
                    if "evil_miku_personality" in active and "miku_personality" not in active:
                        logger.info("✅ Cat evil personality verified active")
                        return
                    else:
                        logger.warning(f"Cat plugin verification failed (attempt {attempt}): "
                                       f"evil_active={'evil_miku_personality' in active}, "
                                       f"normal_active={'miku_personality' in active}")
                else:
                    if "miku_personality" in active and "evil_miku_personality" not in active:
                        logger.info("✅ Cat normal personality verified active")
                        return
                    else:
                        logger.warning(f"Cat plugin verification failed (attempt {attempt})")
            except Exception as e:
                logger.error(f"Cat personality restore attempt {attempt} error: {e}")
            if attempt < max_retries:
                await asyncio.sleep(retry_delay)
        logger.error(f"Failed to restore Cat personality after {max_retries} attempts")
    except Exception as e:
        logger.error(f"Failed to restore Cat personality state on startup: {e}")