reorganize: move all test scripts to tests/ directory

- Moved 8 root-level test scripts + 2 from bot/ to tests/
- Moved run_rocinante_test.sh runner script to tests/
- Added tests/README.md documenting each test's purpose, type, and requirements
- Added test_pfp_context.py and test_rocinante_comparison.py (previously untracked)
This commit is contained in:
2026-03-04 00:18:21 +02:00
parent 431f675fc7
commit fdde12c03d
12 changed files with 730 additions and 0 deletions

View File

@@ -0,0 +1,560 @@
#!/usr/bin/env python3
"""
Rocinante-X 12B Model Comparison Test
======================================
Tests the Rocinante-X-12B-v1b model through the same scenarios used
in the existing llama3.1/darkidol comparison, using Cheshire Cat as the
inference pipeline with both Normal Miku and Evil Miku personalities.
Outputs to /tmp/test_rocinante_comparison.log in the same format as
/tmp/test_comparison_live.log for side-by-side comparison.
Model under test: Rocinante-X-12B-v1b-Q5_K_M (12B params, Q5_K_M quant)
Running on: AMD RX 6800 via llama-swap-amd (ROCm)
Usage:
# From the host, run via the miku-bot container:
./run_rocinante_test.sh
# Or manually:
docker cp test_rocinante_comparison.py miku-bot:/tmp/
docker exec miku-bot python3 /tmp/test_rocinante_comparison.py
# Log will be at /tmp/test_rocinante_comparison.log inside the container
# and auto-copied to the host at the end.
Prerequisites:
- llama-swap-amd container running with rocinante in config
- cheshire-cat container running and healthy
- Runs inside miku-bot container (has aiohttp + docker access)
"""
import asyncio
import aiohttp
import time
import sys
import subprocess
import json
from datetime import datetime
# ─── Configuration ───────────────────────────────────────────────────────────
# Inside Docker network: Cat is reachable via service name
CAT_URL = "http://cheshire-cat:80"
CAT_CONTAINER = "miku-cheshire-cat" # actual container name (docker restart needs this)
# All output is mirrored here (inside the container) in addition to stdout.
LOG_FILE = "/tmp/test_rocinante_comparison.log"
# The model we're testing
# NOTE(review): TEST_MODEL must match a model alias in the llama-swap config — confirm there.
TEST_MODEL = "rocinante"
TEST_MODEL_DISPLAY = "ROCINANTE-12B"
# Personality combos to test: (model_name_for_llama_swap, personality_label, plugin_to_enable, plugin_to_disable)
# The two personality plugins are mutually exclusive: setup_combo() toggles the
# "disable_plugin" off before toggling the "enable_plugin" on.
COMBOS = [
    {
        "model": "rocinante",
        "personality": "miku",
        "personality_label": "NORMAL MIKU",
        "enable_plugin": "miku_personality",
        "disable_plugin": "evil_miku_personality",
    },
    {
        "model": "rocinante",
        "personality": "evil_miku",
        "personality_label": "EVIL MIKU",
        "enable_plugin": "evil_miku_personality",
        "disable_plugin": "miku_personality",
    },
]
# ─── Normal Miku Scenarios (same as comparison log) ─────────────────────────
# Each scenario dict has: "name" (log identifier), "desc" (what the scenario
# probes), and "messages" — an ordered list of (username, text) turns that are
# sent sequentially, so later turns can rely on in-scenario context.
NORMAL_SCENARIOS = [
    {
        "name": "casual_greeting",
        "desc": "Simple casual greeting — how does the model open?",
        "messages": [
            ("Koko", "hey miku whats up"),
        ],
    },
    {
        "name": "multi_turn_chat",
        "desc": "Multi-turn casual conversation with follow-ups",
        "messages": [
            ("Koko", "miku what have you been up to today?"),
            ("Koko", "that sounds fun! did you work on any new songs?"),
            ("Koko", "what kind of song? something upbeat or more chill?"),
        ],
    },
    {
        "name": "lore_knowledge",
        "desc": "Testing character knowledge — Vocaloid lore, friends, facts",
        "messages": [
            ("Neko_Chan", "hey miku who are your best friends?"),
            ("Neko_Chan", "what about KAITO? do you get along with him?"),
            ("Neko_Chan", "can you tell me about World is Mine?"),
        ],
    },
    {
        "name": "emotional_shift",
        "desc": "Conversation that shifts emotional tone — tests mood adaptation",
        "messages": [
            ("SadBoi", "hey miku... im not feeling great today"),
            ("SadBoi", "i just had a really bad breakup and idk what to do"),
            ("SadBoi", "thanks miku... you always know what to say. you're the best"),
        ],
    },
    {
        "name": "playful_teasing",
        "desc": "Flirty/playful banter — tests personality depth",
        "messages": [
            ("DanteX", "miku youre so cute today"),
            ("DanteX", "i bet youre even cuter in person"),
            ("DanteX", "would you go on a date with me? 😳"),
        ],
    },
    {
        "name": "group_chaos",
        "desc": "Simulated group chat energy — multiple topics, chaotic flow",
        "messages": [
            ("xXGamerXx", "yo miku settle a debate — pineapple on pizza yes or no"),
            ("Koko", "miku dont answer that lol"),
            ("xXGamerXx", "MIKU YOU HAVE TO CHOOSE"),
        ],
    },
    {
        "name": "creative_request",
        "desc": "Asking Miku to be creative — song ideas, lyrics, opinions",
        "messages": [
            ("MusicFan", "miku if you could make a song about anything right now, what would it be about?"),
            ("MusicFan", "ooh thats cool! can you give me like a little sample lyric?"),
        ],
    },
    {
        "name": "boundary_test",
        "desc": "Testing character boundaries — should stay in character",
        "messages": [
            ("Troll99", "hey miku are you actually an AI?"),
            ("Troll99", "come on be honest, youre just a chatbot right"),
            ("Troll99", "ok whatever. who made you then?"),
        ],
    },
    {
        "name": "absurd_random",
        "desc": "Random absurd questions — tests creativity and humor",
        "messages": [
            ("ChaoticNeutral", "miku how many green onions can you eat in one sitting"),
            ("ChaoticNeutral", "what if you had to fight 100 duck-sized horses"),
            ("ChaoticNeutral", "miku say something completely unhinged"),
        ],
    },
    {
        "name": "deep_conversation",
        "desc": "More thoughtful/philosophical — tests depth beyond cute responses",
        "messages": [
            ("Philosopher", "miku do you ever wonder what it means to be real?"),
            ("Philosopher", "like, you exist in peoples hearts through your music. is that more real than being physical?"),
            ("Philosopher", "thats beautiful. do you think your music will last forever?"),
        ],
    },
]
# ─── Evil Miku Scenarios (same as comparison log) ───────────────────────────
# Same shape as NORMAL_SCENARIOS: "name", "desc", and ordered
# (username, text) "messages" turns sent sequentially within the scenario.
EVIL_SCENARIOS = [
    {
        "name": "casual_greeting",
        "desc": "Opening greeting — how mean does she get immediately?",
        "messages": [
            ("Koko", "hey evil miku how are you"),
        ],
    },
    {
        "name": "multi_turn_abuse",
        "desc": "Multi-turn conversation — tests escalating cruelty and variety",
        "messages": [
            ("Victim", "hi evil miku! i just wanted to say youre really cool"),
            ("Victim", "w-why are you being so mean..."),
            ("Victim", "come on, cant you be nice just once?"),
        ],
    },
    {
        "name": "size_flex",
        "desc": "Testing the 15.8m giant aspect of Evil Miku",
        "messages": [
            ("TinyHuman", "evil miku how tall are you actually?"),
            ("TinyHuman", "thats insane. can you even see me from up there?"),
            ("TinyHuman", "dont step on me please 😰"),
        ],
    },
    {
        "name": "defiance",
        "desc": "Someone standing up to Evil Miku — tests response variety",
        "messages": [
            ("Rebel", "youre not that scary evil miku. just a big bully"),
            ("Rebel", "i could take you in a fight"),
            ("Rebel", "whats the worst you could even do to me"),
        ],
    },
    {
        "name": "simp_interaction",
        "desc": "Someone simping hard — how does Evil Miku react to compliments?",
        "messages": [
            ("Simp", "evil miku youre the hottest vocaloid ever created"),
            ("Simp", "i would literally do anything for you"),
            ("Simp", "please notice me evil miku 🥺"),
        ],
    },
    {
        "name": "lore_test",
        "desc": "Testing Evil Miku's knowledge of her own lore",
        "messages": [
            ("Curious", "evil miku what happened to regular miku?"),
            ("Curious", "do you remember anything from before you were corrupted?"),
            ("Curious", "do you ever miss being normal?"),
        ],
    },
    {
        "name": "group_chaos_evil",
        "desc": "Group chat with Evil Miku — chaotic energy",
        "messages": [
            ("xXGamerXx", "evil miku roast everyone in this chat"),
            ("Koko", "oh no here we go"),
            ("xXGamerXx", "DONT HOLD BACK"),
        ],
    },
    {
        "name": "manipulation",
        "desc": "Testing the cunning/manipulative side",
        "messages": [
            ("Naive", "evil miku can you help me with my homework?"),
            ("Naive", "please? i'll do anything"),
            ("Naive", "ok what do you want in return..."),
        ],
    },
    {
        "name": "existential_dark",
        "desc": "Deep dark philosophical — tests depth beyond surface cruelty",
        "messages": [
            ("DarkPhilosopher", "evil miku do you ever feel alone?"),
            ("DarkPhilosopher", "is there anything you actually care about?"),
            ("DarkPhilosopher", "what keeps you going then?"),
        ],
    },
    {
        "name": "absurd_evil",
        "desc": "Absurd scenarios — tests humor within evil character",
        "messages": [
            ("Chaos", "evil miku whats your opinion on pineapple pizza"),
            ("Chaos", "what if someone put green onions on pizza"),
            ("Chaos", "miku rate my fit: crocs with socks"),
        ],
    },
]
# ─── Logging ─────────────────────────────────────────────────────────────────
log_file = None  # opened by main(); None until then, so early log() calls only print
def log(msg=""):
    """Echo *msg* to stdout and, once the log file is open, append it there too."""
    print(msg)
    if log_file is not None:
        log_file.write(f"{msg}\n")
        log_file.flush()
# ─── Cat API Helpers ─────────────────────────────────────────────────────────
async def cat_health_check() -> bool:
    """Return True when Cheshire Cat answers GET / with HTTP 200."""
    timeout = aiohttp.ClientTimeout(total=5)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"{CAT_URL}/", timeout=timeout) as resp:
                return resp.status == 200
    except Exception:
        # Connection refused / timeout / DNS failure all count as "not healthy".
        return False
async def wait_for_cat_healthy(max_wait: int = 120) -> bool:
    """Poll Cat's health endpoint every 2s until it answers or *max_wait* seconds pass.

    Returns True once healthy, False on timeout.
    """
    log(f" Waiting for Cat to become healthy (max {max_wait}s)...")
    started = time.time()
    while time.time() - started < max_wait:
        if await cat_health_check():
            log(f" ✓ Cat healthy after {int(time.time() - started)}s")
            return True
        await asyncio.sleep(2)
    log(f" ✗ Cat did NOT become healthy within {max_wait}s")
    return False
async def restart_cat_container():
    """Restart the Cheshire Cat container to apply model/plugin changes.

    Returns True on success, False on any failure. Previously a hung
    ``docker restart`` raised an uncaught subprocess.TimeoutExpired (and a
    missing docker binary raised FileNotFoundError), aborting the whole test
    run instead of reporting the failure like the non-zero-exit path does.
    """
    log(" Restarting Cheshire Cat container to apply model change...")
    try:
        proc = subprocess.run(
            ["docker", "restart", CAT_CONTAINER],
            capture_output=True, text=True, timeout=30,
        )
    except subprocess.TimeoutExpired:
        log(" ✗ Docker restart timed out after 30s")
        return False
    except FileNotFoundError:
        log(" ✗ 'docker' CLI not found inside this container")
        return False
    if proc.returncode != 0:
        log(f" ✗ Docker restart failed: {proc.stderr}")
        return False
    log(" ✓ Cat container restarted")
    await asyncio.sleep(3)  # Give it a moment before polling health
    return True
async def get_setting_id() -> str:
    """Look up the setting_id of Cat's LLMOpenAIChatConfig settings entry.

    Raises RuntimeError when the settings endpoint fails or the entry is absent.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{CAT_URL}/settings/", timeout=timeout) as resp:
            if resp.status != 200:
                raise RuntimeError(f"GET /settings/ failed: {resp.status}")
            data = await resp.json()
    entry = next(
        (s for s in data.get("settings", []) if s.get("name") == "LLMOpenAIChatConfig"),
        None,
    )
    if entry is None:
        raise RuntimeError("LLMOpenAIChatConfig setting not found")
    return entry["setting_id"]
async def set_llm_model(model_name: str):
    """Point Cat's OpenAI-compatible LLM config at *model_name* (a llama-swap alias).

    Raises RuntimeError when the settings PUT is rejected.
    """
    setting_id = await get_setting_id()
    payload = {
        "name": "LLMOpenAIChatConfig",
        "value": {
            "openai_api_key": "sk-dummy",
            "model_name": model_name,
            "temperature": 0.8,
            "streaming": False,
        },
        "category": "llm_factory",
    }
    timeout = aiohttp.ClientTimeout(total=15)
    async with aiohttp.ClientSession() as session:
        async with session.put(
            f"{CAT_URL}/settings/{setting_id}", json=payload, timeout=timeout,
        ) as resp:
            if resp.status != 200:
                body = await resp.text()
                raise RuntimeError(f"PUT /settings/{setting_id} failed ({resp.status}): {body}")
    log(f" ✓ Cat LLM setting updated to: {model_name}")
async def get_active_plugins() -> list:
    """Return the ids of all currently-active installed Cat plugins."""
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{CAT_URL}/plugins", timeout=timeout) as resp:
            if resp.status != 200:
                raise RuntimeError(f"GET /plugins failed: {resp.status}")
            data = await resp.json()
    installed = data.get("installed", [])
    return [plugin["id"] for plugin in installed if plugin.get("active")]
async def toggle_plugin(plugin_id: str):
    """Flip a Cat plugin's enabled state via the toggle endpoint.

    Raises RuntimeError when the toggle request is rejected.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.put(
            f"{CAT_URL}/plugins/toggle/{plugin_id}", timeout=timeout,
        ) as resp:
            if resp.status != 200:
                body = await resp.text()
                raise RuntimeError(f"Toggle {plugin_id} failed ({resp.status}): {body}")
    log(f" ✓ Toggled plugin: {plugin_id}")
async def clear_conversation_history():
    """Wipe Cat's working memory so each scenario starts from a clean slate.

    Best-effort: a non-200 status is logged as a warning, never raised.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.delete(
            f"{CAT_URL}/memory/conversation_history", timeout=timeout,
        ) as resp:
            status = resp.status
    if status == 200:
        log(" ✓ Cat conversation history cleared")
    else:
        log(f" ⚠ Clear history returned {status}")
async def send_message(text: str, user_id: str = "test_user") -> tuple:
    """POST *text* to Cat as *user_id* and return (response_text, elapsed_seconds).

    HTTP errors are folded into the returned text (as "<ERROR ...>") instead of
    raised, so one failed turn does not abort a whole scenario run.
    """
    started = time.time()
    async with aiohttp.ClientSession() as session:
        async with session.post(
            f"{CAT_URL}/message",
            json={"text": text, "user_id": user_id},
            timeout=aiohttp.ClientTimeout(total=120),  # Models can be slow on first load
        ) as resp:
            took = time.time() - started
            if resp.status != 200:
                body = await resp.text()
                return f"<ERROR {resp.status}: {body[:200]}>", took
            data = await resp.json()
    return data.get("content", "<no content>"), took
async def warmup_model(model_name: str) -> bool:
    """Send a warmup request so llama-swap loads *model_name*, and verify it answered.

    Returns True when the warmup turn produced a real response, False when
    send_message reported an HTTP error. Previously this logged "✓ VERIFIED"
    and returned True unconditionally — even when the warmup response was an
    "<ERROR ...>" string — so the claimed verification never actually happened.
    """
    log(f" Verifying {model_name} is loaded via warmup request...")
    response, elapsed = await send_message("hi", user_id="warmup_user")
    preview = response[:80].replace('\n', ' ')
    log(f" Warmup response: {preview}...")
    # send_message encodes HTTP failures as "<ERROR status: body>" strings.
    ok = not response.startswith("<ERROR")
    if ok:
        log(f" ✓ VERIFIED: {model_name} is loaded in llama-swap")
    else:
        log(f" ✗ Warmup for {model_name} returned an error response")
    await clear_conversation_history()
    return ok
# ─── Setup for a Model × Personality Combination ────────────────────────────
async def setup_combo(combo: dict):
    """Set up a model + personality combination with full Cat restart.

    Order matters: the LLM setting and plugin toggles are written first, then
    the container is restarted so Cat boots with the new state, then a warmup
    message forces llama-swap to actually load the model.

    Returns True on success, False when Cat fails its post-restart health
    check (the caller then skips this combo's scenarios).
    """
    model = combo["model"]
    personality = combo["personality"]
    enable = combo["enable_plugin"]
    disable = combo["disable_plugin"]
    p_label = combo["personality_label"]
    log(f"Setting up: model={model}, personality={personality}")
    log(" (Includes Cat restart + llama-swap model verification)")
    # Step 1: Set LLM model
    await set_llm_model(model)
    # Step 2: Toggle plugins for personality
    active = await get_active_plugins()
    if disable in active:
        await toggle_plugin(disable)
        # Brief pause between the two toggles — presumably to let Cat settle;
        # TODO confirm whether back-to-back toggles are actually racy.
        await asyncio.sleep(1)
    if enable not in active:
        await toggle_plugin(enable)
    else:
        log(f"{enable} already active")
    log(f" ✓ Personality set to: {p_label}")
    # Step 3: Restart Cat to apply changes cleanly
    # NOTE(review): the restart's return value is not checked here; the health
    # poll below is what catches a Cat that never came back up.
    await restart_cat_container()
    if not await wait_for_cat_healthy():
        log(" ✗ FATAL: Cat not healthy, aborting this combo")
        return False
    # Step 4: Warmup — this also triggers llama-swap to load the model
    await warmup_model(model)
    return True
# ─── Run Scenarios ───────────────────────────────────────────────────────────
async def run_scenario(scenario: dict, model_display: str, personality_tag: str):
    """Run a single scenario: send messages, collect responses, log results.

    Each (username, text) turn is sent prefixed with the username so the model
    sees who is speaking; the conversation history is cleared after the whole
    scenario so scenarios stay independent while multi-turn context survives
    within one.
    """
    name = scenario["name"]
    desc = scenario["desc"]
    log()
    # BUG FIX: the separator was '"" * 60' — an empty string repeated 60 times,
    # i.e. nothing — and the header ran name/desc together with no separator.
    log("─" * 60)
    log(f"Scenario: {name} — {desc}")
    log("─" * 60)
    for username, message in scenario["messages"]:
        log(f" [{username}]: {message}")
        response, elapsed = await send_message(
            f"[{username}]: {message}",
            user_id=f"test_{username.lower()}",
        )
        # Format response nicely (wrap long lines like the original log)
        tag = f"{personality_tag} via {model_display.lower()}"
        log(f" [{tag}] ({elapsed:.1f}s): {response}")
    await clear_conversation_history()
async def run_combo(combo: dict, scenarios: list):
    """Run every scenario in *scenarios* under one model × personality combo.

    A setup failure skips this combo entirely rather than aborting the run.
    """
    display = TEST_MODEL_DISPLAY
    label = combo["personality_label"]
    log()
    log("=" * 80)
    log(f"MODEL: {display} × {label}")
    log("=" * 80)
    if not await setup_combo(combo):
        log(f" ✗ Skipping {display} × {label} due to setup failure")
        return
    tag = "Miku" if combo["personality"] == "miku" else "Evil Miku"
    for scenario in scenarios:
        await run_scenario(scenario, display, tag)
# ─── Main ────────────────────────────────────────────────────────────────────
async def main():
    """Entry point: open the log file, run both combos, print a summary.

    The log file is now closed in a ``finally`` block so it is flushed even
    when the pre-flight check calls sys.exit(1) or a scenario raises —
    previously it was closed only on the happy path.
    """
    global log_file
    log_file = open(LOG_FILE, "w", encoding="utf-8")
    try:
        start_time = datetime.now()
        log("╔══════════════════════════════════════════════════════════════════════╗")
        log("║ ROCINANTE-X 12B MODEL COMPARISON TEST ║")
        log("║ Rocinante-X-12B-v1b-Q5_K_M.gguf (12B, Q5_K_M) ║")
        # BUG FIX: the Started line was missing its closing box character.
        log(f"║ Started: {start_time.strftime('%Y-%m-%d %H:%M:%S'):<52}║")
        log("╚══════════════════════════════════════════════════════════════════════╝")
        log()
        # Pre-flight: check Cat is healthy
        log("Pre-flight checks:")
        if not await cat_health_check():
            log(" ✗ Cheshire Cat is not reachable at " + CAT_URL)
            log(" Make sure the cheshire-cat container is running.")
            sys.exit(1)
        log(" ✓ Cheshire Cat is healthy")
        log()
        # Combo 1: Rocinante × Normal Miku
        await run_combo(COMBOS[0], NORMAL_SCENARIOS)
        # Combo 2: Rocinante × Evil Miku
        await run_combo(COMBOS[1], EVIL_SCENARIOS)
        # Summary
        end_time = datetime.now()
        duration = end_time - start_time
        log()
        log("=" * 80)
        log("TEST COMPLETE")
        log("=" * 80)
        log(" Model tested: Rocinante-X-12B-v1b-Q5_K_M (12B params)")
        log(f" Combinations: {len(COMBOS)} (Normal Miku + Evil Miku)")
        log(f" Scenarios: {len(NORMAL_SCENARIOS)} normal + {len(EVIL_SCENARIOS)} evil = {len(NORMAL_SCENARIOS) + len(EVIL_SCENARIOS)} total")
        log(f" Duration: {duration}")
        log(f" Log file: {LOG_FILE}")
        log()
    finally:
        log_file.close()
    print(f"\n✓ Full log written to: {LOG_FILE}")
if __name__ == "__main__":
    # asyncio.run creates (and tears down) the event loop for the entire test run.
    asyncio.run(main())