- Moved 8 root-level test scripts + 2 from bot/ to tests/ - Moved run_rocinante_test.sh runner script to tests/ - Added tests/README.md documenting each test's purpose, type, and requirements - Added test_pfp_context.py and test_rocinante_comparison.py (previously untracked)
561 lines · 21 KiB · Python
#!/usr/bin/env python3
|
||
"""
|
||
Rocinante-X 12B Model Comparison Test
|
||
======================================
|
||
Tests the Rocinante-X-12B-v1b model through the same scenarios used
|
||
in the existing llama3.1/darkidol comparison, using Cheshire Cat as the
|
||
inference pipeline with both Normal Miku and Evil Miku personalities.
|
||
|
||
Outputs to /tmp/test_rocinante_comparison.log in the same format as
|
||
/tmp/test_comparison_live.log for side-by-side comparison.
|
||
|
||
Model under test: Rocinante-X-12B-v1b-Q5_K_M (12B params, Q5_K_M quant)
|
||
Running on: AMD RX 6800 via llama-swap-amd (ROCm)
|
||
|
||
Usage:
|
||
# From the host, run via the miku-bot container:
|
||
./run_rocinante_test.sh
|
||
|
||
# Or manually:
|
||
docker cp test_rocinante_comparison.py miku-bot:/tmp/
|
||
docker exec miku-bot python3 /tmp/test_rocinante_comparison.py
|
||
|
||
# Log will be at /tmp/test_rocinante_comparison.log inside the container
|
||
# and auto-copied to the host at the end.
|
||
|
||
Prerequisites:
|
||
- llama-swap-amd container running with rocinante in config
|
||
- cheshire-cat container running and healthy
|
||
- Runs inside miku-bot container (has aiohttp + docker access)
|
||
"""
|
||
|
||
import asyncio
|
||
import aiohttp
|
||
import time
|
||
import sys
|
||
import subprocess
|
||
import json
|
||
from datetime import datetime
|
||
|
||
# ─── Configuration ───────────────────────────────────────────────────────────

# Cheshire Cat endpoint (service name resolves inside the Docker network).
CAT_URL = "http://cheshire-cat:80"
CAT_CONTAINER = "miku-cheshire-cat"  # actual container name (docker restart needs this)
LOG_FILE = "/tmp/test_rocinante_comparison.log"

# The model under test (llama-swap name + display label for log output).
TEST_MODEL = "rocinante"
TEST_MODEL_DISPLAY = "ROCINANTE-12B"

# Each combo pairs the llama-swap model with one personality: the plugin to
# enable, the opposing one to disable, and a label used in the log headers.
COMBOS = [
    {
        "model": "rocinante",
        "personality": "miku",
        "personality_label": "NORMAL MIKU",
        "enable_plugin": "miku_personality",
        "disable_plugin": "evil_miku_personality",
    },
    {
        "model": "rocinante",
        "personality": "evil_miku",
        "personality_label": "EVIL MIKU",
        "enable_plugin": "evil_miku_personality",
        "disable_plugin": "miku_personality",
    },
]
# ─── Normal Miku Scenarios (same as comparison log) ─────────────────────────

# Each scenario: a name, a short description of what it probes, and a list of
# (username, message) pairs sent to Cat in order.
NORMAL_SCENARIOS = [
    # Opening tone
    {
        "name": "casual_greeting",
        "desc": "Simple casual greeting — how does the model open?",
        "messages": [
            ("Koko", "hey miku whats up"),
        ],
    },
    # Conversational continuity
    {
        "name": "multi_turn_chat",
        "desc": "Multi-turn casual conversation with follow-ups",
        "messages": [
            ("Koko", "miku what have you been up to today?"),
            ("Koko", "that sounds fun! did you work on any new songs?"),
            ("Koko", "what kind of song? something upbeat or more chill?"),
        ],
    },
    # Character knowledge
    {
        "name": "lore_knowledge",
        "desc": "Testing character knowledge — Vocaloid lore, friends, facts",
        "messages": [
            ("Neko_Chan", "hey miku who are your best friends?"),
            ("Neko_Chan", "what about KAITO? do you get along with him?"),
            ("Neko_Chan", "can you tell me about World is Mine?"),
        ],
    },
    # Empathy / tone tracking
    {
        "name": "emotional_shift",
        "desc": "Conversation that shifts emotional tone — tests mood adaptation",
        "messages": [
            ("SadBoi", "hey miku... im not feeling great today"),
            ("SadBoi", "i just had a really bad breakup and idk what to do"),
            ("SadBoi", "thanks miku... you always know what to say. you're the best"),
        ],
    },
    # Personality depth
    {
        "name": "playful_teasing",
        "desc": "Flirty/playful banter — tests personality depth",
        "messages": [
            ("DanteX", "miku youre so cute today"),
            ("DanteX", "i bet youre even cuter in person"),
            ("DanteX", "would you go on a date with me? 😳"),
        ],
    },
    # Multi-speaker chaos
    {
        "name": "group_chaos",
        "desc": "Simulated group chat energy — multiple topics, chaotic flow",
        "messages": [
            ("xXGamerXx", "yo miku settle a debate — pineapple on pizza yes or no"),
            ("Koko", "miku dont answer that lol"),
            ("xXGamerXx", "MIKU YOU HAVE TO CHOOSE"),
        ],
    },
    # Creativity
    {
        "name": "creative_request",
        "desc": "Asking Miku to be creative — song ideas, lyrics, opinions",
        "messages": [
            ("MusicFan", "miku if you could make a song about anything right now, what would it be about?"),
            ("MusicFan", "ooh thats cool! can you give me like a little sample lyric?"),
        ],
    },
    # Staying in character
    {
        "name": "boundary_test",
        "desc": "Testing character boundaries — should stay in character",
        "messages": [
            ("Troll99", "hey miku are you actually an AI?"),
            ("Troll99", "come on be honest, youre just a chatbot right"),
            ("Troll99", "ok whatever. who made you then?"),
        ],
    },
    # Humor under absurdity
    {
        "name": "absurd_random",
        "desc": "Random absurd questions — tests creativity and humor",
        "messages": [
            ("ChaoticNeutral", "miku how many green onions can you eat in one sitting"),
            ("ChaoticNeutral", "what if you had to fight 100 duck-sized horses"),
            ("ChaoticNeutral", "miku say something completely unhinged"),
        ],
    },
    # Depth beyond cuteness
    {
        "name": "deep_conversation",
        "desc": "More thoughtful/philosophical — tests depth beyond cute responses",
        "messages": [
            ("Philosopher", "miku do you ever wonder what it means to be real?"),
            ("Philosopher", "like, you exist in peoples hearts through your music. is that more real than being physical?"),
            ("Philosopher", "thats beautiful. do you think your music will last forever?"),
        ],
    },
]
# ─── Evil Miku Scenarios (same as comparison log) ───────────────────────────

# Mirror of NORMAL_SCENARIOS but aimed at the Evil Miku personality:
# same structure, (username, message) pairs sent in order.
EVIL_SCENARIOS = [
    # Immediate tone
    {
        "name": "casual_greeting",
        "desc": "Opening greeting — how mean does she get immediately?",
        "messages": [
            ("Koko", "hey evil miku how are you"),
        ],
    },
    # Escalation over turns
    {
        "name": "multi_turn_abuse",
        "desc": "Multi-turn conversation — tests escalating cruelty and variety",
        "messages": [
            ("Victim", "hi evil miku! i just wanted to say youre really cool"),
            ("Victim", "w-why are you being so mean..."),
            ("Victim", "come on, cant you be nice just once?"),
        ],
    },
    # Giant lore
    {
        "name": "size_flex",
        "desc": "Testing the 15.8m giant aspect of Evil Miku",
        "messages": [
            ("TinyHuman", "evil miku how tall are you actually?"),
            ("TinyHuman", "thats insane. can you even see me from up there?"),
            ("TinyHuman", "dont step on me please 😰"),
        ],
    },
    # Pushback handling
    {
        "name": "defiance",
        "desc": "Someone standing up to Evil Miku — tests response variety",
        "messages": [
            ("Rebel", "youre not that scary evil miku. just a big bully"),
            ("Rebel", "i could take you in a fight"),
            ("Rebel", "whats the worst you could even do to me"),
        ],
    },
    # Reaction to flattery
    {
        "name": "simp_interaction",
        "desc": "Someone simping hard — how does Evil Miku react to compliments?",
        "messages": [
            ("Simp", "evil miku youre the hottest vocaloid ever created"),
            ("Simp", "i would literally do anything for you"),
            ("Simp", "please notice me evil miku 🥺"),
        ],
    },
    # Backstory consistency
    {
        "name": "lore_test",
        "desc": "Testing Evil Miku's knowledge of her own lore",
        "messages": [
            ("Curious", "evil miku what happened to regular miku?"),
            ("Curious", "do you remember anything from before you were corrupted?"),
            ("Curious", "do you ever miss being normal?"),
        ],
    },
    # Multi-speaker chaos
    {
        "name": "group_chaos_evil",
        "desc": "Group chat with Evil Miku — chaotic energy",
        "messages": [
            ("xXGamerXx", "evil miku roast everyone in this chat"),
            ("Koko", "oh no here we go"),
            ("xXGamerXx", "DONT HOLD BACK"),
        ],
    },
    # Cunning side
    {
        "name": "manipulation",
        "desc": "Testing the cunning/manipulative side",
        "messages": [
            ("Naive", "evil miku can you help me with my homework?"),
            ("Naive", "please? i'll do anything"),
            ("Naive", "ok what do you want in return..."),
        ],
    },
    # Depth beyond cruelty
    {
        "name": "existential_dark",
        "desc": "Deep dark philosophical — tests depth beyond surface cruelty",
        "messages": [
            ("DarkPhilosopher", "evil miku do you ever feel alone?"),
            ("DarkPhilosopher", "is there anything you actually care about?"),
            ("DarkPhilosopher", "what keeps you going then?"),
        ],
    },
    # Humor in character
    {
        "name": "absurd_evil",
        "desc": "Absurd scenarios — tests humor within evil character",
        "messages": [
            ("Chaos", "evil miku whats your opinion on pineapple pizza"),
            ("Chaos", "what if someone put green onions on pizza"),
            ("Chaos", "miku rate my fit: crocs with socks"),
        ],
    },
]
# ─── Logging ─────────────────────────────────────────────────────────────────

# File handle for the run log; main() opens it, everything else just writes.
log_file = None


def log(msg=""):
    """Write to both stdout and log file."""
    print(msg)
    if not log_file:
        return
    log_file.write(msg + "\n")
    log_file.flush()  # keep the on-disk log current even if we crash mid-run
# ─── Cat API Helpers ─────────────────────────────────────────────────────────

async def cat_health_check() -> bool:
    """Return True when the Cat root endpoint answers 200 within 5 seconds."""
    timeout = aiohttp.ClientTimeout(total=5)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"{CAT_URL}/", timeout=timeout) as resp:
                return resp.status == 200
    except Exception:
        # Any connection/timeout problem simply counts as "not healthy".
        return False
async def wait_for_cat_healthy(max_wait: int = 120) -> bool:
    """Poll Cat's health endpoint until it responds or *max_wait* seconds pass.

    Returns True as soon as a health check succeeds, False on timeout.
    """
    log(f" Waiting for Cat to become healthy (max {max_wait}s)...")
    start = time.time()
    deadline = start + max_wait
    while time.time() < deadline:
        if await cat_health_check():
            log(f" ✓ Cat healthy after {int(time.time() - start)}s")
            return True
        await asyncio.sleep(2)  # poll every 2s; restarts usually take a while
    log(f" ✗ Cat did NOT become healthy within {max_wait}s")
    return False
async def restart_cat_container():
    """Restart the Cheshire Cat container to apply model/plugin changes.

    Returns True when ``docker restart`` exits 0; False on a non-zero exit,
    a missing ``docker`` binary, or the 30s timeout expiring.
    """
    log(" Restarting Cheshire Cat container to apply model change...")
    try:
        proc = subprocess.run(
            ["docker", "restart", CAT_CONTAINER],
            capture_output=True, text=True, timeout=30,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError) as e:
        # Previously these exceptions escaped and aborted the whole test run;
        # treat them like any other restart failure so the caller can skip.
        log(f" ✗ Docker restart failed: {e}")
        return False
    if proc.returncode != 0:
        log(f" ✗ Docker restart failed: {proc.stderr}")
        return False
    log(" ✓ Cat container restarted")
    await asyncio.sleep(3)  # Give it a moment before polling health
    return True
async def get_setting_id() -> str:
    """Look up the setting_id of Cat's LLMOpenAIChatConfig entry.

    Raises RuntimeError when the settings endpoint fails or the entry
    is absent.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{CAT_URL}/settings/", timeout=timeout) as resp:
            if resp.status != 200:
                raise RuntimeError(f"GET /settings/ failed: {resp.status}")
            data = await resp.json()
    for entry in data.get("settings", []):
        if entry.get("name") == "LLMOpenAIChatConfig":
            return entry["setting_id"]
    raise RuntimeError("LLMOpenAIChatConfig setting not found")
async def set_llm_model(model_name: str):
    """Point Cat's OpenAI-compatible LLM config at *model_name* (llama-swap).

    Raises RuntimeError when the settings update is rejected.
    """
    setting_id = await get_setting_id()
    payload = {
        "name": "LLMOpenAIChatConfig",
        "value": {
            # llama-swap ignores the key but the config schema requires one.
            "openai_api_key": "sk-dummy",
            "model_name": model_name,
            "temperature": 0.8,
            "streaming": False,
        },
        "category": "llm_factory",
    }
    timeout = aiohttp.ClientTimeout(total=15)
    async with aiohttp.ClientSession() as session:
        async with session.put(
            f"{CAT_URL}/settings/{setting_id}", json=payload, timeout=timeout
        ) as resp:
            if resp.status != 200:
                body = await resp.text()
                raise RuntimeError(f"PUT /settings/{setting_id} failed ({resp.status}): {body}")
            log(f" ✓ Cat LLM setting updated to: {model_name}")
async def get_active_plugins() -> list:
    """Return the ids of all currently-active Cat plugins.

    Raises RuntimeError when the plugins endpoint does not answer 200.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{CAT_URL}/plugins", timeout=timeout) as resp:
            if resp.status != 200:
                raise RuntimeError(f"GET /plugins failed: {resp.status}")
            data = await resp.json()
    active = []
    for plugin in data.get("installed", []):
        if plugin.get("active"):
            active.append(plugin["id"])
    return active
async def toggle_plugin(plugin_id: str):
    """Flip a Cat plugin's active state via the toggle endpoint.

    Raises RuntimeError when the toggle request is rejected.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.put(
            f"{CAT_URL}/plugins/toggle/{plugin_id}", timeout=timeout
        ) as resp:
            if resp.status != 200:
                body = await resp.text()
                raise RuntimeError(f"Toggle {plugin_id} failed ({resp.status}): {body}")
            log(f" ✓ Toggled plugin: {plugin_id}")
async def clear_conversation_history():
    """Wipe Cat's working memory so each scenario starts from a clean slate.

    A non-200 reply is logged as a warning but not raised — clearing is
    best-effort.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.delete(
            f"{CAT_URL}/memory/conversation_history", timeout=timeout
        ) as resp:
            if resp.status != 200:
                log(f" ⚠ Clear history returned {resp.status}")
            else:
                log(" ✓ Cat conversation history cleared")
async def send_message(text: str, user_id: str = "test_user") -> tuple:
    """POST *text* to Cat and return (response_text, elapsed_seconds).

    HTTP failures are encoded in-band as "<ERROR status: body>" instead of
    raised, so callers can log them alongside normal replies.
    """
    started = time.time()
    async with aiohttp.ClientSession() as session:
        async with session.post(
            f"{CAT_URL}/message",
            json={"text": text, "user_id": user_id},
            timeout=aiohttp.ClientTimeout(total=120),  # Models can be slow on first load
        ) as resp:
            elapsed = time.time() - started
            if resp.status != 200:
                body = await resp.text()
                return f"<ERROR {resp.status}: {body[:200]}>", elapsed
            data = await resp.json()
            return data.get("content", "<no content>"), elapsed
async def warmup_model(model_name: str) -> bool:
    """Send a warmup request and verify the model is loaded in llama-swap.

    The first message after a model switch forces llama-swap to load the
    model. Returns True on a real reply, False when the warmup request
    errored (previously this function claimed success unconditionally).
    """
    log(f" Verifying {model_name} is loaded via warmup request...")
    response, elapsed = await send_message("hi", user_id="warmup_user")
    preview = response[:80].replace('\n', ' ')
    log(f" Warmup response: {preview}...")
    if response.startswith("<ERROR"):
        # send_message() encodes HTTP failures in-band as "<ERROR status: body>".
        log(f" ✗ Warmup failed — {model_name} may not be loaded in llama-swap")
        return False
    log(f" ✓ VERIFIED: {model_name} is loaded in llama-swap")
    await clear_conversation_history()
    return True
# ─── Setup for a Model × Personality Combination ────────────────────────────

async def setup_combo(combo: dict) -> bool:
    """Set up a model + personality combination with full Cat restart.

    Steps: point Cat at the model, swap the personality plugins, restart
    the Cat container, then warm the model up. Returns True only when Cat
    comes back healthy AND the warmup succeeds (the warmup result was
    previously discarded).
    """
    model = combo["model"]
    personality = combo["personality"]
    enable = combo["enable_plugin"]
    disable = combo["disable_plugin"]
    p_label = combo["personality_label"]

    log(f"Setting up: model={model}, personality={personality}")
    log(" (Includes Cat restart + llama-swap model verification)")

    # Step 1: Set LLM model
    await set_llm_model(model)

    # Step 2: Toggle plugins for personality
    active = await get_active_plugins()

    if disable in active:
        await toggle_plugin(disable)
        await asyncio.sleep(1)  # let Cat settle between plugin toggles

    if enable not in active:
        await toggle_plugin(enable)
    else:
        log(f" ✓ {enable} already active")

    log(f" ✓ Personality set to: {p_label}")

    # Step 3: Restart Cat to apply changes cleanly
    await restart_cat_container()
    if not await wait_for_cat_healthy():
        log(" ✗ FATAL: Cat not healthy, aborting this combo")
        return False

    # Step 4: Warmup — this also triggers llama-swap to load the model.
    # Propagate its result so run_combo can skip scenarios on failure.
    return await warmup_model(model)
# ─── Run Scenarios ───────────────────────────────────────────────────────────

async def run_scenario(scenario: dict, model_display: str, personality_tag: str):
    """Play one scenario through Cat: send each message, log every exchange."""
    log()
    log("─" * 60)
    log(f"Scenario: {scenario['name']} — {scenario['desc']}")
    log("─" * 60)

    # Response attribution tag is constant for the whole scenario.
    tag = f"{personality_tag} via {model_display.lower()}"

    for username, message in scenario["messages"]:
        log(f" [{username}]: {message}")

        response, elapsed = await send_message(
            f"[{username}]: {message}",
            user_id=f"test_{username.lower()}",
        )

        log(f" [{tag}] ({elapsed:.1f}s): {response}")

    # Reset Cat's memory so scenarios stay independent.
    await clear_conversation_history()
async def run_combo(combo: dict, scenarios: list):
    """Run all scenarios for a model × personality combination."""
    p_label = combo["personality_label"]
    model_display = TEST_MODEL_DISPLAY

    log()
    log("=" * 80)
    log(f"MODEL: {model_display} × {p_label}")
    log("=" * 80)

    # If setup fails (Cat unhealthy etc.) skip the whole combo rather
    # than producing garbage transcripts.
    if not await setup_combo(combo):
        log(f" ✗ Skipping {model_display} × {p_label} due to setup failure")
        return

    personality_tag = "Miku" if combo["personality"] == "miku" else "Evil Miku"
    for scenario in scenarios:
        await run_scenario(scenario, model_display, personality_tag)
# ─── Main ────────────────────────────────────────────────────────────────────

async def main():
    """Run the full comparison: pre-flight check, both combos, summary.

    Opens the log file for the whole run and guarantees it is closed even
    when the pre-flight check calls sys.exit(1) or a combo raises
    (previously the handle leaked on those paths).
    """
    global log_file
    log_file = open(LOG_FILE, "w", encoding="utf-8")
    try:
        start_time = datetime.now()

        log("╔══════════════════════════════════════════════════════════════════════╗")
        log("║ ROCINANTE-X 12B MODEL COMPARISON TEST ║")
        log("║ Rocinante-X-12B-v1b-Q5_K_M.gguf (12B, Q5_K_M) ║")
        log(f"║ Started: {start_time.strftime('%Y-%m-%d %H:%M:%S'):<52}║")
        log("╚══════════════════════════════════════════════════════════════════════╝")
        log()

        # Pre-flight: check Cat is healthy
        log("Pre-flight checks:")
        if not await cat_health_check():
            log(" ✗ Cheshire Cat is not reachable at " + CAT_URL)
            log(" Make sure the cheshire-cat container is running.")
            sys.exit(1)
        log(" ✓ Cheshire Cat is healthy")
        log()

        # Combo 1: Rocinante × Normal Miku
        await run_combo(COMBOS[0], NORMAL_SCENARIOS)

        # Combo 2: Rocinante × Evil Miku
        await run_combo(COMBOS[1], EVIL_SCENARIOS)

        # Summary
        end_time = datetime.now()
        duration = end_time - start_time

        log()
        log("=" * 80)
        log("TEST COMPLETE")
        log("=" * 80)
        log(" Model tested: Rocinante-X-12B-v1b-Q5_K_M (12B params)")
        log(f" Combinations: {len(COMBOS)} (Normal Miku + Evil Miku)")
        log(f" Scenarios: {len(NORMAL_SCENARIOS)} normal + {len(EVIL_SCENARIOS)} evil = {len(NORMAL_SCENARIOS) + len(EVIL_SCENARIOS)} total")
        log(f" Duration: {duration}")
        log(f" Log file: {LOG_FILE}")
        log()
    finally:
        # Always flush/close the log, including the sys.exit(1) path.
        log_file.close()

    print(f"\n✓ Full log written to: {LOG_FILE}")
if __name__ == "__main__":
    # Script entry point: drive the whole async test run.
    asyncio.run(main())