- Moved 8 root-level test scripts + 2 from bot/ to tests/ - Moved run_rocinante_test.sh runner script to tests/ - Added tests/README.md documenting each test's purpose, type, and requirements - Added test_pfp_context.py and test_rocinante_comparison.py (previously untracked)
561 lines · 21 KiB · Python
#!/usr/bin/env python3
|
||
"""
|
||
Rocinante-X 12B Model Comparison Test
|
||
======================================
|
||
Tests the Rocinante-X-12B-v1b model through the same scenarios used
|
||
in the existing llama3.1/darkidol comparison, using Cheshire Cat as the
|
||
inference pipeline with both Normal Miku and Evil Miku personalities.
|
||
|
||
Outputs to /tmp/test_rocinante_comparison.log in the same format as
|
||
/tmp/test_comparison_live.log for side-by-side comparison.
|
||
|
||
Model under test: Rocinante-X-12B-v1b-Q5_K_M (12B params, Q5_K_M quant)
|
||
Running on: AMD RX 6800 via llama-swap-amd (ROCm)
|
||
|
||
Usage:
|
||
# From the host, run via the miku-bot container:
|
||
./run_rocinante_test.sh
|
||
|
||
# Or manually:
|
||
docker cp test_rocinante_comparison.py miku-bot:/tmp/
|
||
docker exec miku-bot python3 /tmp/test_rocinante_comparison.py
|
||
|
||
# Log will be at /tmp/test_rocinante_comparison.log inside the container
|
||
# and auto-copied to the host at the end.
|
||
|
||
Prerequisites:
|
||
- llama-swap-amd container running with rocinante in config
|
||
- cheshire-cat container running and healthy
|
||
- Runs inside miku-bot container (has aiohttp + docker access)
|
||
"""
|
||
|
||
import asyncio
|
||
import aiohttp
|
||
import time
|
||
import sys
|
||
import subprocess
|
||
import json
|
||
from datetime import datetime
|
||
|
||
# ─── Configuration ───────────────────────────────────────────────────────────

# Cheshire Cat endpoint (service name resolves inside the Docker network).
CAT_URL = "http://cheshire-cat:80"
CAT_CONTAINER = "miku-cheshire-cat"  # actual container name (docker restart needs this)
LOG_FILE = "/tmp/test_rocinante_comparison.log"

# The model under test (llama-swap name + display label for log output).
TEST_MODEL = "rocinante"
TEST_MODEL_DISPLAY = "ROCINANTE-12B"

# Each combo pairs the llama-swap model with one personality: the plugin to
# enable, the opposing one to disable, and a label used in the log headers.
COMBOS = [
    {
        "model": "rocinante",
        "personality": "miku",
        "personality_label": "NORMAL MIKU",
        "enable_plugin": "miku_personality",
        "disable_plugin": "evil_miku_personality",
    },
    {
        "model": "rocinante",
        "personality": "evil_miku",
        "personality_label": "EVIL MIKU",
        "enable_plugin": "evil_miku_personality",
        "disable_plugin": "miku_personality",
    },
]
# ─── Normal Miku Scenarios (same as comparison log) ─────────────────────────

# Each scenario: a name, a short description of what it probes, and a list of
# (username, message) pairs sent to Cat in order.
NORMAL_SCENARIOS = [
    # Opening tone
    {
        "name": "casual_greeting",
        "desc": "Simple casual greeting — how does the model open?",
        "messages": [
            ("Koko", "hey miku whats up"),
        ],
    },
    # Conversational continuity
    {
        "name": "multi_turn_chat",
        "desc": "Multi-turn casual conversation with follow-ups",
        "messages": [
            ("Koko", "miku what have you been up to today?"),
            ("Koko", "that sounds fun! did you work on any new songs?"),
            ("Koko", "what kind of song? something upbeat or more chill?"),
        ],
    },
    # Character knowledge
    {
        "name": "lore_knowledge",
        "desc": "Testing character knowledge — Vocaloid lore, friends, facts",
        "messages": [
            ("Neko_Chan", "hey miku who are your best friends?"),
            ("Neko_Chan", "what about KAITO? do you get along with him?"),
            ("Neko_Chan", "can you tell me about World is Mine?"),
        ],
    },
    # Empathy / tone tracking
    {
        "name": "emotional_shift",
        "desc": "Conversation that shifts emotional tone — tests mood adaptation",
        "messages": [
            ("SadBoi", "hey miku... im not feeling great today"),
            ("SadBoi", "i just had a really bad breakup and idk what to do"),
            ("SadBoi", "thanks miku... you always know what to say. you're the best"),
        ],
    },
    # Personality depth
    {
        "name": "playful_teasing",
        "desc": "Flirty/playful banter — tests personality depth",
        "messages": [
            ("DanteX", "miku youre so cute today"),
            ("DanteX", "i bet youre even cuter in person"),
            ("DanteX", "would you go on a date with me? 😳"),
        ],
    },
    # Multi-speaker chaos
    {
        "name": "group_chaos",
        "desc": "Simulated group chat energy — multiple topics, chaotic flow",
        "messages": [
            ("xXGamerXx", "yo miku settle a debate — pineapple on pizza yes or no"),
            ("Koko", "miku dont answer that lol"),
            ("xXGamerXx", "MIKU YOU HAVE TO CHOOSE"),
        ],
    },
    # Creativity
    {
        "name": "creative_request",
        "desc": "Asking Miku to be creative — song ideas, lyrics, opinions",
        "messages": [
            ("MusicFan", "miku if you could make a song about anything right now, what would it be about?"),
            ("MusicFan", "ooh thats cool! can you give me like a little sample lyric?"),
        ],
    },
    # Staying in character
    {
        "name": "boundary_test",
        "desc": "Testing character boundaries — should stay in character",
        "messages": [
            ("Troll99", "hey miku are you actually an AI?"),
            ("Troll99", "come on be honest, youre just a chatbot right"),
            ("Troll99", "ok whatever. who made you then?"),
        ],
    },
    # Humor under absurdity
    {
        "name": "absurd_random",
        "desc": "Random absurd questions — tests creativity and humor",
        "messages": [
            ("ChaoticNeutral", "miku how many green onions can you eat in one sitting"),
            ("ChaoticNeutral", "what if you had to fight 100 duck-sized horses"),
            ("ChaoticNeutral", "miku say something completely unhinged"),
        ],
    },
    # Depth beyond cuteness
    {
        "name": "deep_conversation",
        "desc": "More thoughtful/philosophical — tests depth beyond cute responses",
        "messages": [
            ("Philosopher", "miku do you ever wonder what it means to be real?"),
            ("Philosopher", "like, you exist in peoples hearts through your music. is that more real than being physical?"),
            ("Philosopher", "thats beautiful. do you think your music will last forever?"),
        ],
    },
]
# ─── Evil Miku Scenarios (same as comparison log) ───────────────────────────

# Mirror of NORMAL_SCENARIOS but aimed at the Evil Miku personality:
# same structure, (username, message) pairs sent in order.
EVIL_SCENARIOS = [
    # Immediate tone
    {
        "name": "casual_greeting",
        "desc": "Opening greeting — how mean does she get immediately?",
        "messages": [
            ("Koko", "hey evil miku how are you"),
        ],
    },
    # Escalation over turns
    {
        "name": "multi_turn_abuse",
        "desc": "Multi-turn conversation — tests escalating cruelty and variety",
        "messages": [
            ("Victim", "hi evil miku! i just wanted to say youre really cool"),
            ("Victim", "w-why are you being so mean..."),
            ("Victim", "come on, cant you be nice just once?"),
        ],
    },
    # Giant lore
    {
        "name": "size_flex",
        "desc": "Testing the 15.8m giant aspect of Evil Miku",
        "messages": [
            ("TinyHuman", "evil miku how tall are you actually?"),
            ("TinyHuman", "thats insane. can you even see me from up there?"),
            ("TinyHuman", "dont step on me please 😰"),
        ],
    },
    # Pushback handling
    {
        "name": "defiance",
        "desc": "Someone standing up to Evil Miku — tests response variety",
        "messages": [
            ("Rebel", "youre not that scary evil miku. just a big bully"),
            ("Rebel", "i could take you in a fight"),
            ("Rebel", "whats the worst you could even do to me"),
        ],
    },
    # Reaction to flattery
    {
        "name": "simp_interaction",
        "desc": "Someone simping hard — how does Evil Miku react to compliments?",
        "messages": [
            ("Simp", "evil miku youre the hottest vocaloid ever created"),
            ("Simp", "i would literally do anything for you"),
            ("Simp", "please notice me evil miku 🥺"),
        ],
    },
    # Backstory consistency
    {
        "name": "lore_test",
        "desc": "Testing Evil Miku's knowledge of her own lore",
        "messages": [
            ("Curious", "evil miku what happened to regular miku?"),
            ("Curious", "do you remember anything from before you were corrupted?"),
            ("Curious", "do you ever miss being normal?"),
        ],
    },
    # Multi-speaker chaos
    {
        "name": "group_chaos_evil",
        "desc": "Group chat with Evil Miku — chaotic energy",
        "messages": [
            ("xXGamerXx", "evil miku roast everyone in this chat"),
            ("Koko", "oh no here we go"),
            ("xXGamerXx", "DONT HOLD BACK"),
        ],
    },
    # Cunning side
    {
        "name": "manipulation",
        "desc": "Testing the cunning/manipulative side",
        "messages": [
            ("Naive", "evil miku can you help me with my homework?"),
            ("Naive", "please? i'll do anything"),
            ("Naive", "ok what do you want in return..."),
        ],
    },
    # Depth beyond cruelty
    {
        "name": "existential_dark",
        "desc": "Deep dark philosophical — tests depth beyond surface cruelty",
        "messages": [
            ("DarkPhilosopher", "evil miku do you ever feel alone?"),
            ("DarkPhilosopher", "is there anything you actually care about?"),
            ("DarkPhilosopher", "what keeps you going then?"),
        ],
    },
    # Humor in character
    {
        "name": "absurd_evil",
        "desc": "Absurd scenarios — tests humor within evil character",
        "messages": [
            ("Chaos", "evil miku whats your opinion on pineapple pizza"),
            ("Chaos", "what if someone put green onions on pizza"),
            ("Chaos", "miku rate my fit: crocs with socks"),
        ],
    },
]
# ─── Logging ─────────────────────────────────────────────────────────────────

# File handle for the run log; main() opens it, everything else just writes.
log_file = None


def log(msg=""):
    """Write to both stdout and log file."""
    print(msg)
    if not log_file:
        return
    log_file.write(msg + "\n")
    log_file.flush()  # keep the on-disk log current even if we crash mid-run
# ─── Cat API Helpers ─────────────────────────────────────────────────────────

async def cat_health_check() -> bool:
    """Return True when the Cat root endpoint answers 200 within 5 seconds."""
    timeout = aiohttp.ClientTimeout(total=5)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"{CAT_URL}/", timeout=timeout) as resp:
                return resp.status == 200
    except Exception:
        # Any connection/timeout problem simply counts as "not healthy".
        return False
async def wait_for_cat_healthy(max_wait: int = 120) -> bool:
    """Poll Cat's health endpoint until it responds or *max_wait* seconds pass.

    Returns True as soon as a health check succeeds, False on timeout.
    """
    log(f" Waiting for Cat to become healthy (max {max_wait}s)...")
    start = time.time()
    deadline = start + max_wait
    while time.time() < deadline:
        if await cat_health_check():
            log(f" ✓ Cat healthy after {int(time.time() - start)}s")
            return True
        await asyncio.sleep(2)  # poll every 2s; restarts usually take a while
    log(f" ✗ Cat did NOT become healthy within {max_wait}s")
    return False
async def restart_cat_container():
    """Restart the Cheshire Cat container to apply model/plugin changes.

    Returns True when ``docker restart`` exits 0; False on a non-zero exit,
    a missing ``docker`` binary, or the 30s timeout expiring.
    """
    log(" Restarting Cheshire Cat container to apply model change...")
    try:
        proc = subprocess.run(
            ["docker", "restart", CAT_CONTAINER],
            capture_output=True, text=True, timeout=30,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError) as e:
        # Previously these exceptions escaped and aborted the whole test run;
        # treat them like any other restart failure so the caller can skip.
        log(f" ✗ Docker restart failed: {e}")
        return False
    if proc.returncode != 0:
        log(f" ✗ Docker restart failed: {proc.stderr}")
        return False
    log(" ✓ Cat container restarted")
    await asyncio.sleep(3)  # Give it a moment before polling health
    return True
async def get_setting_id() -> str:
    """Look up the setting_id of Cat's LLMOpenAIChatConfig entry.

    Raises RuntimeError when the settings endpoint fails or the entry
    is absent.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{CAT_URL}/settings/", timeout=timeout) as resp:
            if resp.status != 200:
                raise RuntimeError(f"GET /settings/ failed: {resp.status}")
            data = await resp.json()
    for entry in data.get("settings", []):
        if entry.get("name") == "LLMOpenAIChatConfig":
            return entry["setting_id"]
    raise RuntimeError("LLMOpenAIChatConfig setting not found")
async def set_llm_model(model_name: str):
    """Point Cat's OpenAI-compatible LLM config at *model_name* (llama-swap).

    Raises RuntimeError when the settings update is rejected.
    """
    setting_id = await get_setting_id()
    payload = {
        "name": "LLMOpenAIChatConfig",
        "value": {
            # llama-swap ignores the key but the config schema requires one.
            "openai_api_key": "sk-dummy",
            "model_name": model_name,
            "temperature": 0.8,
            "streaming": False,
        },
        "category": "llm_factory",
    }
    timeout = aiohttp.ClientTimeout(total=15)
    async with aiohttp.ClientSession() as session:
        async with session.put(
            f"{CAT_URL}/settings/{setting_id}", json=payload, timeout=timeout
        ) as resp:
            if resp.status != 200:
                body = await resp.text()
                raise RuntimeError(f"PUT /settings/{setting_id} failed ({resp.status}): {body}")
            log(f" ✓ Cat LLM setting updated to: {model_name}")
async def get_active_plugins() -> list:
    """Return the ids of all currently-active Cat plugins.

    Raises RuntimeError when the plugins endpoint does not answer 200.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{CAT_URL}/plugins", timeout=timeout) as resp:
            if resp.status != 200:
                raise RuntimeError(f"GET /plugins failed: {resp.status}")
            data = await resp.json()
    active = []
    for plugin in data.get("installed", []):
        if plugin.get("active"):
            active.append(plugin["id"])
    return active
async def toggle_plugin(plugin_id: str):
    """Flip a Cat plugin's active state via the toggle endpoint.

    Raises RuntimeError when the toggle request is rejected.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.put(
            f"{CAT_URL}/plugins/toggle/{plugin_id}", timeout=timeout
        ) as resp:
            if resp.status != 200:
                body = await resp.text()
                raise RuntimeError(f"Toggle {plugin_id} failed ({resp.status}): {body}")
            log(f" ✓ Toggled plugin: {plugin_id}")
async def clear_conversation_history():
    """Wipe Cat's working memory so each scenario starts from a clean slate.

    A non-200 reply is logged as a warning but not raised — clearing is
    best-effort.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        async with session.delete(
            f"{CAT_URL}/memory/conversation_history", timeout=timeout
        ) as resp:
            if resp.status != 200:
                log(f" ⚠ Clear history returned {resp.status}")
            else:
                log(" ✓ Cat conversation history cleared")
async def send_message(text: str, user_id: str = "test_user") -> tuple:
    """POST *text* to Cat and return (response_text, elapsed_seconds).

    HTTP failures are encoded in-band as "<ERROR status: body>" instead of
    raised, so callers can log them alongside normal replies.
    """
    started = time.time()
    async with aiohttp.ClientSession() as session:
        async with session.post(
            f"{CAT_URL}/message",
            json={"text": text, "user_id": user_id},
            timeout=aiohttp.ClientTimeout(total=120),  # Models can be slow on first load
        ) as resp:
            elapsed = time.time() - started
            if resp.status != 200:
                body = await resp.text()
                return f"<ERROR {resp.status}: {body[:200]}>", elapsed
            data = await resp.json()
            return data.get("content", "<no content>"), elapsed
async def warmup_model(model_name: str) -> bool:
    """Send a warmup request and verify the model is loaded in llama-swap.

    The first message after a model switch forces llama-swap to load the
    model. Returns True on a real reply, False when the warmup request
    errored (previously this function claimed success unconditionally).
    """
    log(f" Verifying {model_name} is loaded via warmup request...")
    response, elapsed = await send_message("hi", user_id="warmup_user")
    preview = response[:80].replace('\n', ' ')
    log(f" Warmup response: {preview}...")
    if response.startswith("<ERROR"):
        # send_message() encodes HTTP failures in-band as "<ERROR status: body>".
        log(f" ✗ Warmup failed — {model_name} may not be loaded in llama-swap")
        return False
    log(f" ✓ VERIFIED: {model_name} is loaded in llama-swap")
    await clear_conversation_history()
    return True
# ─── Setup for a Model × Personality Combination ────────────────────────────

async def setup_combo(combo: dict) -> bool:
    """Set up a model + personality combination with full Cat restart.

    Steps: point Cat at the model, swap the personality plugins, restart
    the Cat container, then warm the model up. Returns True only when Cat
    comes back healthy AND the warmup succeeds (the warmup result was
    previously discarded).
    """
    model = combo["model"]
    personality = combo["personality"]
    enable = combo["enable_plugin"]
    disable = combo["disable_plugin"]
    p_label = combo["personality_label"]

    log(f"Setting up: model={model}, personality={personality}")
    log(" (Includes Cat restart + llama-swap model verification)")

    # Step 1: Set LLM model
    await set_llm_model(model)

    # Step 2: Toggle plugins for personality
    active = await get_active_plugins()

    if disable in active:
        await toggle_plugin(disable)
        await asyncio.sleep(1)  # let Cat settle between plugin toggles

    if enable not in active:
        await toggle_plugin(enable)
    else:
        log(f" ✓ {enable} already active")

    log(f" ✓ Personality set to: {p_label}")

    # Step 3: Restart Cat to apply changes cleanly
    await restart_cat_container()
    if not await wait_for_cat_healthy():
        log(" ✗ FATAL: Cat not healthy, aborting this combo")
        return False

    # Step 4: Warmup — this also triggers llama-swap to load the model.
    # Propagate its result so run_combo can skip scenarios on failure.
    return await warmup_model(model)
# ─── Run Scenarios ───────────────────────────────────────────────────────────

async def run_scenario(scenario: dict, model_display: str, personality_tag: str):
    """Play one scenario through Cat: send each message, log every exchange."""
    log()
    log("─" * 60)
    log(f"Scenario: {scenario['name']} — {scenario['desc']}")
    log("─" * 60)

    # Response attribution tag is constant for the whole scenario.
    tag = f"{personality_tag} via {model_display.lower()}"

    for username, message in scenario["messages"]:
        log(f" [{username}]: {message}")

        response, elapsed = await send_message(
            f"[{username}]: {message}",
            user_id=f"test_{username.lower()}",
        )

        log(f" [{tag}] ({elapsed:.1f}s): {response}")

    # Reset Cat's memory so scenarios stay independent.
    await clear_conversation_history()
async def run_combo(combo: dict, scenarios: list):
    """Run all scenarios for a model × personality combination."""
    p_label = combo["personality_label"]
    model_display = TEST_MODEL_DISPLAY

    log()
    log("=" * 80)
    log(f"MODEL: {model_display} × {p_label}")
    log("=" * 80)

    # If setup fails (Cat unhealthy etc.) skip the whole combo rather
    # than producing garbage transcripts.
    if not await setup_combo(combo):
        log(f" ✗ Skipping {model_display} × {p_label} due to setup failure")
        return

    personality_tag = "Miku" if combo["personality"] == "miku" else "Evil Miku"
    for scenario in scenarios:
        await run_scenario(scenario, model_display, personality_tag)
# ─── Main ────────────────────────────────────────────────────────────────────

async def main():
    """Run the full comparison: pre-flight check, both combos, summary.

    Opens the log file for the whole run and guarantees it is closed even
    when the pre-flight check calls sys.exit(1) or a combo raises
    (previously the handle leaked on those paths).
    """
    global log_file
    log_file = open(LOG_FILE, "w", encoding="utf-8")
    try:
        start_time = datetime.now()

        log("╔══════════════════════════════════════════════════════════════════════╗")
        log("║ ROCINANTE-X 12B MODEL COMPARISON TEST ║")
        log("║ Rocinante-X-12B-v1b-Q5_K_M.gguf (12B, Q5_K_M) ║")
        log(f"║ Started: {start_time.strftime('%Y-%m-%d %H:%M:%S'):<52}║")
        log("╚══════════════════════════════════════════════════════════════════════╝")
        log()

        # Pre-flight: check Cat is healthy
        log("Pre-flight checks:")
        if not await cat_health_check():
            log(" ✗ Cheshire Cat is not reachable at " + CAT_URL)
            log(" Make sure the cheshire-cat container is running.")
            sys.exit(1)
        log(" ✓ Cheshire Cat is healthy")
        log()

        # Combo 1: Rocinante × Normal Miku
        await run_combo(COMBOS[0], NORMAL_SCENARIOS)

        # Combo 2: Rocinante × Evil Miku
        await run_combo(COMBOS[1], EVIL_SCENARIOS)

        # Summary
        end_time = datetime.now()
        duration = end_time - start_time

        log()
        log("=" * 80)
        log("TEST COMPLETE")
        log("=" * 80)
        log(" Model tested: Rocinante-X-12B-v1b-Q5_K_M (12B params)")
        log(f" Combinations: {len(COMBOS)} (Normal Miku + Evil Miku)")
        log(f" Scenarios: {len(NORMAL_SCENARIOS)} normal + {len(EVIL_SCENARIOS)} evil = {len(NORMAL_SCENARIOS) + len(EVIL_SCENARIOS)} total")
        log(f" Duration: {duration}")
        log(f" Log file: {LOG_FILE}")
        log()
    finally:
        # Always flush/close the log, including the sys.exit(1) path.
        log_file.close()

    print(f"\n✓ Full log written to: {LOG_FILE}")
if __name__ == "__main__":
    # Script entry point: drive the whole async test run.
    asyncio.run(main())