Configuration:
- .env.example, .gitignore, compose.yml (main docker compose)
- docker-compose-amd.yml (ROCm), docker-compose-macos.yml
- start.sh, stop.sh convenience scripts
- LICENSE (Apache 2.0, from upstream Cheshire Cat)

Memory management utilities:
- analyze_consolidation.py, manual_consolidation.py, verify_consolidation.py
- check_memories.py, extract_declarative_facts.py, store_declarative_facts.py
- compare_systems.py (system comparison tool)
- benchmark_cat.py, streaming_benchmark.py, streaming_benchmark_v2.py

Test suite:
- quick_test.py, test_setup.py, test_setup_simple.py
- test_consolidation_direct.py, test_declarative_recall.py, test_recall.py
- test_end_to_end.py, test_full_pipeline.py
- test_phase2.py, test_phase2_comprehensive.py

Documentation:
- README.md, QUICK_START.txt, TEST_README.md, SETUP_COMPLETE.md
- PHASE2_IMPLEMENTATION_NOTES.md, PHASE2_TEST_RESULTS.md
- POST_OPTIMIZATION_ANALYSIS.md
213 lines
7.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Comparison Benchmark: Current System vs Cheshire Cat
|
|
Measures the difference in performance between the two approaches
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
|
|
import requests
|
|
import time
|
|
import statistics
|
|
from typing import List, Dict
|
|
import asyncio
|
|
|
|
CAT_URL = "http://localhost:1865"
|
|
|
|
# Import your current LLM function
|
|
try:
|
|
from bot.utils import llm
|
|
from bot import globals as bot_globals
|
|
HAS_BOT_CODE = True
|
|
except ImportError:
|
|
print("⚠️ Could not import bot code - will skip direct comparison")
|
|
HAS_BOT_CODE = False
|
|
|
|
# Ten fixed prompts (persona questions, lore, small talk) sent verbatim to
# both systems so the latency comparison uses identical inputs.
TEST_QUERIES = [
    "What is your favorite food?",
    "Tell me about your friends",
    "What's the song World is Mine about?",
    "Hello Miku!",
    "Do you like to sing?",
    "Who created you?",
    "What color is your hair?",
    "Tell me about green onions",
    "What do you do for fun?",
    "Are you a Vocaloid?",
]
|
|
|
|
def test_cat_query(query: str, timeout: int = 60) -> Dict:
|
|
"""Test query using Cheshire Cat"""
|
|
start_time = time.time()
|
|
|
|
try:
|
|
response = requests.post(
|
|
f"{CAT_URL}/message",
|
|
json={"text": query},
|
|
headers={"Content-Type": "application/json"},
|
|
timeout=timeout
|
|
)
|
|
|
|
latency_ms = (time.time() - start_time) * 1000
|
|
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
content = data.get("content", "")
|
|
|
|
# Filter out tool calls
|
|
if content and not (content.startswith('{"name":') or content.startswith('{')):
|
|
return {
|
|
"success": True,
|
|
"latency_ms": latency_ms,
|
|
"response": content,
|
|
"method": "cheshire_cat"
|
|
}
|
|
else:
|
|
return {
|
|
"success": False,
|
|
"latency_ms": latency_ms,
|
|
"error": "Got tool call instead of text",
|
|
"method": "cheshire_cat"
|
|
}
|
|
else:
|
|
return {
|
|
"success": False,
|
|
"latency_ms": latency_ms,
|
|
"error": f"HTTP {response.status_code}",
|
|
"method": "cheshire_cat"
|
|
}
|
|
except Exception as e:
|
|
return {
|
|
"success": False,
|
|
"latency_ms": (time.time() - start_time) * 1000,
|
|
"error": str(e),
|
|
"method": "cheshire_cat"
|
|
}
|
|
|
|
async def test_current_query(query: str) -> Dict:
    """Run *query* through the existing Miku bot LLM path and time it.

    Returns a result dict shaped like test_cat_query()'s, tagged with
    method="current".  When the bot package failed to import at module
    load, a failure record is returned immediately without timing.
    """
    if not HAS_BOT_CODE:
        return {"success": False, "error": "Bot code not available", "method": "current"}

    started = time.time()

    try:
        # Reuse the bot's own query path so timings reflect production code.
        answer = await llm.query_llama(
            user_prompt=query,
            user_id="benchmark_test",
            guild_id=None,
            response_type="dm_response",
        )
        elapsed_ms = (time.time() - started) * 1000
    except Exception as exc:
        return {
            "success": False,
            "latency_ms": (time.time() - started) * 1000,
            "error": str(exc),
            "method": "current",
        }

    return {
        "success": True,
        "latency_ms": elapsed_ms,
        "response": answer,
        "method": "current",
    }
|
|
|
|
def _print_system_stats(label: str, times: List[float]) -> None:
    """Print mean/median/min/max latency and success rate for one system."""
    print(f"\n{label}:")
    print(f" Mean latency: {statistics.mean(times):.0f} ms")
    print(f" Median latency: {statistics.median(times):.0f} ms")
    print(f" Min latency: {min(times):.0f} ms")
    print(f" Max latency: {max(times):.0f} ms")
    print(f" Success rate: {len(times)}/{len(TEST_QUERIES)} ({len(times)/len(TEST_QUERIES)*100:.0f}%)")


async def run_comparison():
    """Run every TEST_QUERIES prompt through both systems and print a report.

    For each query the Cheshire Cat endpoint is timed first, then (when the
    bot package imported successfully) the current in-process system.  A
    one-second pause follows each request so neither backend is hammered.
    Successful latencies are accumulated and summarized at the end, with a
    head-to-head mean comparison and a voice-chat viability verdict.
    """
    print("=" * 70)
    print("⚖️ COMPARISON: Current System vs Cheshire Cat")
    print("=" * 70)

    cat_times: List[float] = []
    current_times: List[float] = []

    for i, query in enumerate(TEST_QUERIES):
        print(f"\n[{i+1}/{len(TEST_QUERIES)}] Query: '{query}'")
        print("-" * 70)

        # Test Cheshire Cat
        cat_result = test_cat_query(query)
        if cat_result["success"]:
            cat_times.append(cat_result["latency_ms"])
            print(f" 🐱 Cheshire Cat: {cat_result['latency_ms']:.0f}ms")
            print(f" Response: {cat_result['response'][:80]}...")
        else:
            print(f" 🐱 Cheshire Cat: ❌ {cat_result.get('error', 'Failed')}")

        # Small delay between tests
        await asyncio.sleep(1)

        # Test current system (skipped entirely when the bot code is absent)
        if HAS_BOT_CODE:
            current_result = await test_current_query(query)
            if current_result["success"]:
                current_times.append(current_result["latency_ms"])
                print(f" 📦 Current System: {current_result['latency_ms']:.0f}ms")
                print(f" Response: {current_result['response'][:80]}...")
            else:
                print(f" 📦 Current System: ❌ {current_result.get('error', 'Failed')}")

            await asyncio.sleep(1)

    # Print comparison statistics
    print("\n" + "=" * 70)
    print("📊 COMPARISON RESULTS")
    print("=" * 70)

    # Duplicated per-system stat printing factored into one helper.
    if cat_times:
        _print_system_stats("🐱 Cheshire Cat", cat_times)
    if current_times:
        _print_system_stats("📦 Current System", current_times)

    if cat_times and current_times:
        print("\n⚖️ Comparison:")
        cat_mean = statistics.mean(cat_times)
        current_mean = statistics.mean(current_times)
        diff = cat_mean - current_mean
        diff_pct = (diff / current_mean) * 100

        if diff > 0:
            print(f" Cheshire Cat is {diff:.0f}ms SLOWER ({diff_pct:+.1f}%)")
        elif diff < 0:
            print(f" Cheshire Cat is {abs(diff):.0f}ms FASTER ({diff_pct:+.1f}%)")
        else:
            # Fix: the original printed "0ms FASTER" when the means were equal.
            print(" Cheshire Cat and the current system have equal mean latency")

        # Voice chat assessment — thresholds are heuristics for real-time use
        print("\n🎤 Voice Chat Viability:")
        if cat_mean < 1500:
            print(" ✅ Both systems suitable for voice chat")
        elif cat_mean < 2000 and current_mean < 1500:
            print(" ⚠️ Cheshire Cat slower but still usable")
        else:
            print(" ❌ Cheshire Cat may be too slow for real-time voice")

    print("\n" + "=" * 70)
|
|
|
|
def main():
    """CLI entry point: warn when running Cat-only, then drive the benchmark."""
    if not HAS_BOT_CODE:
        for line in (
            "\n⚠️ Running in Cat-only mode (bot code not available)",
            " To run full comparison:",
            " 1. Make sure you're running this from the cheshire-cat directory",
            " 2. Ensure the parent 'bot' directory is accessible\n",
        ):
            print(line)

    asyncio.run(run_comparison())


if __name__ == "__main__":
    main()
|