#!/usr/bin/env python3
"""
Comparison Benchmark: Current System vs Cheshire Cat
Measures the difference in performance between the two approaches
"""
import sys
import os

# Make the parent directory importable so the `bot` package can be found
# when this script is run from the cheshire-cat directory.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

import requests
import time
import statistics
from typing import List, Dict
import asyncio

# Base URL of the locally running Cheshire Cat service.
CAT_URL = "http://localhost:1865"

# Import your current LLM function; fall back to Cat-only mode when the
# bot package is not importable from here.
try:
    from bot.utils import llm
    from bot import globals as bot_globals
    HAS_BOT_CODE = True
except ImportError:
    print("āš ļø Could not import bot code - will skip direct comparison")
    HAS_BOT_CODE = False

# Fixed prompt set so both systems are measured on identical inputs.
TEST_QUERIES = [
    "What is your favorite food?",
    "Tell me about your friends",
    "What's the song World is Mine about?",
    "Hello Miku!",
    "Do you like to sing?",
    "Who created you?",
    "What color is your hair?",
    "Tell me about green onions",
    "What do you do for fun?",
    "Are you a Vocaloid?"
]


def test_cat_query(query: str, timeout: int = 60) -> Dict:
    """Send one query to Cheshire Cat and time the round trip.

    Args:
        query: The user message to send.
        timeout: Per-request timeout in seconds.

    Returns:
        A dict with keys ``success``, ``latency_ms``, ``method`` and either
        ``response`` (on success) or ``error`` (on failure). Never raises;
        network errors are captured into the result dict.
    """
    start_time = time.time()
    try:
        response = requests.post(
            f"{CAT_URL}/message",
            json={"text": query},
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        latency_ms = (time.time() - start_time) * 1000
        if response.status_code == 200:
            data = response.json()
            content = data.get("content", "")
            # Filter out tool calls: any JSON-looking payload starts with '{'.
            # (This single check subsumes the former redundant '{"name":' test.)
            if content and not content.startswith('{'):
                return {
                    "success": True,
                    "latency_ms": latency_ms,
                    "response": content,
                    "method": "cheshire_cat",
                }
            return {
                "success": False,
                "latency_ms": latency_ms,
                "error": "Got tool call instead of text",
                "method": "cheshire_cat",
            }
        return {
            "success": False,
            "latency_ms": latency_ms,
            "error": f"HTTP {response.status_code}",
            "method": "cheshire_cat",
        }
    except Exception as e:
        # Timeouts / connection failures are reported, not raised, so one
        # bad query does not abort the whole benchmark run.
        return {
            "success": False,
            "latency_ms": (time.time() - start_time) * 1000,
            "error": str(e),
            "method": "cheshire_cat",
        }


async def test_current_query(query: str) -> Dict:
    """Test query using current Miku bot system.

    Returns the same result-dict shape as :func:`test_cat_query`, with
    ``method`` set to ``"current"``. Requires the bot package to have been
    imported successfully (``HAS_BOT_CODE``).
    """
    if not HAS_BOT_CODE:
        return {"success": False, "error": "Bot code not available", "method": "current"}
    start_time = time.time()
    try:
        # Use your existing query_llama function
        response = await llm.query_llama(
            user_prompt=query,
            user_id="benchmark_test",
            guild_id=None,
            response_type="dm_response",
        )
        latency_ms = (time.time() - start_time) * 1000
        return {
            "success": True,
            "latency_ms": latency_ms,
            "response": response,
            "method": "current",
        }
    except Exception as e:
        return {
            "success": False,
            "latency_ms": (time.time() - start_time) * 1000,
            "error": str(e),
            "method": "current",
        }


async def run_comparison():
    """Run every test query against both systems and print latency stats."""
    print("=" * 70)
    print("āš–ļø COMPARISON: Current System vs Cheshire Cat")
    print("=" * 70)

    cat_times: List[float] = []
    current_times: List[float] = []

    for i, query in enumerate(TEST_QUERIES):
        print(f"\n[{i+1}/{len(TEST_QUERIES)}] Query: '{query}'")
        print("-" * 70)

        # Test Cheshire Cat
        cat_result = test_cat_query(query)
        if cat_result["success"]:
            cat_times.append(cat_result["latency_ms"])
            print(f" 🐱 Cheshire Cat: {cat_result['latency_ms']:.0f}ms")
            print(f" Response: {cat_result['response'][:80]}...")
        else:
            print(f" 🐱 Cheshire Cat: āŒ {cat_result.get('error', 'Failed')}")

        # Small delay between tests
        await asyncio.sleep(1)

        # Test current system
        if HAS_BOT_CODE:
            current_result = await test_current_query(query)
            if current_result["success"]:
                current_times.append(current_result["latency_ms"])
                print(f" šŸ“¦ Current System: {current_result['latency_ms']:.0f}ms")
                print(f" Response: {current_result['response'][:80]}...")
            else:
                print(f" šŸ“¦ Current System: āŒ {current_result.get('error', 'Failed')}")
            await asyncio.sleep(1)

    # Print comparison statistics
    print("\n" + "=" * 70)
    print("šŸ“Š COMPARISON RESULTS")
    print("=" * 70)

    if cat_times:
        print(f"\n🐱 Cheshire Cat:")
        print(f" Mean latency: {statistics.mean(cat_times):.0f} ms")
        print(f" Median latency: {statistics.median(cat_times):.0f} ms")
        print(f" Min latency: {min(cat_times):.0f} ms")
        print(f" Max latency: {max(cat_times):.0f} ms")
        print(f" Success rate: {len(cat_times)}/{len(TEST_QUERIES)} ({len(cat_times)/len(TEST_QUERIES)*100:.0f}%)")

    if current_times:
        print(f"\nšŸ“¦ Current System:")
        print(f" Mean latency: {statistics.mean(current_times):.0f} ms")
        print(f" Median latency: {statistics.median(current_times):.0f} ms")
        print(f" Min latency: {min(current_times):.0f} ms")
        print(f" Max latency: {max(current_times):.0f} ms")
        print(f" Success rate: {len(current_times)}/{len(TEST_QUERIES)} ({len(current_times)/len(TEST_QUERIES)*100:.0f}%)")

    if cat_times and current_times:
        print(f"\nāš–ļø Comparison:")
        cat_mean = statistics.mean(cat_times)
        current_mean = statistics.mean(current_times)
        diff = cat_mean - current_mean
        diff_pct = (diff / current_mean) * 100
        if diff > 0:
            print(f" Cheshire Cat is {diff:.0f}ms SLOWER ({diff_pct:+.1f}%)")
        else:
            print(f" Cheshire Cat is {abs(diff):.0f}ms FASTER ({diff_pct:+.1f}%)")

        # Voice chat assessment.
        # NOTE(review): the first branch judges "both systems" on cat_mean
        # alone — presumably intentional since the Cat is the slower path;
        # confirm the thresholds (1500/2000 ms) match real-time requirements.
        print(f"\nšŸŽ¤ Voice Chat Viability:")
        if cat_mean < 1500:
            print(f" āœ… Both systems suitable for voice chat")
        elif cat_mean < 2000 and current_mean < 1500:
            print(f" āš ļø Cheshire Cat slower but still usable")
        else:
            print(f" āŒ Cheshire Cat may be too slow for real-time voice")

    print("\n" + "=" * 70)


def main():
    """Entry point: warn when running Cat-only, then run the async benchmark."""
    if not HAS_BOT_CODE:
        print("\nāš ļø Running in Cat-only mode (bot code not available)")
        print(" To run full comparison:")
        print(" 1. Make sure you're running this from the cheshire-cat directory")
        print(" 2. Ensure the parent 'bot' directory is accessible\n")
    asyncio.run(run_comparison())


if __name__ == "__main__":
    main()