213 lines
7.1 KiB
Python
213 lines
7.1 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Comparison Benchmark: Current System vs Cheshire Cat
|
||
|
|
Measures the difference in performance between the two approaches
|
||
|
|
"""
|
||
|
|
|
||
|
|
import sys
|
||
|
|
import os
|
||
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||
|
|
|
||
|
|
import requests
|
||
|
|
import time
|
||
|
|
import statistics
|
||
|
|
from typing import List, Dict
|
||
|
|
import asyncio
|
||
|
|
|
||
|
|
# Base URL of the locally running Cheshire Cat server.
CAT_URL = "http://localhost:1865"

# Import your current LLM function
try:
    from bot.utils import llm
    from bot import globals as bot_globals
    # True when the existing bot package is importable; gates the
    # "current system" half of the comparison.
    HAS_BOT_CODE = True
except ImportError:
    # Benchmark still runs in Cat-only mode when the bot package is absent.
    print("⚠️ Could not import bot code - will skip direct comparison")
    HAS_BOT_CODE = False

# Queries sent to both systems; a mix of persona, lore, and small-talk
# prompts so latency is measured over varied request types.
TEST_QUERIES = [
    "What is your favorite food?",
    "Tell me about your friends",
    "What's the song World is Mine about?",
    "Hello Miku!",
    "Do you like to sing?",
    "Who created you?",
    "What color is your hair?",
    "Tell me about green onions",
    "What do you do for fun?",
    "Are you a Vocaloid?"
]
|
||
|
|
|
||
|
|
def test_cat_query(query: str, timeout: int = 60) -> Dict:
    """Send *query* to the Cheshire Cat HTTP endpoint and time the round trip.

    Args:
        query: The user message to send.
        timeout: Request timeout in seconds.

    Returns:
        A dict with keys ``success``, ``latency_ms``, ``method`` and either
        ``response`` (on success) or ``error`` (on failure).
    """
    start_time = time.time()

    try:
        response = requests.post(
            f"{CAT_URL}/message",
            json={"text": query},
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )

        latency_ms = (time.time() - start_time) * 1000

        if response.status_code == 200:
            data = response.json()
            content = data.get("content", "")

            # Filter out tool calls: the Cat sometimes answers with a raw JSON
            # tool invocation instead of conversational text.  Any JSON object
            # starts with '{', so a single startswith check suffices (the old
            # extra '{"name":' check was redundant).
            if content and not content.startswith('{'):
                return {
                    "success": True,
                    "latency_ms": latency_ms,
                    "response": content,
                    "method": "cheshire_cat",
                }
            return {
                "success": False,
                "latency_ms": latency_ms,
                "error": "Got tool call instead of text",
                "method": "cheshire_cat",
            }
        return {
            "success": False,
            "latency_ms": latency_ms,
            "error": f"HTTP {response.status_code}",
            "method": "cheshire_cat",
        }
    except Exception as e:
        # Network errors, timeouts, malformed JSON -- report, don't crash the run.
        return {
            "success": False,
            "latency_ms": (time.time() - start_time) * 1000,
            "error": str(e),
            "method": "cheshire_cat",
        }
|
||
|
|
|
||
|
|
async def test_current_query(query: str) -> Dict:
    """Benchmark a single query against the existing Miku bot pipeline.

    Returns a result dict shaped like :func:`test_cat_query`'s, with
    ``method`` set to ``"current"``.
    """
    # Guard: without the bot package there is nothing to benchmark.
    if not HAS_BOT_CODE:
        return {"success": False, "error": "Bot code not available", "method": "current"}

    started = time.time()

    try:
        # Route through the bot's existing query_llama entry point.
        answer = await llm.query_llama(
            user_prompt=query,
            user_id="benchmark_test",
            guild_id=None,
            response_type="dm_response"
        )
    except Exception as exc:
        return {
            "success": False,
            "latency_ms": (time.time() - started) * 1000,
            "error": str(exc),
            "method": "current",
        }

    return {
        "success": True,
        "latency_ms": (time.time() - started) * 1000,
        "response": answer,
        "method": "current",
    }
|
||
|
|
|
||
|
|
def _print_system_stats(label: str, times: List[float]) -> None:
    """Print the latency summary (mean/median/min/max/success rate) for one system."""
    print(f"\n{label}:")
    print(f" Mean latency: {statistics.mean(times):.0f} ms")
    print(f" Median latency: {statistics.median(times):.0f} ms")
    print(f" Min latency: {min(times):.0f} ms")
    print(f" Max latency: {max(times):.0f} ms")
    print(f" Success rate: {len(times)}/{len(TEST_QUERIES)} ({len(times)/len(TEST_QUERIES)*100:.0f}%)")


async def run_comparison():
    """Run every TEST_QUERIES prompt through both systems and print a comparison.

    Collects per-query latencies for the Cheshire Cat and (when available)
    the current bot system, then prints summary statistics and a voice-chat
    viability verdict.
    """
    print("=" * 70)
    print("⚖️ COMPARISON: Current System vs Cheshire Cat")
    print("=" * 70)

    cat_times: List[float] = []
    current_times: List[float] = []

    for i, query in enumerate(TEST_QUERIES):
        print(f"\n[{i+1}/{len(TEST_QUERIES)}] Query: '{query}'")
        print("-" * 70)

        # Test Cheshire Cat
        cat_result = test_cat_query(query)
        if cat_result["success"]:
            cat_times.append(cat_result["latency_ms"])
            print(f" 🐱 Cheshire Cat: {cat_result['latency_ms']:.0f}ms")
            print(f" Response: {cat_result['response'][:80]}...")
        else:
            print(f" 🐱 Cheshire Cat: ❌ {cat_result.get('error', 'Failed')}")

        # Small delay between tests so the services aren't hammered back-to-back
        await asyncio.sleep(1)

        # Test current system (only when the bot code imported successfully)
        if HAS_BOT_CODE:
            current_result = await test_current_query(query)
            if current_result["success"]:
                current_times.append(current_result["latency_ms"])
                print(f" 📦 Current System: {current_result['latency_ms']:.0f}ms")
                print(f" Response: {current_result['response'][:80]}...")
            else:
                print(f" 📦 Current System: ❌ {current_result.get('error', 'Failed')}")

            await asyncio.sleep(1)

    # Print comparison statistics (previously two duplicated blocks; now one helper)
    print("\n" + "=" * 70)
    print("📊 COMPARISON RESULTS")
    print("=" * 70)

    if cat_times:
        _print_system_stats("🐱 Cheshire Cat", cat_times)

    if current_times:
        _print_system_stats("📦 Current System", current_times)

    if cat_times and current_times:
        print(f"\n⚖️ Comparison:")
        cat_mean = statistics.mean(cat_times)
        current_mean = statistics.mean(current_times)
        diff = cat_mean - current_mean
        diff_pct = (diff / current_mean) * 100

        if diff > 0:
            print(f" Cheshire Cat is {diff:.0f}ms SLOWER ({diff_pct:+.1f}%)")
        else:
            print(f" Cheshire Cat is {abs(diff):.0f}ms FASTER ({diff_pct:+.1f}%)")

        # Voice chat assessment: ~1.5s mean latency is treated as the usability bar
        print(f"\n🎤 Voice Chat Viability:")
        if cat_mean < 1500:
            print(f" ✅ Both systems suitable for voice chat")
        elif cat_mean < 2000 and current_mean < 1500:
            print(f" ⚠️ Cheshire Cat slower but still usable")
        else:
            print(f" ❌ Cheshire Cat may be too slow for real-time voice")

    print("\n" + "=" * 70)
|
||
|
|
|
||
|
|
def main():
    """Script entry point: warn when the bot code is missing, then benchmark."""
    if not HAS_BOT_CODE:
        # Cat-only mode: explain how to enable the full two-system comparison.
        for line in (
            "\n⚠️ Running in Cat-only mode (bot code not available)",
            " To run full comparison:",
            " 1. Make sure you're running this from the cheshire-cat directory",
            " 2. Ensure the parent 'bot' directory is accessible\n",
        ):
            print(line)

    asyncio.run(run_comparison())


if __name__ == "__main__":
    main()
|