Configuration:
- .env.example, .gitignore, compose.yml (main docker compose)
- docker-compose-amd.yml (ROCm), docker-compose-macos.yml
- start.sh, stop.sh convenience scripts
- LICENSE (Apache 2.0, from upstream Cheshire Cat)

Memory management utilities:
- analyze_consolidation.py, manual_consolidation.py, verify_consolidation.py
- check_memories.py, extract_declarative_facts.py, store_declarative_facts.py
- compare_systems.py (system comparison tool)
- benchmark_cat.py, streaming_benchmark.py, streaming_benchmark_v2.py

Test suite:
- quick_test.py, test_setup.py, test_setup_simple.py
- test_consolidation_direct.py, test_declarative_recall.py, test_recall.py
- test_end_to_end.py, test_full_pipeline.py
- test_phase2.py, test_phase2_comprehensive.py

Documentation:
- README.md, QUICK_START.txt, TEST_README.md, SETUP_COMPLETE.md
- PHASE2_IMPLEMENTATION_NOTES.md, PHASE2_TEST_RESULTS.md
- POST_OPTIMIZATION_ANALYSIS.md
213 lines
7.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Comparison Benchmark: Current System vs Cheshire Cat
|
|
Measures the difference in performance between the two approaches
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
|
|
import requests
|
|
import time
|
|
import statistics
|
|
from typing import List, Dict
|
|
import asyncio
|
|
|
|
CAT_URL = "http://localhost:1865"
|
|
|
|
# Import your current LLM function
|
|
try:
|
|
from bot.utils import llm
|
|
from bot import globals as bot_globals
|
|
HAS_BOT_CODE = True
|
|
except ImportError:
|
|
print("⚠️ Could not import bot code - will skip direct comparison")
|
|
HAS_BOT_CODE = False
|
|
|
|
# Ten fixed prompts (persona questions, lore, small talk) sent verbatim to
# both systems so the latency comparison uses identical inputs.
TEST_QUERIES = [
    "What is your favorite food?",
    "Tell me about your friends",
    "What's the song World is Mine about?",
    "Hello Miku!",
    "Do you like to sing?",
    "Who created you?",
    "What color is your hair?",
    "Tell me about green onions",
    "What do you do for fun?",
    "Are you a Vocaloid?",
]
|
|
|
|
def test_cat_query(query: str, timeout: int = 60) -> Dict:
|
|
"""Test query using Cheshire Cat"""
|
|
start_time = time.time()
|
|
|
|
try:
|
|
response = requests.post(
|
|
f"{CAT_URL}/message",
|
|
json={"text": query},
|
|
headers={"Content-Type": "application/json"},
|
|
timeout=timeout
|
|
)
|
|
|
|
latency_ms = (time.time() - start_time) * 1000
|
|
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
content = data.get("content", "")
|
|
|
|
# Filter out tool calls
|
|
if content and not (content.startswith('{"name":') or content.startswith('{')):
|
|
return {
|
|
"success": True,
|
|
"latency_ms": latency_ms,
|
|
"response": content,
|
|
"method": "cheshire_cat"
|
|
}
|
|
else:
|
|
return {
|
|
"success": False,
|
|
"latency_ms": latency_ms,
|
|
"error": "Got tool call instead of text",
|
|
"method": "cheshire_cat"
|
|
}
|
|
else:
|
|
return {
|
|
"success": False,
|
|
"latency_ms": latency_ms,
|
|
"error": f"HTTP {response.status_code}",
|
|
"method": "cheshire_cat"
|
|
}
|
|
except Exception as e:
|
|
return {
|
|
"success": False,
|
|
"latency_ms": (time.time() - start_time) * 1000,
|
|
"error": str(e),
|
|
"method": "cheshire_cat"
|
|
}
|
|
|
|
async def test_current_query(query: str) -> Dict:
    """Run *query* through the existing Miku bot LLM path and time it.

    Returns a result dict shaped like test_cat_query()'s, tagged with
    method="current".  When the bot package failed to import at module
    load, a failure record is returned immediately without timing.
    """
    if not HAS_BOT_CODE:
        return {"success": False, "error": "Bot code not available", "method": "current"}

    started = time.time()

    try:
        # Reuse the bot's own query path so timings reflect production code.
        answer = await llm.query_llama(
            user_prompt=query,
            user_id="benchmark_test",
            guild_id=None,
            response_type="dm_response",
        )
        elapsed_ms = (time.time() - started) * 1000
    except Exception as exc:
        return {
            "success": False,
            "latency_ms": (time.time() - started) * 1000,
            "error": str(exc),
            "method": "current",
        }

    return {
        "success": True,
        "latency_ms": elapsed_ms,
        "response": answer,
        "method": "current",
    }
|
|
|
|
def _print_system_stats(label: str, times: List[float]) -> None:
    """Print mean/median/min/max latency and success rate for one system."""
    print(f"\n{label}:")
    print(f" Mean latency: {statistics.mean(times):.0f} ms")
    print(f" Median latency: {statistics.median(times):.0f} ms")
    print(f" Min latency: {min(times):.0f} ms")
    print(f" Max latency: {max(times):.0f} ms")
    print(f" Success rate: {len(times)}/{len(TEST_QUERIES)} ({len(times)/len(TEST_QUERIES)*100:.0f}%)")


async def run_comparison():
    """Run every TEST_QUERIES prompt through both systems and print a report.

    For each query the Cheshire Cat endpoint is timed first, then (when the
    bot package imported successfully) the current in-process system.  A
    one-second pause follows each request so neither backend is hammered.
    Successful latencies are accumulated and summarized at the end, with a
    head-to-head mean comparison and a voice-chat viability verdict.
    """
    print("=" * 70)
    print("⚖️ COMPARISON: Current System vs Cheshire Cat")
    print("=" * 70)

    cat_times: List[float] = []
    current_times: List[float] = []

    for i, query in enumerate(TEST_QUERIES):
        print(f"\n[{i+1}/{len(TEST_QUERIES)}] Query: '{query}'")
        print("-" * 70)

        # Test Cheshire Cat
        cat_result = test_cat_query(query)
        if cat_result["success"]:
            cat_times.append(cat_result["latency_ms"])
            print(f" 🐱 Cheshire Cat: {cat_result['latency_ms']:.0f}ms")
            print(f" Response: {cat_result['response'][:80]}...")
        else:
            print(f" 🐱 Cheshire Cat: ❌ {cat_result.get('error', 'Failed')}")

        # Small delay between tests
        await asyncio.sleep(1)

        # Test current system (skipped entirely when the bot code is absent)
        if HAS_BOT_CODE:
            current_result = await test_current_query(query)
            if current_result["success"]:
                current_times.append(current_result["latency_ms"])
                print(f" 📦 Current System: {current_result['latency_ms']:.0f}ms")
                print(f" Response: {current_result['response'][:80]}...")
            else:
                print(f" 📦 Current System: ❌ {current_result.get('error', 'Failed')}")

            await asyncio.sleep(1)

    # Print comparison statistics
    print("\n" + "=" * 70)
    print("📊 COMPARISON RESULTS")
    print("=" * 70)

    # Duplicated per-system stat printing factored into one helper.
    if cat_times:
        _print_system_stats("🐱 Cheshire Cat", cat_times)
    if current_times:
        _print_system_stats("📦 Current System", current_times)

    if cat_times and current_times:
        print("\n⚖️ Comparison:")
        cat_mean = statistics.mean(cat_times)
        current_mean = statistics.mean(current_times)
        diff = cat_mean - current_mean
        diff_pct = (diff / current_mean) * 100

        if diff > 0:
            print(f" Cheshire Cat is {diff:.0f}ms SLOWER ({diff_pct:+.1f}%)")
        elif diff < 0:
            print(f" Cheshire Cat is {abs(diff):.0f}ms FASTER ({diff_pct:+.1f}%)")
        else:
            # Fix: the original printed "0ms FASTER" when the means were equal.
            print(" Cheshire Cat and the current system have equal mean latency")

        # Voice chat assessment — thresholds are heuristics for real-time use
        print("\n🎤 Voice Chat Viability:")
        if cat_mean < 1500:
            print(" ✅ Both systems suitable for voice chat")
        elif cat_mean < 2000 and current_mean < 1500:
            print(" ⚠️ Cheshire Cat slower but still usable")
        else:
            print(" ❌ Cheshire Cat may be too slow for real-time voice")

    print("\n" + "=" * 70)
|
|
|
|
def main():
    """CLI entry point: warn when running Cat-only, then drive the benchmark."""
    if not HAS_BOT_CODE:
        for line in (
            "\n⚠️ Running in Cat-only mode (bot code not available)",
            " To run full comparison:",
            " 1. Make sure you're running this from the cheshire-cat directory",
            " 2. Ensure the parent 'bot' directory is accessible\n",
        ):
            print(line)

    asyncio.run(run_comparison())


if __name__ == "__main__":
    main()
|