Files
miku-discord/cheshire-cat/benchmark_cat.py
koko210Serve ae1e0aa144 add: cheshire-cat configuration, tooling, tests, and documentation
Configuration:
- .env.example, .gitignore, compose.yml (main docker compose)
- docker-compose-amd.yml (ROCm), docker-compose-macos.yml
- start.sh, stop.sh convenience scripts
- LICENSE (Apache 2.0, from upstream Cheshire Cat)

Memory management utilities:
- analyze_consolidation.py, manual_consolidation.py, verify_consolidation.py
- check_memories.py, extract_declarative_facts.py, store_declarative_facts.py
- compare_systems.py (system comparison tool)
- benchmark_cat.py, streaming_benchmark.py, streaming_benchmark_v2.py

Test suite:
- quick_test.py, test_setup.py, test_setup_simple.py
- test_consolidation_direct.py, test_declarative_recall.py, test_recall.py
- test_end_to_end.py, test_full_pipeline.py
- test_phase2.py, test_phase2_comprehensive.py

Documentation:
- README.md, QUICK_START.txt, TEST_README.md, SETUP_COMPLETE.md
- PHASE2_IMPLEMENTATION_NOTES.md, PHASE2_TEST_RESULTS.md
- POST_OPTIMIZATION_ANALYSIS.md
2026-03-04 00:51:14 +02:00

394 lines
14 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Comprehensive Cheshire Cat Performance Benchmark
Tests latency, overhead, and performance under various conditions
"""
import requests
import time
import json
import statistics
from datetime import datetime
from typing import List, Dict
import sys
CAT_URL = "http://localhost:1865"
# Test queries of varying complexity
TEST_QUERIES = {
"simple_greeting": [
"Hello!",
"Hi Miku!",
"Hey there!",
"Good morning!",
"What's up?"
],
"factual_short": [
"What is your favorite food?",
"How old are you?",
"What color is your hair?",
"Where are you from?",
"What's your name?"
],
"factual_medium": [
"Tell me about your friends Rin and Len",
"What is the song World is Mine about?",
"Who created you?",
"What kind of music do you sing?",
"What do you like to do for fun?"
],
"complex_memory": [
"What did we talk about earlier?",
"Can you remember what I asked you before?",
"Tell me everything you know about green onions",
"What are all your most iconic songs?",
"Describe your personality and how you act"
],
"conversation_flow": [
"I love your music!",
"What's your favorite song to perform?",
"Do you ever get nervous on stage?",
"That's interesting! Tell me more.",
"Thanks for chatting with me!"
]
}
class PerformanceResults:
def __init__(self):
self.query_times: List[float] = []
self.response_sizes: List[int] = []
self.errors: List[str] = []
self.category_stats: Dict[str, List[float]] = {}
def add_result(self, latency_ms: float, response_size: int, category: str):
self.query_times.append(latency_ms)
self.response_sizes.append(response_size)
if category not in self.category_stats:
self.category_stats[category] = []
self.category_stats[category].append(latency_ms)
def add_error(self, error: str):
self.errors.append(error)
def get_stats(self):
if not self.query_times:
return None
return {
"total_queries": len(self.query_times),
"total_errors": len(self.errors),
"success_rate": (len(self.query_times) / (len(self.query_times) + len(self.errors))) * 100,
"latency": {
"min_ms": min(self.query_times),
"max_ms": max(self.query_times),
"mean_ms": statistics.mean(self.query_times),
"median_ms": statistics.median(self.query_times),
"stdev_ms": statistics.stdev(self.query_times) if len(self.query_times) > 1 else 0,
"p95_ms": self._percentile(self.query_times, 95),
"p99_ms": self._percentile(self.query_times, 99)
},
"response_sizes": {
"min_bytes": min(self.response_sizes),
"max_bytes": max(self.response_sizes),
"mean_bytes": statistics.mean(self.response_sizes),
},
"by_category": {
category: {
"mean_ms": statistics.mean(times),
"median_ms": statistics.median(times),
"min_ms": min(times),
"max_ms": max(times)
}
for category, times in self.category_stats.items()
}
}
@staticmethod
def _percentile(data, percentile):
size = len(data)
sorted_data = sorted(data)
index = (percentile / 100) * size
if index.is_integer():
return sorted_data[int(index) - 1]
else:
return sorted_data[int(index)]
def test_single_query(query: str, category: str, timeout: int = 60, warmup: bool = False) -> Dict:
"""Test a single query and measure performance
Args:
query: The query text to send
category: Category for grouping results
timeout: Request timeout in seconds (60s for model loading)
warmup: If True, don't count in results (for model loading)
"""
start_time = time.time()
try:
response = requests.post(
f"{CAT_URL}/message",
json={"text": query},
headers={"Content-Type": "application/json"},
timeout=timeout
)
latency_ms = (time.time() - start_time) * 1000
if response.status_code == 200:
data = response.json()
content = data.get("content", "")
# Filter out tool calls that might still appear
if content and not (content.startswith('{"name":') or content.startswith('{')):
return {
"success": True,
"latency_ms": latency_ms,
"response_size": len(content),
"response": content,
"category": category,
"warmup": warmup
}
else:
return {
"success": False,
"latency_ms": latency_ms,
"error": "Got tool call instead of text response",
"category": category,
"warmup": warmup
}
else:
return {
"success": False,
"latency_ms": latency_ms,
"error": f"HTTP {response.status_code}",
"category": category,
"warmup": warmup
}
except Exception as e:
latency_ms = (time.time() - start_time) * 1000
return {
"success": False,
"latency_ms": latency_ms,
"error": str(e),
"category": category,
"warmup": warmup
}
def run_benchmark_suite(iterations: int = 3, verbose: bool = True) -> PerformanceResults:
"""Run complete benchmark suite"""
results = PerformanceResults()
total_queries = sum(len(queries) for queries in TEST_QUERIES.values()) * iterations
current_query = 0
print(f"\n🏁 Starting benchmark suite: {total_queries} total queries")
print("=" * 60)
# Warmup query to load the model
print("\n🔥 Warming up model (loading darkidol, may take 30-45s)...")
warmup_result = test_single_query("Hi!", "warmup", timeout=60, warmup=True)
if warmup_result["success"]:
print(f" ✅ Model loaded in {warmup_result['latency_ms']:.0f}ms")
else:
print(f" ⚠️ Warmup issue: {warmup_result.get('error', 'unknown')}")
print(" Continuing anyway...")
time.sleep(2) # Brief pause after warmup
for iteration in range(iterations):
print(f"\n📊 Iteration {iteration + 1}/{iterations}")
for category, queries in TEST_QUERIES.items():
print(f"\n Category: {category}")
for query in queries:
current_query += 1
if verbose:
print(f" [{current_query}/{total_queries}] Testing: '{query[:40]}...'")
result = test_single_query(query, category, timeout=60)
if result["success"] and not result.get("warmup", False):
results.add_result(
result["latency_ms"],
result["response_size"],
category
)
if verbose:
print(f"{result['latency_ms']:.0f}ms - {result['response_size']} bytes")
print(f" Response: {result['response'][:60]}...")
elif not result.get("warmup", False):
results.add_error(result["error"])
if verbose:
print(f" ❌ Error: {result['error']}")
# Small delay between queries to avoid overwhelming the system
time.sleep(1)
return results
def test_voice_chat_simulation(duration_seconds: int = 60) -> Dict:
"""Simulate voice chat workload (rapid-fire queries)"""
print(f"\n🎤 Simulating voice chat for {duration_seconds}s")
print(" (Rapid-fire queries to test real-time performance)")
print("=" * 60)
voice_queries = [
"Hello!",
"How are you?",
"Tell me a joke",
"What's your favorite song?",
"That's cool!",
"Can you sing?",
"I like you!",
"What should we do?",
"Tell me more",
"Goodbye!"
]
results = PerformanceResults()
start_time = time.time()
query_index = 0
while (time.time() - start_time) < duration_seconds:
query = voice_queries[query_index % len(voice_queries)]
result = test_single_query(query, "voice_chat", timeout=30) # Increased timeout
if result["success"]:
results.add_result(
result["latency_ms"],
result["response_size"],
"voice_chat"
)
status = "" if result["latency_ms"] < 2000 else "⚠️"
print(f" {status} Query {query_index + 1}: {result['latency_ms']:.0f}ms")
else:
results.add_error(result["error"])
print(f" ❌ Query {query_index + 1}: Error - {result.get('error', 'unknown')}")
query_index += 1
time.sleep(2) # Increased delay between queries
elapsed = time.time() - start_time
print(f"\n Completed {query_index} queries in {elapsed:.1f}s")
return results.get_stats()
def print_report(results: PerformanceResults):
"""Print detailed performance report"""
stats = results.get_stats()
if not stats:
print("\n❌ No successful queries to report")
return
print("\n" + "=" * 60)
print("📊 PERFORMANCE REPORT")
print("=" * 60)
# Overall Statistics
print(f"\n📈 Overall Statistics:")
print(f" Total Queries: {stats['total_queries']}")
print(f" Total Errors: {stats['total_errors']}")
print(f" Success Rate: {stats['success_rate']:.1f}%")
# Latency Statistics
lat = stats['latency']
print(f"\n⏱️ Latency Statistics:")
print(f" Mean: {lat['mean_ms']:.0f} ms")
print(f" Median: {lat['median_ms']:.0f} ms")
print(f" Min: {lat['min_ms']:.0f} ms")
print(f" Max: {lat['max_ms']:.0f} ms")
print(f" Std Dev: {lat['stdev_ms']:.0f} ms")
print(f" 95th Percentile: {lat['p95_ms']:.0f} ms")
print(f" 99th Percentile: {lat['p99_ms']:.0f} ms")
# Voice Chat Assessment
print(f"\n🎤 Voice Chat Viability:")
if lat['mean_ms'] < 1000:
print(f" ✅ EXCELLENT - Mean latency under 1s")
elif lat['mean_ms'] < 1500:
print(f" ✅ GOOD - Mean latency acceptable for voice")
elif lat['mean_ms'] < 2000:
print(f" ⚠️ BORDERLINE - Noticeable lag in voice chat")
else:
print(f" ❌ TOO SLOW - Not suitable for real-time voice")
if lat['p95_ms'] > 2000:
print(f" ⚠️ WARNING: 5% of queries exceed 2s (P95: {lat['p95_ms']:.0f}ms)")
# Category Breakdown
print(f"\n📋 Performance by Category:")
for category, cat_stats in stats['by_category'].items():
print(f"\n {category}:")
print(f" Mean: {cat_stats['mean_ms']:.0f} ms")
print(f" Median: {cat_stats['median_ms']:.0f} ms")
print(f" Range: {cat_stats['min_ms']:.0f}-{cat_stats['max_ms']:.0f} ms")
# Response Size Statistics
size = stats['response_sizes']
print(f"\n📦 Response Sizes:")
print(f" Mean: {size['mean_bytes']:.0f} bytes")
print(f" Range: {size['min_bytes']}-{size['max_bytes']} bytes")
print("\n" + "=" * 60)
def save_results(results: PerformanceResults, filename: str = None):
"""Save results to JSON file"""
if filename is None:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"benchmark_results_{timestamp}.json"
stats = results.get_stats()
with open(filename, 'w') as f:
json.dump(stats, f, indent=2)
print(f"\n💾 Results saved to: {filename}")
def main():
print("=" * 60)
print("🐱 Cheshire Cat Performance Benchmark")
print("=" * 60)
# Check if Cat is available
try:
response = requests.get(f"{CAT_URL}/", timeout=5)
if response.status_code != 200:
print(f"\n❌ Cat not responding (status {response.status_code})")
print(" Make sure Cat is running: docker-compose -f docker-compose.test.yml up -d")
sys.exit(1)
except Exception as e:
print(f"\n❌ Cannot connect to Cat: {e}")
print(" Make sure Cat is running: docker-compose -f docker-compose.test.yml up -d")
sys.exit(1)
print("\n✅ Cat is available\n")
# Run benchmark suite
print("Starting comprehensive benchmark...")
print("This will take several minutes...\n")
results = run_benchmark_suite(iterations=2, verbose=True)
# Print report
print_report(results)
# Voice chat simulation
print("\n" + "=" * 60)
voice_results = test_voice_chat_simulation(duration_seconds=30)
if voice_results:
print("\n🎤 Voice Chat Simulation Results:")
lat = voice_results['latency']
print(f" Mean latency: {lat['mean_ms']:.0f} ms")
print(f" Median latency: {lat['median_ms']:.0f} ms")
print(f" 95th percentile: {lat['p95_ms']:.0f} ms")
print(f" Success rate: {voice_results['success_rate']:.1f}%")
# Save results
save_results(results)
print("\n✅ Benchmark complete!")
if __name__ == "__main__":
main()