394 lines
14 KiB
Python
394 lines
14 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Comprehensive Cheshire Cat Performance Benchmark
|
||
|
|
Tests latency, overhead, and performance under various conditions
|
||
|
|
"""
|
||
|
|
|
||
|
|
import requests
|
||
|
|
import time
|
||
|
|
import json
|
||
|
|
import statistics
|
||
|
|
from datetime import datetime
|
||
|
|
from typing import List, Dict
|
||
|
|
import sys
|
||
|
|
|
||
|
|
# Base URL of the Cheshire Cat HTTP API under test.
CAT_URL = "http://localhost:1865"

# Test queries of varying complexity
# Each key is a benchmark category; its list holds the prompts sent for that
# category. Latencies are aggregated per category by PerformanceResults.
TEST_QUERIES: Dict[str, List[str]] = {
    # Trivial one-line openers — expected to be the fastest responses.
    "simple_greeting": [
        "Hello!",
        "Hi Miku!",
        "Hey there!",
        "Good morning!",
        "What's up?"
    ],
    # Short factual questions about the persona.
    "factual_short": [
        "What is your favorite food?",
        "How old are you?",
        "What color is your hair?",
        "Where are you from?",
        "What's your name?"
    ],
    # Medium-length factual questions that need more context.
    "factual_medium": [
        "Tell me about your friends Rin and Len",
        "What is the song World is Mine about?",
        "Who created you?",
        "What kind of music do you sing?",
        "What do you like to do for fun?"
    ],
    # Queries that exercise conversational/episodic memory recall.
    "complex_memory": [
        "What did we talk about earlier?",
        "Can you remember what I asked you before?",
        "Tell me everything you know about green onions",
        "What are all your most iconic songs?",
        "Describe your personality and how you act"
    ],
    # A natural back-and-forth conversation arc.
    "conversation_flow": [
        "I love your music!",
        "What's your favorite song to perform?",
        "Do you ever get nervous on stage?",
        "That's interesting! Tell me more.",
        "Thanks for chatting with me!"
    ]
}
|
||
|
|
|
||
|
|
class PerformanceResults:
    """Accumulates per-query benchmark measurements and derives summary stats."""

    def __init__(self):
        # Latencies (ms) and response sizes for every successful query.
        self.query_times: List[float] = []
        self.response_sizes: List[int] = []
        # Error messages collected from failed queries.
        self.errors: List[str] = []
        # Latencies grouped by query category.
        self.category_stats: Dict[str, List[float]] = {}

    def add_result(self, latency_ms: float, response_size: int, category: str):
        """Record one successful query."""
        self.query_times.append(latency_ms)
        self.response_sizes.append(response_size)
        self.category_stats.setdefault(category, []).append(latency_ms)

    def add_error(self, error: str):
        """Record one failed query."""
        self.errors.append(error)

    def get_stats(self):
        """Return a nested summary dict, or None if nothing succeeded."""
        if not self.query_times:
            return None

        latencies = self.query_times
        n_ok = len(latencies)
        n_err = len(self.errors)

        per_category = {}
        for category, times in self.category_stats.items():
            per_category[category] = {
                "mean_ms": statistics.mean(times),
                "median_ms": statistics.median(times),
                "min_ms": min(times),
                "max_ms": max(times),
            }

        return {
            "total_queries": n_ok,
            "total_errors": n_err,
            "success_rate": (n_ok / (n_ok + n_err)) * 100,
            "latency": {
                "min_ms": min(latencies),
                "max_ms": max(latencies),
                "mean_ms": statistics.mean(latencies),
                "median_ms": statistics.median(latencies),
                # stdev needs at least two samples.
                "stdev_ms": statistics.stdev(latencies) if n_ok > 1 else 0,
                "p95_ms": self._percentile(latencies, 95),
                "p99_ms": self._percentile(latencies, 99),
            },
            "response_sizes": {
                "min_bytes": min(self.response_sizes),
                "max_bytes": max(self.response_sizes),
                "mean_bytes": statistics.mean(self.response_sizes),
            },
            "by_category": per_category,
        }

    @staticmethod
    def _percentile(data, percentile):
        """Nearest-rank percentile of *data* (data must be non-empty)."""
        ordered = sorted(data)
        rank = (percentile / 100) * len(ordered)
        if rank.is_integer():
            return ordered[int(rank) - 1]
        return ordered[int(rank)]
|
||
|
|
|
||
|
|
def test_single_query(query: str, category: str, timeout: int = 60, warmup: bool = False) -> Dict:
    """Send one message to the Cat and measure round-trip performance.

    Args:
        query: The query text to send
        category: Category for grouping results
        timeout: Request timeout in seconds (60s for model loading)
        warmup: If True, don't count in results (for model loading)

    Returns:
        A dict with "success", "latency_ms", "category" and "warmup", plus
        "response"/"response_size" on success or "error" on failure.
    """
    # perf_counter() is monotonic; time.time() can jump backwards/forwards on
    # clock adjustments and would corrupt the latency measurement.
    start_time = time.perf_counter()

    try:
        response = requests.post(
            f"{CAT_URL}/message",
            json={"text": query},
            headers={"Content-Type": "application/json"},
            timeout=timeout
        )

        latency_ms = (time.perf_counter() - start_time) * 1000

        if response.status_code == 200:
            data = response.json()
            content = data.get("content", "")

            # Filter out tool calls that might still appear: any JSON-looking
            # payload (starting with '{') is rejected. The previous extra
            # startswith('{"name":') check was redundant — subsumed by '{'.
            if content and not content.startswith('{'):
                return {
                    "success": True,
                    "latency_ms": latency_ms,
                    "response_size": len(content),
                    "response": content,
                    "category": category,
                    "warmup": warmup
                }
            else:
                return {
                    "success": False,
                    "latency_ms": latency_ms,
                    "error": "Got tool call instead of text response",
                    "category": category,
                    "warmup": warmup
                }
        else:
            return {
                "success": False,
                "latency_ms": latency_ms,
                "error": f"HTTP {response.status_code}",
                "category": category,
                "warmup": warmup
            }
    except Exception as e:
        # Timeouts and connection failures are reported, not raised, so the
        # benchmark loop can keep going.
        latency_ms = (time.perf_counter() - start_time) * 1000
        return {
            "success": False,
            "latency_ms": latency_ms,
            "error": str(e),
            "category": category,
            "warmup": warmup
        }
|
||
|
|
|
||
|
|
def run_benchmark_suite(iterations: int = 3, verbose: bool = True) -> PerformanceResults:
    """Run complete benchmark suite and return the collected results."""
    results = PerformanceResults()
    per_pass = sum(len(qs) for qs in TEST_QUERIES.values())
    total_queries = per_pass * iterations
    done = 0

    print(f"\n🏁 Starting benchmark suite: {total_queries} total queries")
    print("=" * 60)

    # One throwaway query first so model-loading time isn't counted in results.
    print("\n🔥 Warming up model (loading darkidol, may take 30-45s)...")
    warmup = test_single_query("Hi!", "warmup", timeout=60, warmup=True)
    if warmup["success"]:
        print(f" ✅ Model loaded in {warmup['latency_ms']:.0f}ms")
    else:
        print(f" ⚠️ Warmup issue: {warmup.get('error', 'unknown')}")
        print(" Continuing anyway...")

    time.sleep(2)  # Brief pause after warmup

    for iteration in range(iterations):
        print(f"\n📊 Iteration {iteration + 1}/{iterations}")

        for category, queries in TEST_QUERIES.items():
            print(f"\n Category: {category}")

            for query in queries:
                done += 1
                if verbose:
                    print(f" [{done}/{total_queries}] Testing: '{query[:40]}...'")

                outcome = test_single_query(query, category, timeout=60)

                if outcome["success"] and not outcome.get("warmup", False):
                    results.add_result(
                        outcome["latency_ms"],
                        outcome["response_size"],
                        category
                    )
                    if verbose:
                        print(f" ✅ {outcome['latency_ms']:.0f}ms - {outcome['response_size']} bytes")
                        print(f" Response: {outcome['response'][:60]}...")
                elif not outcome.get("warmup", False):
                    results.add_error(outcome["error"])
                    if verbose:
                        print(f" ❌ Error: {outcome['error']}")

                # Small delay between queries to avoid overwhelming the system
                time.sleep(1)

    return results
|
||
|
|
|
||
|
|
def test_voice_chat_simulation(duration_seconds: int = 60) -> Dict:
    """Simulate voice chat workload (rapid-fire queries)"""
    print(f"\n🎤 Simulating voice chat for {duration_seconds}s")
    print(" (Rapid-fire queries to test real-time performance)")
    print("=" * 60)

    voice_queries = [
        "Hello!",
        "How are you?",
        "Tell me a joke",
        "What's your favorite song?",
        "That's cool!",
        "Can you sing?",
        "I like you!",
        "What should we do?",
        "Tell me more",
        "Goodbye!"
    ]

    collected = PerformanceResults()
    started = time.time()
    count = 0

    # Cycle through the canned queries until the time budget is spent.
    while (time.time() - started) < duration_seconds:
        query = voice_queries[count % len(voice_queries)]
        outcome = test_single_query(query, "voice_chat", timeout=30)  # Increased timeout

        if outcome["success"]:
            collected.add_result(
                outcome["latency_ms"],
                outcome["response_size"],
                "voice_chat"
            )
            # Under 2s is considered acceptable for voice interaction.
            marker = "✅" if outcome["latency_ms"] < 2000 else "⚠️"
            print(f" {marker} Query {count + 1}: {outcome['latency_ms']:.0f}ms")
        else:
            collected.add_error(outcome["error"])
            print(f" ❌ Query {count + 1}: Error - {outcome.get('error', 'unknown')}")

        count += 1
        time.sleep(2)  # Increased delay between queries

    elapsed = time.time() - started
    print(f"\n Completed {count} queries in {elapsed:.1f}s")

    return collected.get_stats()
|
||
|
|
|
||
|
|
def print_report(results: PerformanceResults):
    """Print detailed performance report"""
    summary = results.get_stats()

    if not summary:
        print("\n❌ No successful queries to report")
        return

    print("\n" + "=" * 60)
    print("📊 PERFORMANCE REPORT")
    print("=" * 60)

    # Overall Statistics
    print(f"\n📈 Overall Statistics:")
    print(f" Total Queries: {summary['total_queries']}")
    print(f" Total Errors: {summary['total_errors']}")
    print(f" Success Rate: {summary['success_rate']:.1f}%")

    # Latency Statistics
    latency = summary['latency']
    print(f"\n⏱️ Latency Statistics:")
    for label, key in (
        ("Mean:", "mean_ms"),
        ("Median:", "median_ms"),
        ("Min:", "min_ms"),
        ("Max:", "max_ms"),
        ("Std Dev:", "stdev_ms"),
        ("95th Percentile:", "p95_ms"),
        ("99th Percentile:", "p99_ms"),
    ):
        print(f" {label} {latency[key]:.0f} ms")

    # Voice Chat Assessment — first threshold the mean falls under wins.
    print(f"\n🎤 Voice Chat Viability:")
    for limit, verdict in (
        (1000, " ✅ EXCELLENT - Mean latency under 1s"),
        (1500, " ✅ GOOD - Mean latency acceptable for voice"),
        (2000, " ⚠️ BORDERLINE - Noticeable lag in voice chat"),
    ):
        if latency['mean_ms'] < limit:
            print(verdict)
            break
    else:
        print(" ❌ TOO SLOW - Not suitable for real-time voice")

    if latency['p95_ms'] > 2000:
        print(f" ⚠️ WARNING: 5% of queries exceed 2s (P95: {latency['p95_ms']:.0f}ms)")

    # Category Breakdown
    print(f"\n📋 Performance by Category:")
    for category, cat_stats in summary['by_category'].items():
        print(f"\n {category}:")
        print(f" Mean: {cat_stats['mean_ms']:.0f} ms")
        print(f" Median: {cat_stats['median_ms']:.0f} ms")
        print(f" Range: {cat_stats['min_ms']:.0f}-{cat_stats['max_ms']:.0f} ms")

    # Response Size Statistics
    sizes = summary['response_sizes']
    print(f"\n📦 Response Sizes:")
    print(f" Mean: {sizes['mean_bytes']:.0f} bytes")
    print(f" Range: {sizes['min_bytes']}-{sizes['max_bytes']} bytes")

    print("\n" + "=" * 60)
|
||
|
|
|
||
|
|
def save_results(results: "PerformanceResults", filename: str = None):
    """Save benchmark statistics to a JSON file.

    Args:
        results: Collected benchmark results (only ``get_stats()`` is used).
        filename: Output path; defaults to a timestamped
            ``benchmark_results_YYYYMMDD_HHMMSS.json`` in the working dir.
    """
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"benchmark_results_{timestamp}.json"

    stats = results.get_stats()
    with open(filename, 'w') as f:
        json.dump(stats, f, indent=2)

    # Bug fix: the confirmation message previously printed a literal
    # placeholder instead of the actual output path.
    print(f"\n💾 Results saved to: {filename}")
|
||
|
|
|
||
|
|
def main():
    """Entry point: verify the Cat is reachable, run all benchmarks, report."""
    banner = "=" * 60
    print(banner)
    print("🐱 Cheshire Cat Performance Benchmark")
    print(banner)

    # Availability probe — bail out early if the Cat isn't reachable.
    try:
        probe = requests.get(f"{CAT_URL}/", timeout=5)
        if probe.status_code != 200:
            print(f"\n❌ Cat not responding (status {probe.status_code})")
            print(" Make sure Cat is running: docker-compose -f docker-compose.test.yml up -d")
            sys.exit(1)
    except Exception as e:
        # SystemExit is not an Exception, so the exit above is not swallowed.
        print(f"\n❌ Cannot connect to Cat: {e}")
        print(" Make sure Cat is running: docker-compose -f docker-compose.test.yml up -d")
        sys.exit(1)

    print("\n✅ Cat is available\n")

    # Run benchmark suite
    print("Starting comprehensive benchmark...")
    print("This will take several minutes...\n")

    results = run_benchmark_suite(iterations=2, verbose=True)

    # Print report
    print_report(results)

    # Voice chat simulation
    print("\n" + "=" * 60)
    voice_stats = test_voice_chat_simulation(duration_seconds=30)

    if voice_stats:
        print("\n🎤 Voice Chat Simulation Results:")
        vlat = voice_stats['latency']
        print(f" Mean latency: {vlat['mean_ms']:.0f} ms")
        print(f" Median latency: {vlat['median_ms']:.0f} ms")
        print(f" 95th percentile: {vlat['p95_ms']:.0f} ms")
        print(f" Success rate: {voice_stats['success_rate']:.1f}%")

    # Save results
    save_results(results)

    print("\n✅ Benchmark complete!")


if __name__ == "__main__":
    main()
|