#!/usr/bin/env python3
"""
Comprehensive Cheshire Cat Performance Benchmark
Tests latency, overhead, and performance under various conditions
"""
import requests
import time
import json
import statistics
from datetime import datetime
from typing import List, Dict, Optional
import sys

# Base URL of the locally running Cheshire Cat instance.
CAT_URL = "http://localhost:1865"

# Test queries of varying complexity, grouped by category so latency can be
# compared across workload types (greetings vs. memory-heavy questions, etc.).
TEST_QUERIES = {
    "simple_greeting": [
        "Hello!",
        "Hi Miku!",
        "Hey there!",
        "Good morning!",
        "What's up?"
    ],
    "factual_short": [
        "What is your favorite food?",
        "How old are you?",
        "What color is your hair?",
        "Where are you from?",
        "What's your name?"
    ],
    "factual_medium": [
        "Tell me about your friends Rin and Len",
        "What is the song World is Mine about?",
        "Who created you?",
        "What kind of music do you sing?",
        "What do you like to do for fun?"
    ],
    "complex_memory": [
        "What did we talk about earlier?",
        "Can you remember what I asked you before?",
        "Tell me everything you know about green onions",
        "What are all your most iconic songs?",
        "Describe your personality and how you act"
    ],
    "conversation_flow": [
        "I love your music!",
        "What's your favorite song to perform?",
        "Do you ever get nervous on stage?",
        "That's interesting! Tell me more.",
        "Thanks for chatting with me!"
    ]
}


class PerformanceResults:
    """Accumulates per-query latency/size samples and computes summary stats."""

    def __init__(self):
        # latency (ms) for every successful query, in arrival order
        self.query_times: List[float] = []
        # response payload size (bytes) for every successful query
        self.response_sizes: List[int] = []
        # error strings for failed queries
        self.errors: List[str] = []
        # latencies grouped by query category for the per-category breakdown
        self.category_stats: Dict[str, List[float]] = {}

    def add_result(self, latency_ms: float, response_size: int, category: str):
        """Record one successful query sample under *category*."""
        self.query_times.append(latency_ms)
        self.response_sizes.append(response_size)
        if category not in self.category_stats:
            self.category_stats[category] = []
        self.category_stats[category].append(latency_ms)

    def add_error(self, error: str):
        """Record one failed query."""
        self.errors.append(error)

    def get_stats(self) -> Optional[Dict]:
        """Return a nested dict of aggregate statistics, or None if no samples.

        Includes overall counts, latency distribution (min/max/mean/median/
        stdev/p95/p99), response-size summary, and a per-category breakdown.
        """
        if not self.query_times:
            return None
        return {
            "total_queries": len(self.query_times),
            "total_errors": len(self.errors),
            "success_rate": (len(self.query_times) / (len(self.query_times) + len(self.errors))) * 100,
            "latency": {
                "min_ms": min(self.query_times),
                "max_ms": max(self.query_times),
                "mean_ms": statistics.mean(self.query_times),
                "median_ms": statistics.median(self.query_times),
                # stdev needs >= 2 samples; report 0 for a single sample
                "stdev_ms": statistics.stdev(self.query_times) if len(self.query_times) > 1 else 0,
                "p95_ms": self._percentile(self.query_times, 95),
                "p99_ms": self._percentile(self.query_times, 99)
            },
            "response_sizes": {
                "min_bytes": min(self.response_sizes),
                "max_bytes": max(self.response_sizes),
                "mean_bytes": statistics.mean(self.response_sizes),
            },
            "by_category": {
                category: {
                    "mean_ms": statistics.mean(times),
                    "median_ms": statistics.median(times),
                    "min_ms": min(times),
                    "max_ms": max(times)
                }
                for category, times in self.category_stats.items()
            }
        }

    @staticmethod
    def _percentile(data: List[float], percentile: float) -> float:
        """Nearest-rank percentile of *data* (data need not be pre-sorted)."""
        size = len(data)
        sorted_data = sorted(data)
        index = (percentile / 100) * size
        if index.is_integer():
            # exact rank boundary: take the element at that rank (1-based)
            return sorted_data[int(index) - 1]
        else:
            return sorted_data[int(index)]


def test_single_query(query: str, category: str, timeout: int = 60,
                      warmup: bool = False) -> Dict:
    """Test a single query and measure performance

    Args:
        query: The query text to send
        category: Category for grouping results
        timeout: Request timeout in seconds (60s for model loading)
        warmup: If True, don't count in results (for model loading)

    Returns:
        A dict with "success", "latency_ms", "category", "warmup", and either
        "response"/"response_size" (on success) or "error" (on failure).
    """
    start_time = time.time()
    try:
        response = requests.post(
            f"{CAT_URL}/message",
            json={"text": query},
            headers={"Content-Type": "application/json"},
            timeout=timeout
        )
        latency_ms = (time.time() - start_time) * 1000
        if response.status_code == 200:
            data = response.json()
            content = data.get("content", "")
            # Filter out tool calls that might still appear
            # (NOTE: the second startswith('{') already subsumes the first;
            # kept as-is so any JSON-looking payload is rejected)
            if content and not (content.startswith('{"name":') or content.startswith('{')):
                return {
                    "success": True,
                    "latency_ms": latency_ms,
                    "response_size": len(content),
                    "response": content,
                    "category": category,
                    "warmup": warmup
                }
            else:
                return {
                    "success": False,
                    "latency_ms": latency_ms,
                    "error": "Got tool call instead of text response",
                    "category": category,
                    "warmup": warmup
                }
        else:
            return {
                "success": False,
                "latency_ms": latency_ms,
                "error": f"HTTP {response.status_code}",
                "category": category,
                "warmup": warmup
            }
    except Exception as e:
        # Network errors, timeouts, JSON decode failures all land here;
        # still report elapsed time so slow failures are visible.
        latency_ms = (time.time() - start_time) * 1000
        return {
            "success": False,
            "latency_ms": latency_ms,
            "error": str(e),
            "category": category,
            "warmup": warmup
        }


def run_benchmark_suite(iterations: int = 3, verbose: bool = True) -> PerformanceResults:
    """Run complete benchmark suite

    Sends every query in TEST_QUERIES *iterations* times (after one warmup
    request to trigger model loading) and collects latency/size samples.
    """
    results = PerformanceResults()
    total_queries = sum(len(queries) for queries in TEST_QUERIES.values()) * iterations
    current_query = 0
    print(f"\nšŸ Starting benchmark suite: {total_queries} total queries")
    print("=" * 60)
    # Warmup query to load the model
    print("\nšŸ”„ Warming up model (loading darkidol, may take 30-45s)...")
    warmup_result = test_single_query("Hi!", "warmup", timeout=60, warmup=True)
    if warmup_result["success"]:
        print(f" āœ… Model loaded in {warmup_result['latency_ms']:.0f}ms")
    else:
        print(f" āš ļø Warmup issue: {warmup_result.get('error', 'unknown')}")
        print(" Continuing anyway...")
    time.sleep(2)  # Brief pause after warmup
    for iteration in range(iterations):
        print(f"\nšŸ“Š Iteration {iteration + 1}/{iterations}")
        for category, queries in TEST_QUERIES.items():
            print(f"\n Category: {category}")
            for query in queries:
                current_query += 1
                if verbose:
                    print(f" [{current_query}/{total_queries}] Testing: '{query[:40]}...'")
                result = test_single_query(query, category, timeout=60)
                # warmup is always False for these calls; the guard is kept
                # so warmup-flagged results could never pollute the stats
                if result["success"] and not result.get("warmup", False):
                    results.add_result(
                        result["latency_ms"],
                        result["response_size"],
                        category
                    )
                    if verbose:
                        print(f" āœ… {result['latency_ms']:.0f}ms - {result['response_size']} bytes")
                        print(f" Response: {result['response'][:60]}...")
                elif not result.get("warmup", False):
                    results.add_error(result["error"])
                    if verbose:
                        print(f" āŒ Error: {result['error']}")
                # Small delay between queries to avoid overwhelming the system
                time.sleep(1)
    return results


def test_voice_chat_simulation(duration_seconds: int = 60) -> Optional[Dict]:
    """Simulate voice chat workload (rapid-fire queries)

    Cycles through short conversational queries for *duration_seconds* and
    returns the aggregate stats dict (or None if nothing succeeded).
    """
    print(f"\nšŸŽ¤ Simulating voice chat for {duration_seconds}s")
    print(" (Rapid-fire queries to test real-time performance)")
    print("=" * 60)
    voice_queries = [
        "Hello!",
        "How are you?",
        "Tell me a joke",
        "What's your favorite song?",
        "That's cool!",
        "Can you sing?",
        "I like you!",
        "What should we do?",
        "Tell me more",
        "Goodbye!"
    ]
    results = PerformanceResults()
    start_time = time.time()
    query_index = 0
    while (time.time() - start_time) < duration_seconds:
        query = voice_queries[query_index % len(voice_queries)]
        result = test_single_query(query, "voice_chat", timeout=30)  # Increased timeout
        if result["success"]:
            results.add_result(
                result["latency_ms"],
                result["response_size"],
                "voice_chat"
            )
            # Under ~2s round-trip is considered acceptable for voice chat
            status = "āœ…" if result["latency_ms"] < 2000 else "āš ļø"
            print(f" {status} Query {query_index + 1}: {result['latency_ms']:.0f}ms")
        else:
            results.add_error(result["error"])
            print(f" āŒ Query {query_index + 1}: Error - {result.get('error', 'unknown')}")
        query_index += 1
        time.sleep(2)  # Increased delay between queries
    elapsed = time.time() - start_time
    print(f"\n Completed {query_index} queries in {elapsed:.1f}s")
    return results.get_stats()


def print_report(results: PerformanceResults):
    """Print detailed performance report"""
    stats = results.get_stats()
    if not stats:
        print("\nāŒ No successful queries to report")
        return
    print("\n" + "=" * 60)
    print("šŸ“Š PERFORMANCE REPORT")
    print("=" * 60)
    # Overall Statistics
    print(f"\nšŸ“ˆ Overall Statistics:")
    print(f" Total Queries: {stats['total_queries']}")
    print(f" Total Errors: {stats['total_errors']}")
    print(f" Success Rate: {stats['success_rate']:.1f}%")
    # Latency Statistics
    lat = stats['latency']
    print(f"\nā±ļø Latency Statistics:")
    print(f" Mean: {lat['mean_ms']:.0f} ms")
    print(f" Median: {lat['median_ms']:.0f} ms")
    print(f" Min: {lat['min_ms']:.0f} ms")
    print(f" Max: {lat['max_ms']:.0f} ms")
    print(f" Std Dev: {lat['stdev_ms']:.0f} ms")
    print(f" 95th Percentile: {lat['p95_ms']:.0f} ms")
    print(f" 99th Percentile: {lat['p99_ms']:.0f} ms")
    # Voice Chat Assessment: bucket mean latency into viability tiers
    print(f"\nšŸŽ¤ Voice Chat Viability:")
    if lat['mean_ms'] < 1000:
        print(f" āœ… EXCELLENT - Mean latency under 1s")
    elif lat['mean_ms'] < 1500:
        print(f" āœ… GOOD - Mean latency acceptable for voice")
    elif lat['mean_ms'] < 2000:
        print(f" āš ļø BORDERLINE - Noticeable lag in voice chat")
    else:
        print(f" āŒ TOO SLOW - Not suitable for real-time voice")
    if lat['p95_ms'] > 2000:
        print(f" āš ļø WARNING: 5% of queries exceed 2s (P95: {lat['p95_ms']:.0f}ms)")
    # Category Breakdown
    print(f"\nšŸ“‹ Performance by Category:")
    for category, cat_stats in stats['by_category'].items():
        print(f"\n {category}:")
        print(f" Mean: {cat_stats['mean_ms']:.0f} ms")
        print(f" Median: {cat_stats['median_ms']:.0f} ms")
        print(f" Range: {cat_stats['min_ms']:.0f}-{cat_stats['max_ms']:.0f} ms")
    # Response Size Statistics
    size = stats['response_sizes']
    print(f"\nšŸ“¦ Response Sizes:")
    print(f" Mean: {size['mean_bytes']:.0f} bytes")
    print(f" Range: {size['min_bytes']}-{size['max_bytes']} bytes")
    print("\n" + "=" * 60)


def save_results(results: PerformanceResults, filename: Optional[str] = None):
    """Save results to JSON file

    Args:
        results: Collected benchmark samples.
        filename: Output path; defaults to a timestamped name in the CWD.
    """
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"benchmark_results_{timestamp}.json"
    stats = results.get_stats()
    with open(filename, 'w') as f:
        json.dump(stats, f, indent=2)
    # BUGFIX: message previously printed a literal placeholder instead of
    # interpolating the actual output path
    print(f"\nšŸ’¾ Results saved to: {filename}")


def main():
    """Entry point: verify the Cat is up, run the suite, report, and save."""
    print("=" * 60)
    print("🐱 Cheshire Cat Performance Benchmark")
    print("=" * 60)
    # Check if Cat is available before committing to a multi-minute run
    try:
        response = requests.get(f"{CAT_URL}/", timeout=5)
        if response.status_code != 200:
            print(f"\nāŒ Cat not responding (status {response.status_code})")
            print(" Make sure Cat is running: docker-compose -f docker-compose.test.yml up -d")
            sys.exit(1)
    except Exception as e:
        print(f"\nāŒ Cannot connect to Cat: {e}")
        print(" Make sure Cat is running: docker-compose -f docker-compose.test.yml up -d")
        sys.exit(1)
    print("\nāœ… Cat is available\n")
    # Run benchmark suite
    print("Starting comprehensive benchmark...")
    print("This will take several minutes...\n")
    results = run_benchmark_suite(iterations=2, verbose=True)
    # Print report
    print_report(results)
    # Voice chat simulation
    print("\n" + "=" * 60)
    voice_results = test_voice_chat_simulation(duration_seconds=30)
    if voice_results:
        print("\nšŸŽ¤ Voice Chat Simulation Results:")
        lat = voice_results['latency']
        print(f" Mean latency: {lat['mean_ms']:.0f} ms")
        print(f" Median latency: {lat['median_ms']:.0f} ms")
        print(f" 95th percentile: {lat['p95_ms']:.0f} ms")
        print(f" Success rate: {voice_results['success_rate']:.1f}%")
    # Save results
    save_results(results)
    print("\nāœ… Benchmark complete!")


if __name__ == "__main__":
    main()