miku-discord/cheshire-cat/benchmark_cat.py

#!/usr/bin/env python3
"""
Comprehensive Cheshire Cat Performance Benchmark
Tests latency, overhead, and performance under various conditions
"""

import requests
import time
import json
import statistics
from datetime import datetime
from typing import List, Dict
import sys

CAT_URL = "http://localhost:1865"

# Test queries of varying complexity
TEST_QUERIES = {
    "simple_greeting": [
        "Hello!",
        "Hi Miku!",
        "Hey there!",
        "Good morning!",
        "What's up?"
    ],
    "factual_short": [
        "What is your favorite food?",
        "How old are you?",
        "What color is your hair?",
        "Where are you from?",
        "What's your name?"
    ],
    "factual_medium": [
        "Tell me about your friends Rin and Len",
        "What is the song World is Mine about?",
        "Who created you?",
        "What kind of music do you sing?",
        "What do you like to do for fun?"
    ],
    "complex_memory": [
        "What did we talk about earlier?",
        "Can you remember what I asked you before?",
        "Tell me everything you know about green onions",
        "What are all your most iconic songs?",
        "Describe your personality and how you act"
    ],
    "conversation_flow": [
        "I love your music!",
        "What's your favorite song to perform?",
        "Do you ever get nervous on stage?",
        "That's interesting! Tell me more.",
        "Thanks for chatting with me!"
    ]
}

class PerformanceResults:
    def __init__(self):
        self.query_times: List[float] = []
        self.response_sizes: List[int] = []
        self.errors: List[str] = []
        self.category_stats: Dict[str, List[float]] = {}

    def add_result(self, latency_ms: float, response_size: int, category: str):
        self.query_times.append(latency_ms)
        self.response_sizes.append(response_size)
        if category not in self.category_stats:
            self.category_stats[category] = []
        self.category_stats[category].append(latency_ms)

    def add_error(self, error: str):
        self.errors.append(error)

    def get_stats(self):
        if not self.query_times:
            return None

        return {
            "total_queries": len(self.query_times),
            "total_errors": len(self.errors),
            "success_rate": (len(self.query_times) / (len(self.query_times) + len(self.errors))) * 100,
            "latency": {
                "min_ms": min(self.query_times),
                "max_ms": max(self.query_times),
                "mean_ms": statistics.mean(self.query_times),
                "median_ms": statistics.median(self.query_times),
                "stdev_ms": statistics.stdev(self.query_times) if len(self.query_times) > 1 else 0,
                "p95_ms": self._percentile(self.query_times, 95),
                "p99_ms": self._percentile(self.query_times, 99)
            },
            "response_sizes": {
                "min_bytes": min(self.response_sizes),
                "max_bytes": max(self.response_sizes),
                "mean_bytes": statistics.mean(self.response_sizes),
            },
            "by_category": {
                category: {
                    "mean_ms": statistics.mean(times),
                    "median_ms": statistics.median(times),
                    "min_ms": min(times),
                    "max_ms": max(times)
                }
                for category, times in self.category_stats.items()
            }
        }

    @staticmethod
    def _percentile(data, percentile):
        size = len(data)
        sorted_data = sorted(data)
        index = (percentile / 100) * size
        if index.is_integer():
            return sorted_data[int(index) - 1]
        else:
            return sorted_data[int(index)]

def test_single_query(query: str, category: str, timeout: int = 60, warmup: bool = False) -> Dict:
    """Test a single query and measure performance

    Args:
        query: The query text to send
        category: Category for grouping results
        timeout: Request timeout in seconds (60s for model loading)
        warmup: If True, don't count in results (for model loading)
    """
    start_time = time.time()

    try:
        response = requests.post(
            f"{CAT_URL}/message",
            json={"text": query},
            headers={"Content-Type": "application/json"},
            timeout=timeout
        )

        latency_ms = (time.time() - start_time) * 1000

        if response.status_code == 200:
            data = response.json()
            content = data.get("content", "")

            # Filter out tool calls that might still appear
            if content and not (content.startswith('{"name":') or content.startswith('{')):
                return {
                    "success": True,
                    "latency_ms": latency_ms,
                    "response_size": len(content),
                    "response": content,
                    "category": category,
                    "warmup": warmup
                }
            else:
                return {
                    "success": False,
                    "latency_ms": latency_ms,
                    "error": "Got tool call instead of text response",
                    "category": category,
                    "warmup": warmup
                }
        else:
            return {
                "success": False,
                "latency_ms": latency_ms,
                "error": f"HTTP {response.status_code}",
                "category": category,
                "warmup": warmup
            }
    except Exception as e:
        latency_ms = (time.time() - start_time) * 1000
        return {
            "success": False,
            "latency_ms": latency_ms,
            "error": str(e),
            "category": category,
            "warmup": warmup
        }

def run_benchmark_suite(iterations: int = 3, verbose: bool = True) -> PerformanceResults:
    """Run complete benchmark suite"""
    results = PerformanceResults()
    total_queries = sum(len(queries) for queries in TEST_QUERIES.values()) * iterations
    current_query = 0

    print(f"\n🏁 Starting benchmark suite: {total_queries} total queries")
    print("=" * 60)

    # Warmup query to load the model
    print("\n🔥 Warming up model (loading darkidol, may take 30-45s)...")
    warmup_result = test_single_query("Hi!", "warmup", timeout=60, warmup=True)
    if warmup_result["success"]:
        print(f"   ✅ Model loaded in {warmup_result['latency_ms']:.0f}ms")
    else:
        print(f"   ⚠️  Warmup issue: {warmup_result.get('error', 'unknown')}")
        print("   Continuing anyway...")

    time.sleep(2)  # Brief pause after warmup

    for iteration in range(iterations):
        print(f"\n📊 Iteration {iteration + 1}/{iterations}")

        for category, queries in TEST_QUERIES.items():
            print(f"\n  Category: {category}")

            for query in queries:
                current_query += 1
                if verbose:
                    print(f"    [{current_query}/{total_queries}] Testing: '{query[:40]}...'")

                result = test_single_query(query, category, timeout=60)

                if result["success"] and not result.get("warmup", False):
                    results.add_result(
                        result["latency_ms"],
                        result["response_size"],
                        category
                    )
                    if verbose:
                        print(f"       ✅ {result['latency_ms']:.0f}ms - {result['response_size']} bytes")
                        print(f"          Response: {result['response'][:60]}...")
                elif not result.get("warmup", False):
                    results.add_error(result["error"])
                    if verbose:
                        print(f"       ❌ Error: {result['error']}")

                # Small delay between queries to avoid overwhelming the system
                time.sleep(1)

    return results

def test_voice_chat_simulation(duration_seconds: int = 60) -> Dict:
    """Simulate voice chat workload (rapid-fire queries)"""
    print(f"\n🎤 Simulating voice chat for {duration_seconds}s")
    print("   (Rapid-fire queries to test real-time performance)")
    print("=" * 60)

    voice_queries = [
        "Hello!",
        "How are you?",
        "Tell me a joke",
        "What's your favorite song?",
        "That's cool!",
        "Can you sing?",
        "I like you!",
        "What should we do?",
        "Tell me more",
        "Goodbye!"
    ]

    results = PerformanceResults()
    start_time = time.time()
    query_index = 0

    while (time.time() - start_time) < duration_seconds:
        query = voice_queries[query_index % len(voice_queries)]
        result = test_single_query(query, "voice_chat", timeout=30)  # Increased timeout

        if result["success"]:
            results.add_result(
                result["latency_ms"],
                result["response_size"],
                "voice_chat"
            )
            status = "✅" if result["latency_ms"] < 2000 else "⚠️"
            print(f"  {status} Query {query_index + 1}: {result['latency_ms']:.0f}ms")
        else:
            results.add_error(result["error"])
            print(f"  ❌ Query {query_index + 1}: Error - {result.get('error', 'unknown')}")

        query_index += 1
        time.sleep(2)  # Increased delay between queries

    elapsed = time.time() - start_time
    print(f"\n  Completed {query_index} queries in {elapsed:.1f}s")

    return results.get_stats()

def print_report(results: PerformanceResults):
    """Print detailed performance report"""
    stats = results.get_stats()

    if not stats:
        print("\n❌ No successful queries to report")
        return

    print("\n" + "=" * 60)
    print("📊 PERFORMANCE REPORT")
    print("=" * 60)

    # Overall Statistics
    print(f"\n📈 Overall Statistics:")
    print(f"   Total Queries:    {stats['total_queries']}")
    print(f"   Total Errors:     {stats['total_errors']}")
    print(f"   Success Rate:     {stats['success_rate']:.1f}%")

    # Latency Statistics
    lat = stats['latency']
    print(f"\n⏱️  Latency Statistics:")
    print(f"   Mean:             {lat['mean_ms']:.0f} ms")
    print(f"   Median:           {lat['median_ms']:.0f} ms")
    print(f"   Min:              {lat['min_ms']:.0f} ms")
    print(f"   Max:              {lat['max_ms']:.0f} ms")
    print(f"   Std Dev:          {lat['stdev_ms']:.0f} ms")
    print(f"   95th Percentile:  {lat['p95_ms']:.0f} ms")
    print(f"   99th Percentile:  {lat['p99_ms']:.0f} ms")

    # Voice Chat Assessment
    print(f"\n🎤 Voice Chat Viability:")
    if lat['mean_ms'] < 1000:
        print(f"   ✅ EXCELLENT - Mean latency under 1s")
    elif lat['mean_ms'] < 1500:
        print(f"   ✅ GOOD - Mean latency acceptable for voice")
    elif lat['mean_ms'] < 2000:
        print(f"   ⚠️  BORDERLINE - Noticeable lag in voice chat")
    else:
        print(f"   ❌ TOO SLOW - Not suitable for real-time voice")

    if lat['p95_ms'] > 2000:
        print(f"   ⚠️  WARNING: 5% of queries exceed 2s (P95: {lat['p95_ms']:.0f}ms)")

    # Category Breakdown
    print(f"\n📋 Performance by Category:")
    for category, cat_stats in stats['by_category'].items():
        print(f"\n   {category}:")
        print(f"     Mean:    {cat_stats['mean_ms']:.0f} ms")
        print(f"     Median:  {cat_stats['median_ms']:.0f} ms")
        print(f"     Range:   {cat_stats['min_ms']:.0f}-{cat_stats['max_ms']:.0f} ms")

    # Response Size Statistics
    size = stats['response_sizes']
    print(f"\n📦 Response Sizes:")
    print(f"   Mean:             {size['mean_bytes']:.0f} bytes")
    print(f"   Range:            {size['min_bytes']}-{size['max_bytes']} bytes")

    print("\n" + "=" * 60)

def save_results(results: PerformanceResults, filename: str = None):
    """Save results to JSON file"""
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"benchmark_results_{timestamp}.json"

    stats = results.get_stats()
    with open(filename, 'w') as f:
        json.dump(stats, f, indent=2)

    print(f"\n💾 Results saved to: {filename}")

def main():
    print("=" * 60)
    print("🐱 Cheshire Cat Performance Benchmark")
    print("=" * 60)

    # Check if Cat is available
    try:
        response = requests.get(f"{CAT_URL}/", timeout=5)
        if response.status_code != 200:
            print(f"\n❌ Cat not responding (status {response.status_code})")
            print("   Make sure Cat is running: docker-compose -f docker-compose.test.yml up -d")
            sys.exit(1)
    except Exception as e:
        print(f"\n❌ Cannot connect to Cat: {e}")
        print("   Make sure Cat is running: docker-compose -f docker-compose.test.yml up -d")
        sys.exit(1)

    print("\n✅ Cat is available\n")

    # Run benchmark suite
    print("Starting comprehensive benchmark...")
    print("This will take several minutes...\n")

    results = run_benchmark_suite(iterations=2, verbose=True)

    # Print report
    print_report(results)

    # Voice chat simulation
    print("\n" + "=" * 60)
    voice_results = test_voice_chat_simulation(duration_seconds=30)

    if voice_results:
        print("\n🎤 Voice Chat Simulation Results:")
        lat = voice_results['latency']
        print(f"   Mean latency:     {lat['mean_ms']:.0f} ms")
        print(f"   Median latency:   {lat['median_ms']:.0f} ms")
        print(f"   95th percentile:  {lat['p95_ms']:.0f} ms")
        print(f"   Success rate:     {voice_results['success_rate']:.1f}%")

    # Save results
    save_results(results)

    print("\n✅ Benchmark complete!")

if __name__ == "__main__":
    main()