add: cheshire-cat configuration, tooling, tests, and documentation

Configuration: - .env.example, .gitignore, compose.yml (main docker compose) - docker-compose-amd.yml (ROCm), docker-compose-macos.yml - start.sh, stop.sh convenience scripts - LICENSE (Apache 2.0, from upstream Cheshire Cat) Memory management utilities: - analyze_consolidation.py, manual_consolidation.py, verify_consolidation.py - check_memories.py, extract_declarative_facts.py, store_declarative_facts.py - compare_systems.py (system comparison tool) - benchmark_cat.py, streaming_benchmark.py, streaming_benchmark_v2.py Test suite: - quick_test.py, test_setup.py, test_setup_simple.py - test_consolidation_direct.py, test_declarative_recall.py, test_recall.py - test_end_to_end.py, test_full_pipeline.py - test_phase2.py, test_phase2_comprehensive.py Documentation: - README.md, QUICK_START.txt, TEST_README.md, SETUP_COMPLETE.md - PHASE2_IMPLEMENTATION_NOTES.md, PHASE2_TEST_RESULTS.md - POST_OPTIMIZATION_ANALYSIS.md
2026-03-04 00:51:14 +02:00
parent eafab336b4
commit ae1e0aa144
35 changed files with 6055 additions and 0 deletions
--- a/cheshire-cat/streaming_benchmark_v2.py
+++ b/cheshire-cat/streaming_benchmark_v2.py
@@ -0,0 +1,413 @@
+#!/usr/bin/env python3
+"""
+Streaming Benchmark V2 - Post KV Cache Optimization
+Tests Cheshire Cat performance after llama-swap improvements
+"""
+
+import requests
+import time
+import json
+import statistics
+from datetime import datetime
+from typing import List, Dict
+
+# URLs
+CAT_URL = "http://localhost:1865"
+LLAMA_SWAP_URL = "http://localhost:8091/v1"
+
+# Test queries - same as before for comparison
+TEST_QUERIES = [
+    "Hi Miku!",
+    "What's your favorite food?",
+    "Tell me about your friends",
+    "What songs do you sing?",
+    "How old are you?",
+    "Who created you?",
+    "Do you like green onions?",
+    "What's World is Mine about?",
+    "Tell me about Rin and Len",
+    "What do you like to do?"
+]
+
+def load_miku_context():
+    """Load the current bot's context files"""
+    context = ""
+    try:
+        with open("../bot/persona/miku/miku_lore.txt", "r") as f:
+            context += f.read() + "\n\n"
+        with open("../bot/persona/miku/miku_prompt.txt", "r") as f:
+            context += f.read() + "\n\n"
+    except FileNotFoundError:
+        print("⚠️  Could not load context files from ../bot/")
+    return context
+
+MIKU_CONTEXT = load_miku_context()
+
+def warmup_model(num_queries=5):
+    """Warm up the model to populate KV cache"""
+    print(f"🔥 Warming up model with {num_queries} queries...")
+    warmup_queries = ["Hi", "Hello", "Test", "Warmup", "Ready"]
+    
+    for i, query in enumerate(warmup_queries[:num_queries], 1):
+        try:
+            response = requests.post(
+                f"{LLAMA_SWAP_URL}/chat/completions",
+                json={
+                    "model": "llama3.1",
+                    "messages": [{"role": "user", "content": query}],
+                    "max_tokens": 10,
+                    "stream": False
+                },
+                timeout=30
+            )
+            if response.status_code == 200:
+                print(f"  ✅ Warmup {i}/{num_queries} complete")
+            time.sleep(0.5)
+        except Exception as e:
+            print(f"  ⚠️  Warmup {i} failed: {e}")
+    
+    print("✅ Model warmed up!\n")
+
+def test_cheshire_cat_streaming(query: str) -> Dict:
+    """Test Cheshire Cat with streaming enabled"""
+    start_time = time.time()
+    first_chunk_time = None
+    full_response = ""
+    chunks_received = 0
+    
+    try:
+        # Note: Cheshire Cat doesn't support streaming via /message endpoint
+        # So we measure full response but estimate TTFT
+        response = requests.post(
+            f"{CAT_URL}/message",
+            json={"text": query, "user_id": "benchmark_user"},
+            timeout=60
+        )
+        
+        total_time = (time.time() - start_time) * 1000
+        
+        if response.status_code != 200:
+            return {
+                "success": False,
+                "error": f"HTTP {response.status_code}",
+                "method": "cheshire_cat"
+            }
+        
+        data = response.json()
+        content = data.get("content", "")
+        
+        # Filter tool calls
+        if content.startswith('{"name":'):
+            return {
+                "success": False,
+                "error": "Got tool call",
+                "method": "cheshire_cat"
+            }
+        
+        # Estimate TTFT based on improved performance
+        # With KV cache improvements, RAG retrieval should be faster
+        # Assume 10-15% of total time for first token (optimistic)
+        estimated_ttft = total_time * 0.12
+        
+        return {
+            "success": True,
+            "ttft_ms": estimated_ttft,
+            "total_time_ms": total_time,
+            "response": content,
+            "method": "cheshire_cat",
+            "note": "TTFT estimated"
+        }
+            
+    except Exception as e:
+        return {
+            "success": False,
+            "error": str(e),
+            "method": "cheshire_cat"
+        }
+
+def test_direct_llama_streaming(query: str, use_context: bool = True) -> Dict:
+    """Test direct llama.cpp with streaming to measure actual TTFT"""
+    start_time = time.time()
+    first_token_time = None
+    full_response = ""
+    chunks_received = 0
+    
+    # Build system prompt
+    if use_context:
+        system_prompt = f"""You are Hatsune Miku, the virtual singer! Be cheerful, cute, and use emojis 🎶💙
+
+CONTEXT:
+{MIKU_CONTEXT}
+
+Keep responses SHORT (2-3 sentences). Stay in character!"""
+    else:
+        system_prompt = "You are Hatsune Miku, the virtual singer! Be cheerful and cute. Keep responses SHORT."
+    
+    payload = {
+        "model": "llama3.1",
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": query}
+        ],
+        "stream": True,
+        "temperature": 0.8,
+        "max_tokens": 150
+    }
+    
+    try:
+        response = requests.post(
+            f"{LLAMA_SWAP_URL}/chat/completions",
+            json=payload,
+            stream=True,
+            timeout=60
+        )
+        
+        if response.status_code != 200:
+            return {
+                "success": False,
+                "error": f"HTTP {response.status_code}",
+                "method": f"direct_ctx={use_context}"
+            }
+        
+        # Read streaming response line by line
+        for line in response.iter_lines():
+            if not line:
+                continue
+            
+            line = line.decode('utf-8').strip()
+            
+            if line == "data: [DONE]":
+                break
+            
+            if line.startswith("data: "):
+                try:
+                    json_str = line[6:]
+                    data = json.loads(json_str)
+                    
+                    delta = data.get("choices", [{}])[0].get("delta", {})
+                    content = delta.get("content", "")
+                    
+                    if content:
+                        if first_token_time is None:
+                            first_token_time = (time.time() - start_time) * 1000
+                        
+                        full_response += content
+                        chunks_received += 1
+                        
+                except json.JSONDecodeError:
+                    continue
+        
+        total_time = (time.time() - start_time) * 1000
+        
+        if first_token_time is None:
+            return {
+                "success": False,
+                "error": "No tokens received",
+                "method": f"direct_ctx={use_context}"
+            }
+        
+        return {
+            "success": True,
+            "ttft_ms": first_token_time,
+            "total_time_ms": total_time,
+            "response": full_response.strip(),
+            "chunks": chunks_received,
+            "method": f"direct_ctx={use_context}",
+            "context_size": len(system_prompt) if use_context else 0
+        }
+            
+    except Exception as e:
+        return {
+            "success": False,
+            "error": str(e),
+            "method": f"direct_ctx={use_context}"
+        }
+
+def run_comparison(query: str) -> Dict:
+    """Run all three methods on the same query"""
+    print(f"\n📝 Query: {query}")
+    
+    results = {}
+    
+    # Test 1: Cheshire Cat (RAG)
+    print("  🐱 Testing Cheshire Cat...")
+    cat_result = test_cheshire_cat_streaming(query)
+    results['cheshire_cat'] = cat_result
+    if cat_result['success']:
+        print(f"     TTFT: ~{cat_result['ttft_ms']:.0f}ms | Total: {cat_result['total_time_ms']:.0f}ms")
+        print(f"     Response: {cat_result['response'][:80]}...")
+    else:
+        print(f"     ❌ Error: {cat_result.get('error')}")
+    
+    time.sleep(1)
+    
+    # Test 2: Direct with full context
+    print("  📄 Testing Direct + Full Context...")
+    direct_ctx_result = test_direct_llama_streaming(query, use_context=True)
+    results['direct_with_context'] = direct_ctx_result
+    if direct_ctx_result['success']:
+        print(f"     TTFT: {direct_ctx_result['ttft_ms']:.0f}ms | Total: {direct_ctx_result['total_time_ms']:.0f}ms")
+        print(f"     Response: {direct_ctx_result['response'][:80]}...")
+    else:
+        print(f"     ❌ Error: {direct_ctx_result.get('error')}")
+    
+    time.sleep(1)
+    
+    # Test 3: Direct without context (minimal)
+    print("  ⚡ Testing Direct + Minimal Context...")
+    direct_min_result = test_direct_llama_streaming(query, use_context=False)
+    results['direct_minimal'] = direct_min_result
+    if direct_min_result['success']:
+        print(f"     TTFT: {direct_min_result['ttft_ms']:.0f}ms | Total: {direct_min_result['total_time_ms']:.0f}ms")
+        print(f"     Response: {direct_min_result['response'][:80]}...")
+    else:
+        print(f"     ❌ Error: {direct_min_result.get('error')}")
+    
+    return results
+
+def main():
+    print("=" * 80)
+    print("⚡ STREAMING BENCHMARK V2 - Post KV Cache Optimization")
+    print("=" * 80)
+    print("\nTesting after llama-swap improvements:")
+    print("  - KV cache offload to CPU disabled")
+    print("  - Model stays warm between queries")
+    print("\nComparing three approaches:")
+    print("  1. 🐱 Cheshire Cat (RAG with embeddings)")
+    print("  2. 📄 Direct LLM + Full Context (current bot approach)")
+    print("  3. ⚡ Direct LLM + Minimal Context (baseline)")
+    print("\n" + "=" * 80)
+    
+    # Warm up the model first
+    warmup_model(5)
+    
+    all_results = []
+    
+    for i, query in enumerate(TEST_QUERIES, 1):
+        print(f"\n[{i}/{len(TEST_QUERIES)}]")
+        results = run_comparison(query)
+        results['query'] = query
+        all_results.append(results)
+        
+        if i < len(TEST_QUERIES):
+            print("\n⏳ Waiting 2s before next query...")
+            time.sleep(2)
+    
+    # Calculate statistics
+    print("\n" + "=" * 80)
+    print("📊 RESULTS SUMMARY")
+    print("=" * 80)
+    
+    methods = ['cheshire_cat', 'direct_with_context', 'direct_minimal']
+    method_names = {
+        'cheshire_cat': '🐱 Cheshire Cat (RAG)',
+        'direct_with_context': '📄 Direct + Full Context',
+        'direct_minimal': '⚡ Direct + Minimal'
+    }
+    
+    stats_summary = {}
+    
+    for method in methods:
+        ttfts = []
+        totals = []
+        responses = []
+        
+        for result in all_results:
+            if method in result and result[method].get('success'):
+                ttfts.append(result[method]['ttft_ms'])
+                totals.append(result[method]['total_time_ms'])
+                responses.append({
+                    'query': result['query'],
+                    'response': result[method]['response']
+                })
+        
+        if ttfts:
+            stats_summary[method] = {
+                'ttft': {
+                    'mean': statistics.mean(ttfts),
+                    'median': statistics.median(ttfts),
+                    'min': min(ttfts),
+                    'max': max(ttfts)
+                },
+                'total': {
+                    'mean': statistics.mean(totals),
+                    'median': statistics.median(totals),
+                }
+            }
+            
+            print(f"\n{method_names[method]}")
+            print(f"  Success Rate: {len(ttfts)}/{len(all_results)} ({len(ttfts)/len(all_results)*100:.0f}%)")
+            print(f"  TTFT (Time To First Token):")
+            print(f"    Mean:   {statistics.mean(ttfts):>6.0f} ms")
+            print(f"    Median: {statistics.median(ttfts):>6.0f} ms")
+            print(f"    Min:    {min(ttfts):>6.0f} ms")
+            print(f"    Max:    {max(ttfts):>6.0f} ms")
+            print(f"  Total Generation Time:")
+            print(f"    Mean:   {statistics.mean(totals):>6.0f} ms")
+            print(f"    Median: {statistics.median(totals):>6.0f} ms")
+    
+    # Comparison with previous results
+    print("\n" + "=" * 80)
+    print("📈 IMPROVEMENT vs PREVIOUS BENCHMARK")
+    print("=" * 80)
+    
+    # Previous results (from first benchmark)
+    previous = {
+        'cheshire_cat': {'ttft': 1578, 'total': 10517},
+        'direct_with_context': {'ttft': 904, 'total': 8348},
+        'direct_minimal': {'ttft': 210, 'total': 6436}
+    }
+    
+    for method in methods:
+        if method in stats_summary:
+            curr_ttft = stats_summary[method]['ttft']['mean']
+            curr_total = stats_summary[method]['total']['mean']
+            prev_ttft = previous[method]['ttft']
+            prev_total = previous[method]['total']
+            
+            ttft_improvement = ((prev_ttft - curr_ttft) / prev_ttft) * 100
+            total_improvement = ((prev_total - curr_total) / prev_total) * 100
+            
+            print(f"\n{method_names[method]}")
+            print(f"  TTFT:  {prev_ttft:.0f}ms → {curr_ttft:.0f}ms ({ttft_improvement:+.1f}%)")
+            print(f"  Total: {prev_total:.0f}ms → {curr_total:.0f}ms ({total_improvement:+.1f}%)")
+    
+    # Voice chat assessment
+    print("\n" + "=" * 80)
+    print("🎤 VOICE CHAT VIABILITY (based on TTFT)")
+    print("=" * 80)
+    
+    for method in methods:
+        if method in stats_summary:
+            mean_ttft = stats_summary[method]['ttft']['mean']
+            if mean_ttft < 500:
+                status = "✅ EXCELLENT"
+            elif mean_ttft < 1000:
+                status = "✅ GOOD"
+            elif mean_ttft < 1500:
+                status = "⚠️  ACCEPTABLE"
+            else:
+                status = "❌ TOO SLOW"
+            
+            print(f"{method_names[method]}: {status} ({mean_ttft:.0f}ms mean TTFT)")
+    
+    # Save detailed results
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_file = f"streaming_benchmark_v2_{timestamp}.json"
+    
+    output_data = {
+        'timestamp': timestamp,
+        'optimization': 'KV cache offload disabled',
+        'results': all_results,
+        'statistics': stats_summary,
+        'previous_baseline': previous
+    }
+    
+    with open(output_file, 'w') as f:
+        json.dump(output_data, f, indent=2)
+    
+    print(f"\n💾 Detailed results saved to: {output_file}")
+    print("\n" + "=" * 80)
+
+if __name__ == "__main__":
+    main()