add: cheshire-cat configuration, tooling, tests, and documentation

Configuration: - .env.example, .gitignore, compose.yml (main docker compose) - docker-compose-amd.yml (ROCm), docker-compose-macos.yml - start.sh, stop.sh convenience scripts - LICENSE (Apache 2.0, from upstream Cheshire Cat) Memory management utilities: - analyze_consolidation.py, manual_consolidation.py, verify_consolidation.py - check_memories.py, extract_declarative_facts.py, store_declarative_facts.py - compare_systems.py (system comparison tool) - benchmark_cat.py, streaming_benchmark.py, streaming_benchmark_v2.py Test suite: - quick_test.py, test_setup.py, test_setup_simple.py - test_consolidation_direct.py, test_declarative_recall.py, test_recall.py - test_end_to_end.py, test_full_pipeline.py - test_phase2.py, test_phase2_comprehensive.py Documentation: - README.md, QUICK_START.txt, TEST_README.md, SETUP_COMPLETE.md - PHASE2_IMPLEMENTATION_NOTES.md, PHASE2_TEST_RESULTS.md - POST_OPTIMIZATION_ANALYSIS.md
2026-03-04 00:51:14 +02:00
parent eafab336b4
commit ae1e0aa144
35 changed files with 6055 additions and 0 deletions
--- a/cheshire-cat/test_end_to_end.py
+++ b/cheshire-cat/test_end_to_end.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""
+END-TO-END Phase 2 Test
+
+Tests the complete pipeline:
+1. Send 18 diverse messages (important + trivial)
+2. Verify discord_bridge filters pure junk immediately
+3. Verify the rest are stored with consolidated=False
+4. Trigger consolidation
+5. Verify the LLM/heuristic rates memories and deletes low-importance ones
+6. Verify facts extracted to declarative memory
+7. Test recall of important information
+
+This is the TRUE test of whether Phase 2 works.
+"""
+
+import requests
+import json
+import time
+from qdrant_client import QdrantClient
+
+CAT_URL = "http://localhost:1865"  # base URL; every request is POSTed to f"{CAT_URL}/message"
+TEST_USER = "end_to_end_test_user"  # sent as "user_id" with the sample messages and recall queries
+
+def send_message(text: str):
+    """Send message to Cat"""
+    response = requests.post(
+        f"{CAT_URL}/message",
+        json={"text": text, "user_id": TEST_USER},
+        timeout=30
+    )
+    if response.status_code == 200:
+        return True
+    return False
+
+def check_memory_state():
+    """Check current memory state"""
+    client = QdrantClient(host='localhost', port=6333, timeout=10, prefer_grpc=False)
+    
+    # Get episodic memories
+    episodic, _ = client.scroll('episodic', limit=100, with_payload=True, with_vectors=False)
+    
+    # Get declarative memories  
+    declarative, _ = client.scroll('declarative', limit=100, with_payload=True, with_vectors=False)
+    
+    return episodic, declarative
+
+def main():
+    print("=" * 70)
+    print("END-TO-END PHASE 2 TEST")
+    print("=" * 70)
+    
+    # Phase 1: Send diverse messages
+    print("\n📤 PHASE 1: Sending 20 messages...")
+    print("-" * 70)
+    
+    messages = {
+        "PURE JUNK (should be filtered immediately)": [
+            "lol",
+            "k",  
+            "ok",
+        ],
+        "IMPORTANT FACTS (should be kept + extracted)": [
+            "My name is Jennifer Martinez",
+            "I'm 28 years old",
+            "I work as a nurse at Seattle General Hospital",
+            "My cat's name is Whiskers",
+            "I'm allergic to peanuts",
+        ],
+        "EMOTIONAL EVENTS (should be kept)": [
+            "My father passed away last month from cancer",
+            "I just got accepted into grad school!",
+            "I'm struggling with anxiety lately",
+        ],
+        "MUNDANE CHITCHAT (should be deleted in consolidation)": [
+            "What's up?",
+            "How are you?",
+            "That's interesting",
+            "Nice weather today",
+        ],
+        "PREFERENCES (should be kept + extracted)": [
+            "I love jazz music",
+            "My favorite color is purple",
+            "I hate horror movies",
+        ],
+    }
+    
+    all_messages = []
+    for category, msgs in messages.items():
+        print(f"\n{category}:")
+        for msg in msgs:
+            print(f"  → {msg}")
+            send_message(msg)
+            all_messages.append((category, msg))
+            time.sleep(0.3)
+    
+    print(f"\n✅ Sent {len(all_messages)} messages")
+    
+    # Phase 2: Check immediate filtering
+    print("\n" + "=" * 70)
+    print("📊 PHASE 2: Checking immediate filtering (discord_bridge)")
+    print("-" * 70)
+    
+    time.sleep(2)  # Let storage complete
+    episodic, declarative = check_memory_state()
+    
+    print(f"\nEpisodic memories stored: {len(episodic)}")
+    print(f"Declarative memories: {len(declarative)}")
+    
+    # Check what was stored
+    stored_content = [e.payload.get('page_content', '') for e in episodic]
+    
+    pure_junk = ["lol", "k", "ok"]
+    junk_filtered = [j for j in pure_junk if j not in stored_content]
+    junk_stored = [j for j in pure_junk if j in stored_content]
+    
+    print(f"\n✅ Pure junk filtered: {len(junk_filtered)}/3")
+    if junk_filtered:
+        for msg in junk_filtered:
+            print(f"  - '{msg}'")
+    
+    if junk_stored:
+        print(f"\n⚠️  Pure junk NOT filtered: {len(junk_stored)}/3")
+        for msg in junk_stored:
+            print(f"  - '{msg}'")
+    
+    # Check consolidated flag
+    unconsolidated = [e for e in episodic if not e.payload.get('metadata', {}).get('consolidated', True)]
+    print(f"\n📋 Memories marked consolidated=False: {len(unconsolidated)}")
+    
+    # Phase 3: Trigger consolidation
+    print("\n" + "=" * 70)
+    print("🌙 PHASE 3: Triggering consolidation")
+    print("-" * 70)
+    
+    response = requests.post(
+        f"{CAT_URL}/message",
+        json={"text": "consolidate now", "user_id": "admin"},
+        timeout=60
+    )
+    
+    if response.status_code == 200:
+        result = response.json()
+        print(f"✅ Consolidation triggered")
+        print(f"Response: {result.get('content', '')[:200]}")
+    else:
+        print(f"❌ Consolidation failed: {response.status_code}")
+        return
+    
+    time.sleep(3)  # Let consolidation complete
+    
+    # Phase 4: Check post-consolidation state
+    print("\n" + "=" * 70)
+    print("📊 PHASE 4: Analyzing post-consolidation state")
+    print("-" * 70)
+    
+    episodic_after, declarative_after = check_memory_state()
+    
+    print(f"\nEpisodic memories: {len(episodic)} → {len(episodic_after)}")
+    print(f"Deleted: {len(episodic) - len(episodic_after)}")
+    print(f"\nDeclarative memories: {len(declarative)} → {len(declarative_after)}")
+    print(f"Facts extracted: {len(declarative_after) - len(declarative)}")
+    
+    # Check what was deleted
+    stored_after = [e.payload.get('page_content', '') for e in episodic_after]
+    deleted = [msg for msg in stored_content if msg not in stored_after]
+    
+    if deleted:
+        print(f"\n🗑️  Deleted ({len(deleted)}):")
+        for msg in deleted[:10]:
+            print(f"  - '{msg}'")
+    
+    # Check what important stuff remains
+    important_keywords = ["Jennifer", "28", "nurse", "Whiskers", "peanuts", 
+                         "father", "grad school", "anxiety", "jazz", "purple"]
+    important_kept = [msg for msg in stored_after if any(kw in msg for kw in important_keywords)]
+    
+    print(f"\n✅ Important messages kept ({len(important_kept)}):")
+    for msg in important_kept[:8]:
+        print(f"  - '{msg}'")
+    
+    # Phase 5: Test recall
+    print("\n" + "=" * 70)
+    print("🧠 PHASE 5: Testing recall")
+    print("-" * 70)
+    
+    test_queries = [
+        "What is my name?",
+        "Where do I work?",
+        "What's my cat's name?",
+        "What am I allergic to?",
+    ]
+    
+    for query in test_queries:
+        response = requests.post(
+            f"{CAT_URL}/message",
+            json={"text": query, "user_id": TEST_USER},
+            timeout=30
+        )
+        
+        if response.status_code == 200:
+            result = response.json()
+            answer = result.get('content', '')
+            memories = result.get('why', {}).get('memory', {})
+            episodic_recalled = len(memories.get('episodic', []))
+            declarative_recalled = len(memories.get('declarative', []))
+            
+            print(f"\nQ: {query}")
+            print(f"A: {answer[:150]}")
+            print(f"   [Recalled: {episodic_recalled} episodic, {declarative_recalled} declarative]")
+    
+    # Final summary
+    print("\n" + "=" * 70)
+    print("📋 FINAL SUMMARY")
+    print("=" * 70)
+    
+    print(f"\n1. Immediate filtering:")
+    print(f"   ✅ Filtered: {len(junk_filtered)}/3 pure junk")
+    print(f"   📝 Stored: {len(episodic)} messages")
+    
+    print(f"\n2. Consolidation:")
+    print(f"   🗑️  Deleted: {len(deleted)} low-importance")
+    print(f"   ✅ Kept: {len(episodic_after)} important")
+    print(f"   📚 Facts extracted: {len(declarative_after) - len(declarative)}")
+    
+    print(f"\n3. Recall:")
+    print(f"   Test queries: {len(test_queries)}")
+    print(f"   (Check above for recall accuracy)")
+    
+    print("\n" + "=" * 70)
+
+if __name__ == "__main__":  # run the report only when executed as a script, not on import
+    main()