add: cheshire-cat configuration, tooling, tests, and documentation

Configuration: - .env.example, .gitignore, compose.yml (main docker compose) - docker-compose-amd.yml (ROCm), docker-compose-macos.yml - start.sh, stop.sh convenience scripts - LICENSE (Apache 2.0, from upstream Cheshire Cat) Memory management utilities: - analyze_consolidation.py, manual_consolidation.py, verify_consolidation.py - check_memories.py, extract_declarative_facts.py, store_declarative_facts.py - compare_systems.py (system comparison tool) - benchmark_cat.py, streaming_benchmark.py, streaming_benchmark_v2.py Test suite: - quick_test.py, test_setup.py, test_setup_simple.py - test_consolidation_direct.py, test_declarative_recall.py, test_recall.py - test_end_to_end.py, test_full_pipeline.py - test_phase2.py, test_phase2_comprehensive.py Documentation: - README.md, QUICK_START.txt, TEST_README.md, SETUP_COMPLETE.md - PHASE2_IMPLEMENTATION_NOTES.md, PHASE2_TEST_RESULTS.md - POST_OPTIMIZATION_ANALYSIS.md
2026-03-04 00:51:14 +02:00
parent eafab336b4
commit ae1e0aa144
35 changed files with 6055 additions and 0 deletions
--- a/cheshire-cat/extract_declarative_facts.py
+++ b/cheshire-cat/extract_declarative_facts.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""
+Declarative Memory Extraction
+
+After consolidation keeps important episodic memories, this script:
+1. Analyzes kept memories
+2. Extracts structured facts (name, age, location, preferences, etc.)
+3. Stores facts in declarative memory collection
+4. Enables better retrieval for direct questions
+
+This is the KEY to making Phase 2 actually useful.
+"""
+
+import re
+from qdrant_client import QdrantClient
+from qdrant_client.models import PointStruct
+import uuid
+from datetime import datetime
+
+
+# Address of the Qdrant instance this script reads from; main() passes both
+# values to QdrantClient.
+QDRANT_HOST = "localhost"
+QDRANT_PORT = 6333
+
+# Fact extraction patterns
+EXTRACTION_PATTERNS = {
+    'name': [
+        r"(?:my name is|i'm|i am|call me)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)",
+        r"(?:this is|i'm)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s*(?:speaking|here)?",
+    ],
+    'age': [
+        r"i'?m\s+(\d{1,3})\s+years?\s+old",
+        r"i'?m\s+(\d{1,3})",
+    ],
+    'location': [
+        r"i live in\s+([A-Z][a-zA-Z\s,]+?)(?:\.|$|,)",
+        r"i'?m (?:from|in)\s+([A-Z][a-zA-Z\s,]+?)(?:\.|$|,)",
+    ],
+    'job': [
+        r"i work (?:as|at)\s+(?:a|an)?\s*([a-zA-Z\s]+?)(?:at|in|for|\.|$)",
+        r"i'?m a\s+([a-zA-Z\s]+?)(?:at|in|for|\.|$)",
+    ],
+    'workplace': [
+        r"(?:i work|employed) (?:at|for|in)\s+([A-Z][a-zA-Z\s&]+?)(?:\.|$|,)",
+    ],
+    'pet_name': [
+        r"my (?:cat|dog|pet)'?s name is\s+([A-Z][a-z]+)",
+    ],
+    'allergy': [
+        r"i'?m allergic to\s+([a-z]+)",
+        r"i have (?:a|an) allergy to\s+([a-z]+)",
+    ],
+    'favorite_color': [
+        r"my favorite colo(?:u)?r is\s+([a-z]+)",
+        r"i love (?:the colo(?:u)?r )?\s*([a-z]+)",
+    ],
+    'hobby': [
+        r"i love (?:playing|doing)\s+([a-z]+)",
+        r"i enjoy\s+([a-z]+)",
+        r"i'?m (?:learning|studying)\s+([a-zA-Z\s]+?)(?:\.|$|!)",
+    ],
+    'preference': [
+        r"i (?:love|like|prefer)\s+([a-z\s]+)",
+        r"i (?:hate|dislike)\s+([a-z\s]+)",
+    ],
+}
+
+
+def extract_facts_from_text(text: str) -> dict:
+    """Extract structured facts from a text using regex patterns"""
+    facts = {}
+    text_lower = text.lower()
+    
+    for fact_type, patterns in EXTRACTION_PATTERNS.items():
+        for pattern in patterns:
+            match = re.search(pattern, text_lower if 'name' not in fact_type else text)
+            if match:
+                value = match.group(1).strip()
+                # Clean up the value
+                value = value.rstrip('.,!?')
+                if len(value) > 2:  # Minimum viable fact
+                    facts[fact_type] = value
+                    break  # Use first match
+    
+    return facts
+
+
+def create_declarative_memory(fact_type: str, value: str, source_memory: str, user_id: str = None):
+    """Create a declarative memory point for Qdrant"""
+    
+    # Create natural language fact statement
+    fact_templates = {
+        'name': f"The user's name is {value}",
+        'age': f"The user is {value} years old",
+        'location': f"The user lives in {value}",
+        'job': f"The user works as a {value}",
+        'workplace': f"The user works at {value}",
+        'pet_name': f"The user has a pet named {value}",
+        'allergy': f"The user is allergic to {value}",
+        'favorite_color': f"The user's favorite color is {value}",
+        'hobby': f"The user enjoys {value}",
+        'preference': f"The user likes {value}",
+    }
+    
+    fact_statement = fact_templates.get(fact_type, f"User fact: {fact_type} = {value}")
+    
+    # Create point structure (will need embeddings from Cat's LLM)
+    # For now, we'll create the structure and let Cat embed it
+    return {
+        'content': fact_statement,
+        'metadata': {
+            'type': 'declarative',
+            'fact_type': fact_type,
+            'fact_value': value,
+            'source': source_memory[:200],
+            'extracted_at': datetime.now().isoformat(),
+            'user_id': user_id or 'unknown',
+        }
+    }
+
+
+def extract_all_facts(client: QdrantClient):
+    """
+    Extract facts from all consolidated episodic memories.
+    Returns list of declarative memory points to be stored.
+    """
+    
+    print("🔍 Scanning episodic memories for facts...")
+    
+    # Get all consolidated episodic memories
+    episodic, _ = client.scroll(
+        collection_name='episodic',
+        limit=1000,
+        with_payload=True,
+        with_vectors=False
+    )
+    
+    # Only process consolidated memories
+    consolidated = [e for e in episodic if e.payload.get('metadata', {}).get('consolidated', False)]
+    
+    print(f"📊 Found {len(consolidated)} consolidated memories to analyze")
+    
+    all_facts = []
+    facts_by_type = {}
+    
+    for memory in consolidated:
+        content = memory.payload.get('page_content', '')
+        user_id = memory.payload.get('metadata', {}).get('user_id', 'unknown')
+        
+        # Extract facts from this memory
+        facts = extract_facts_from_text(content)
+        
+        if facts:
+            print(f"\n✅ Extracted from: '{content[:60]}...'")
+            for fact_type, value in facts.items():
+                print(f"   → {fact_type}: {value}")
+                
+                # Create declarative memory
+                decl_mem = create_declarative_memory(fact_type, value, content, user_id)
+                all_facts.append(decl_mem)
+                
+                # Track for summary
+                if fact_type not in facts_by_type:
+                    facts_by_type[fact_type] = []
+                facts_by_type[fact_type].append(value)
+    
+    # Summary
+    print("\n" + "=" * 70)
+    print("EXTRACTION SUMMARY")
+    print("=" * 70)
+    print(f"Total facts extracted: {len(all_facts)}")
+    print(f"\nBy type:")
+    for fact_type, values in sorted(facts_by_type.items()):
+        print(f"  {fact_type}: {len(values)} facts")
+        for val in values[:3]:
+            print(f"    - {val}")
+    
+    return all_facts
+
+
+def store_facts_to_file(facts: list, filename: str = 'extracted_facts.json'):
+    """Save extracted facts to JSON file for review"""
+    import json
+    with open(filename, 'w') as f:
+        json.dump(facts, f, indent=2)
+    print(f"\n📄 Facts saved to {filename}")
+
+
+def main():
+    print("=" * 70)
+    print("DECLARATIVE MEMORY EXTRACTION")
+    print("=" * 70)
+    
+    # Connect to Qdrant
+    client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=10, prefer_grpc=False)
+    
+    # Extract facts
+    facts = extract_all_facts(client)
+    
+    if not facts:
+        print("\n⚠️  No facts extracted. Ensure memories are consolidated first.")
+        return
+    
+    # Save to file for review
+    store_facts_to_file(facts, 'extracted_facts.json')
+    
+    print("\n" + "=" * 70)
+    print("NEXT STEPS:")
+    print("=" * 70)
+    print("1. Review extracted_facts.json to verify accuracy")
+    print("2. Facts need to be embedded and stored in Qdrant's declarative collection")
+    print("3. This requires Cat's embedder (will implement in next step)")
+    print("4. Once stored, test recall with direct questions")
+    print("=" * 70)
+
+
+# Run the extraction only when this file is executed as a script.
+if __name__ == "__main__":
+    main()