# miku-discord/cheshire-cat/extract_declarative_facts.py
#!/usr/bin/env python3
"""
Declarative Memory Extraction
After consolidation keeps important episodic memories, this script:
1. Analyzes kept memories
2. Extracts structured facts (name, age, location, preferences, etc.)
3. Stores facts in declarative memory collection
4. Enables better retrieval for direct questions
This is the KEY to making Phase 2 actually useful.
"""
import re
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
import uuid
from datetime import datetime
# Qdrant connection settings — assumes a local, unauthenticated instance
# on the default HTTP port.
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
# Fact extraction patterns.
# Patterns containing an explicit [A-Z] class rely on the original casing of
# the text (e.g. capitalized names/places); all others are written in
# lowercase and are matched against a lowercased copy of the text.
EXTRACTION_PATTERNS = {
    'name': [
        r"(?:my name is|i'm|i am|call me)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)",
        r"(?:this is|i'm)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s*(?:speaking|here)?",
    ],
    'age': [
        r"i'?m\s+(\d{1,3})\s+years?\s+old",
        r"i'?m\s+(\d{1,3})",
    ],
    'location': [
        r"i live in\s+([A-Z][a-zA-Z\s,]+?)(?:\.|$|,)",
        r"i'?m (?:from|in)\s+([A-Z][a-zA-Z\s,]+?)(?:\.|$|,)",
    ],
    'job': [
        r"i work (?:as|at)\s+(?:a|an)?\s*([a-zA-Z\s]+?)(?:at|in|for|\.|$)",
        r"i'?m a\s+([a-zA-Z\s]+?)(?:at|in|for|\.|$)",
    ],
    'workplace': [
        r"(?:i work|employed) (?:at|for|in)\s+([A-Z][a-zA-Z\s&]+?)(?:\.|$|,)",
    ],
    'pet_name': [
        r"my (?:cat|dog|pet)'?s name is\s+([A-Z][a-z]+)",
    ],
    'allergy': [
        r"i'?m allergic to\s+([a-z]+)",
        r"i have (?:a|an) allergy to\s+([a-z]+)",
    ],
    'favorite_color': [
        r"my favorite colo(?:u)?r is\s+([a-z]+)",
        r"i love (?:the colo(?:u)?r )?\s*([a-z]+)",
    ],
    'hobby': [
        r"i love (?:playing|doing)\s+([a-z]+)",
        r"i enjoy\s+([a-z]+)",
        r"i'?m (?:learning|studying)\s+([a-zA-Z\s]+?)(?:\.|$|!)",
    ],
    'preference': [
        r"i (?:love|like|prefer)\s+([a-z\s]+)",
        r"i (?:hate|dislike)\s+([a-z\s]+)",
    ],
}


def extract_facts_from_text(text: str) -> dict:
    """Extract structured facts from *text* using regex patterns.

    Returns a dict mapping fact_type -> extracted value. For each fact
    type the first matching pattern wins.

    Fixes over the original implementation:
    - Any pattern that requires capitalization (contains ``[A-Z]``) is now
      matched against the ORIGINAL text. Previously only fact types whose
      name contained "name" kept the original case, so the 'location' and
      'workplace' patterns were run against lowercased text and could
      never match.
    - Purely numeric values (e.g. a two-digit age like "25") are no longer
      rejected by the minimum-length filter, which used to require more
      than two characters.
    """
    facts = {}
    text_lower = text.lower()
    for fact_type, patterns in EXTRACTION_PATTERNS.items():
        for pattern in patterns:
            # Case-sensitive patterns need the original casing; the rest
            # match the lowercased copy.
            haystack = text if '[A-Z]' in pattern else text_lower
            match = re.search(pattern, haystack)
            if match:
                # Trim whitespace and trailing sentence punctuation.
                value = match.group(1).strip().rstrip('.,!?')
                # Minimum viable fact: 3+ characters, or any pure number
                # (so two-digit ages survive the filter).
                if len(value) > 2 or value.isdigit():
                    facts[fact_type] = value
                break  # Use first match
    return facts
def create_declarative_memory(fact_type: str, value: str, source_memory: str, user_id: str = None):
    """Build a declarative-memory record for a single extracted fact.

    Produces a dict with a natural-language ``content`` statement plus
    provenance ``metadata``. Embedding and storage in Qdrant are left to
    the Cat runtime — this only shapes the record.
    """
    # Natural-language templates per known fact type.
    templates = {
        'name': f"The user's name is {value}",
        'age': f"The user is {value} years old",
        'location': f"The user lives in {value}",
        'job': f"The user works as a {value}",
        'workplace': f"The user works at {value}",
        'pet_name': f"The user has a pet named {value}",
        'allergy': f"The user is allergic to {value}",
        'favorite_color': f"The user's favorite color is {value}",
        'hobby': f"The user enjoys {value}",
        'preference': f"The user likes {value}",
    }
    # Unknown fact types fall back to a generic "key = value" statement.
    statement = templates.get(fact_type, f"User fact: {fact_type} = {value}")
    metadata = {
        'type': 'declarative',
        'fact_type': fact_type,
        'fact_value': value,
        'source': source_memory[:200],  # truncated provenance excerpt
        'extracted_at': datetime.now().isoformat(),
        'user_id': user_id or 'unknown',
    }
    # The point structure is embedded later by the Cat's LLM/embedder.
    return {'content': statement, 'metadata': metadata}
def extract_all_facts(client: QdrantClient):
    """
    Scan consolidated episodic memories and mine them for structured facts.

    Returns the list of declarative-memory records (dicts) ready to be
    embedded and stored.
    """
    print("🔍 Scanning episodic memories for facts...")
    # Pull up to 1000 episodic points; payloads only, vectors not needed.
    records, _ = client.scroll(
        collection_name='episodic',
        limit=1000,
        with_payload=True,
        with_vectors=False
    )
    # Restrict to memories that survived consolidation.
    consolidated = [
        r for r in records
        if r.payload.get('metadata', {}).get('consolidated', False)
    ]
    print(f"📊 Found {len(consolidated)} consolidated memories to analyze")
    extracted = []
    by_type = {}
    for record in consolidated:
        text = record.payload.get('page_content', '')
        owner = record.payload.get('metadata', {}).get('user_id', 'unknown')
        found = extract_facts_from_text(text)
        if not found:
            continue
        print(f"\n✅ Extracted from: '{text[:60]}...'")
        for kind, val in found.items():
            print(f"{kind}: {val}")
            # One declarative record per extracted fact.
            extracted.append(create_declarative_memory(kind, val, text, owner))
            by_type.setdefault(kind, []).append(val)
    # Summary report.
    print("\n" + "=" * 70)
    print("EXTRACTION SUMMARY")
    print("=" * 70)
    print(f"Total facts extracted: {len(extracted)}")
    print(f"\nBy type:")
    for kind, vals in sorted(by_type.items()):
        print(f" {kind}: {len(vals)} facts")
        # Show at most three sample values per type.
        for v in vals[:3]:
            print(f" - {v}")
    return extracted
def store_facts_to_file(facts: list, filename: str = 'extracted_facts.json'):
    """Save extracted facts to a JSON file for human review.

    Args:
        facts: list of declarative-memory records (JSON-serializable dicts).
        filename: destination path for the JSON dump.
    """
    import json
    with open(filename, 'w') as f:
        json.dump(facts, f, indent=2)
    # Bug fix: the original message printed a literal "(unknown)" instead
    # of interpolating the destination filename.
    print(f"\n📄 Facts saved to {filename}")
def main():
    """Entry point: connect to Qdrant, extract facts, dump them for review."""
    banner = "=" * 70
    print(banner)
    print("DECLARATIVE MEMORY EXTRACTION")
    print(banner)
    # Plain HTTP client against the local Qdrant instance; gRPC disabled.
    client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=10, prefer_grpc=False)
    facts = extract_all_facts(client)
    # Guard clause: nothing to store when no facts were found.
    if not facts:
        print("\n⚠️ No facts extracted. Ensure memories are consolidated first.")
        return
    # Persist for manual review before anything is written back to Qdrant.
    store_facts_to_file(facts, 'extracted_facts.json')
    print("\n" + banner)
    print("NEXT STEPS:")
    print(banner)
    print("1. Review extracted_facts.json to verify accuracy")
    print("2. Facts need to be embedded and stored in Qdrant's declarative collection")
    print("3. This requires Cat's embedder (will implement in next step)")
    print("4. Once stored, test recall with direct questions")
    print(banner)


if __name__ == "__main__":
    main()