#!/usr/bin/env python3
"""
Declarative Memory Extraction

After consolidation keeps important episodic memories, this script:
1. Analyzes kept memories
2. Extracts structured facts (name, age, location, preferences, etc.)
3. Stores facts in declarative memory collection
4. Enables better retrieval for direct questions

This is the KEY to making Phase 2 actually useful.
"""
import json
import re
from datetime import datetime
from typing import Optional

from qdrant_client import QdrantClient

QDRANT_HOST = "localhost"
QDRANT_PORT = 6333

# Fact types whose regexes capture a capitalized proper noun ([A-Z]...).
# These can never match against a lowercased text, so they are searched
# case-insensitively against the ORIGINAL text instead, and the capture is
# then required to start with an uppercase letter (proper-noun heuristic).
_PROPER_NOUN_TYPES = {'name', 'location', 'workplace', 'pet_name'}

# Fact extraction patterns
EXTRACTION_PATTERNS = {
    'name': [
        r"(?:my name is|i'm|i am|call me)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)",
        r"(?:this is|i'm)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s*(?:speaking|here)?",
    ],
    'age': [
        r"i'?m\s+(\d{1,3})\s+years?\s+old",
        r"i'?m\s+(\d{1,3})",
    ],
    'location': [
        r"i live in\s+([A-Z][a-zA-Z\s,]+?)(?:\.|$|,)",
        r"i'?m (?:from|in)\s+([A-Z][a-zA-Z\s,]+?)(?:\.|$|,)",
    ],
    'job': [
        r"i work (?:as|at)\s+(?:a|an)?\s*([a-zA-Z\s]+?)(?:at|in|for|\.|$)",
        r"i'?m a\s+([a-zA-Z\s]+?)(?:at|in|for|\.|$)",
    ],
    'workplace': [
        r"(?:i work|employed) (?:at|for|in)\s+([A-Z][a-zA-Z\s&]+?)(?:\.|$|,)",
    ],
    'pet_name': [
        r"my (?:cat|dog|pet)'?s name is\s+([A-Z][a-z]+)",
    ],
    'allergy': [
        r"i'?m allergic to\s+([a-z]+)",
        r"i have (?:a|an) allergy to\s+([a-z]+)",
    ],
    'favorite_color': [
        r"my favorite colo(?:u)?r is\s+([a-z]+)",
        r"i love (?:the colo(?:u)?r )?\s*([a-z]+)",
    ],
    'hobby': [
        r"i love (?:playing|doing)\s+([a-z]+)",
        r"i enjoy\s+([a-z]+)",
        r"i'?m (?:learning|studying)\s+([a-zA-Z\s]+?)(?:\.|$|!)",
    ],
    'preference': [
        r"i (?:love|like|prefer)\s+([a-z\s]+)",
        r"i (?:hate|dislike)\s+([a-z\s]+)",
    ],
}


def extract_facts_from_text(text: str) -> dict:
    """Extract structured facts from *text* using regex patterns.

    Returns a dict mapping fact_type -> captured value. At most one value
    is kept per fact type (the first pattern that yields an accepted match
    wins).
    """
    facts = {}
    text_lower = text.lower()

    for fact_type, patterns in EXTRACTION_PATTERNS.items():
        needs_capital = fact_type in _PROPER_NOUN_TYPES
        for pattern in patterns:
            if needs_capital:
                # BUGFIX: these patterns capture [A-Z]... and previously
                # only types containing the substring 'name' were searched
                # against the original text, so 'location' / 'workplace'
                # could never match. Match the literal prefix
                # case-insensitively but keep the original casing in the
                # capture so the proper-noun filter below still applies.
                match = re.search(pattern, text, re.IGNORECASE)
            else:
                match = re.search(pattern, text_lower)
            if not match:
                continue

            # Clean up the value
            value = match.group(1).strip().rstrip('.,!?')

            # Proper-noun heuristic: the capture must still start with an
            # uppercase letter after IGNORECASE matching.
            if needs_capital and not value[:1].isupper():
                continue

            # Minimum viable fact: reject very short junk, but always keep
            # purely numeric values (e.g. an age of "25" or "5", which the
            # old `len(value) > 2` check silently dropped).
            if len(value) > 2 or value.isdigit():
                facts[fact_type] = value
                break  # Use first accepted match for this fact type

    return facts


def create_declarative_memory(fact_type: str, value: str, source_memory: str,
                              user_id: Optional[str] = None):
    """Create a declarative memory point for Qdrant.

    Returns a dict with a natural-language 'content' statement and a
    'metadata' payload; the caller is expected to embed 'content' before
    storage (embeddings come from Cat's LLM, not this script).
    """
    # Create natural language fact statement
    fact_templates = {
        'name': f"The user's name is {value}",
        'age': f"The user is {value} years old",
        'location': f"The user lives in {value}",
        'job': f"The user works as a {value}",
        'workplace': f"The user works at {value}",
        'pet_name': f"The user has a pet named {value}",
        'allergy': f"The user is allergic to {value}",
        'favorite_color': f"The user's favorite color is {value}",
        'hobby': f"The user enjoys {value}",
        'preference': f"The user likes {value}",
    }

    fact_statement = fact_templates.get(fact_type, f"User fact: {fact_type} = {value}")

    # Create point structure (will need embeddings from Cat's LLM)
    # For now, we'll create the structure and let Cat embed it
    return {
        'content': fact_statement,
        'metadata': {
            'type': 'declarative',
            'fact_type': fact_type,
            'fact_value': value,
            'source': source_memory[:200],  # truncate to keep payloads small
            'extracted_at': datetime.now().isoformat(),
            'user_id': user_id or 'unknown',
        }
    }


def extract_all_facts(client: QdrantClient):
    """
    Extract facts from all consolidated episodic memories.
    Returns list of declarative memory points to be stored.
    """
    print("šŸ” Scanning episodic memories for facts...")

    # Get all consolidated episodic memories
    episodic, _ = client.scroll(
        collection_name='episodic',
        limit=1000,
        with_payload=True,
        with_vectors=False
    )

    # Only process consolidated memories
    consolidated = [e for e in episodic
                    if e.payload.get('metadata', {}).get('consolidated', False)]

    print(f"šŸ“Š Found {len(consolidated)} consolidated memories to analyze")

    all_facts = []
    facts_by_type = {}

    for memory in consolidated:
        content = memory.payload.get('page_content', '')
        user_id = memory.payload.get('metadata', {}).get('user_id', 'unknown')

        # Extract facts from this memory
        facts = extract_facts_from_text(content)

        if facts:
            print(f"\nāœ… Extracted from: '{content[:60]}...'")
            for fact_type, value in facts.items():
                print(f"   → {fact_type}: {value}")

                # Create declarative memory
                decl_mem = create_declarative_memory(fact_type, value, content, user_id)
                all_facts.append(decl_mem)

                # Track for summary
                if fact_type not in facts_by_type:
                    facts_by_type[fact_type] = []
                facts_by_type[fact_type].append(value)

    # Summary
    print("\n" + "=" * 70)
    print("EXTRACTION SUMMARY")
    print("=" * 70)
    print(f"Total facts extracted: {len(all_facts)}")
    print("\nBy type:")
    for fact_type, values in sorted(facts_by_type.items()):
        print(f"  {fact_type}: {len(values)} facts")
        for val in values[:3]:
            print(f"    - {val}")

    return all_facts


def store_facts_to_file(facts: list, filename: str = 'extracted_facts.json'):
    """Save extracted facts to JSON file for review."""
    with open(filename, 'w') as f:
        json.dump(facts, f, indent=2)
    # BUGFIX: the message previously printed a literal placeholder instead
    # of the actual filename.
    print(f"\nšŸ“„ Facts saved to {filename}")


def main():
    print("=" * 70)
    print("DECLARATIVE MEMORY EXTRACTION")
    print("=" * 70)

    # Connect to Qdrant
    client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT,
                          timeout=10, prefer_grpc=False)

    # Extract facts
    facts = extract_all_facts(client)

    if not facts:
        print("\nāš ļø No facts extracted. Ensure memories are consolidated first.")
        return

    # Save to file for review
    store_facts_to_file(facts, 'extracted_facts.json')

    print("\n" + "=" * 70)
    print("NEXT STEPS:")
    print("=" * 70)
    print("1. Review extracted_facts.json to verify accuracy")
    print("2. Facts need to be embedded and stored in Qdrant's declarative collection")
    print("3. This requires Cat's embedder (will implement in next step)")
    print("4. Once stored, test recall with direct questions")
    print("=" * 70)


if __name__ == "__main__":
    main()