# miku-discord/cheshire-cat/extract_declarative_facts.py
#!/usr/bin/env python3
"""
Declarative Memory Extraction
After consolidation keeps important episodic memories, this script:
1. Analyzes kept memories
2. Extracts structured facts (name, age, location, preferences, etc.)
3. Stores facts in declarative memory collection
4. Enables better retrieval for direct questions
This is the KEY to making Phase 2 actually useful.
"""
import re
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
import uuid
from datetime import datetime
# Qdrant connection settings — assumes a local, unauthenticated instance
# on the default HTTP port.
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
# Fact extraction patterns.
# Patterns containing an explicit [A-Z] class rely on the original casing of
# the text (e.g. capitalized names/places); all others are written in
# lowercase and are matched against a lowercased copy of the text.
EXTRACTION_PATTERNS = {
    'name': [
        r"(?:my name is|i'm|i am|call me)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)",
        r"(?:this is|i'm)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s*(?:speaking|here)?",
    ],
    'age': [
        r"i'?m\s+(\d{1,3})\s+years?\s+old",
        r"i'?m\s+(\d{1,3})",
    ],
    'location': [
        r"i live in\s+([A-Z][a-zA-Z\s,]+?)(?:\.|$|,)",
        r"i'?m (?:from|in)\s+([A-Z][a-zA-Z\s,]+?)(?:\.|$|,)",
    ],
    'job': [
        r"i work (?:as|at)\s+(?:a|an)?\s*([a-zA-Z\s]+?)(?:at|in|for|\.|$)",
        r"i'?m a\s+([a-zA-Z\s]+?)(?:at|in|for|\.|$)",
    ],
    'workplace': [
        r"(?:i work|employed) (?:at|for|in)\s+([A-Z][a-zA-Z\s&]+?)(?:\.|$|,)",
    ],
    'pet_name': [
        r"my (?:cat|dog|pet)'?s name is\s+([A-Z][a-z]+)",
    ],
    'allergy': [
        r"i'?m allergic to\s+([a-z]+)",
        r"i have (?:a|an) allergy to\s+([a-z]+)",
    ],
    'favorite_color': [
        r"my favorite colo(?:u)?r is\s+([a-z]+)",
        r"i love (?:the colo(?:u)?r )?\s*([a-z]+)",
    ],
    'hobby': [
        r"i love (?:playing|doing)\s+([a-z]+)",
        r"i enjoy\s+([a-z]+)",
        r"i'?m (?:learning|studying)\s+([a-zA-Z\s]+?)(?:\.|$|!)",
    ],
    'preference': [
        r"i (?:love|like|prefer)\s+([a-z\s]+)",
        r"i (?:hate|dislike)\s+([a-z\s]+)",
    ],
}


def extract_facts_from_text(text: str) -> dict:
    """Extract structured facts from *text* using regex patterns.

    Returns a dict mapping fact_type -> extracted value. For each fact
    type the first matching pattern wins.

    Fixes over the original implementation:
    - Any pattern that requires capitalization (contains ``[A-Z]``) is now
      matched against the ORIGINAL text. Previously only fact types whose
      name contained "name" kept the original case, so the 'location' and
      'workplace' patterns were run against lowercased text and could
      never match.
    - Purely numeric values (e.g. a two-digit age like "25") are no longer
      rejected by the minimum-length filter, which used to require more
      than two characters.
    """
    facts = {}
    text_lower = text.lower()
    for fact_type, patterns in EXTRACTION_PATTERNS.items():
        for pattern in patterns:
            # Case-sensitive patterns need the original casing; the rest
            # match the lowercased copy.
            haystack = text if '[A-Z]' in pattern else text_lower
            match = re.search(pattern, haystack)
            if match:
                # Trim whitespace and trailing sentence punctuation.
                value = match.group(1).strip().rstrip('.,!?')
                # Minimum viable fact: 3+ characters, or any pure number
                # (so two-digit ages survive the filter).
                if len(value) > 2 or value.isdigit():
                    facts[fact_type] = value
                break  # Use first match
    return facts
def create_declarative_memory(fact_type: str, value: str, source_memory: str, user_id: str = None):
    """Build a declarative-memory record for a single extracted fact.

    Produces a dict with a natural-language ``content`` statement plus
    provenance ``metadata``. Embedding and storage in Qdrant are left to
    the Cat runtime — this only shapes the record.
    """
    # Natural-language templates per known fact type.
    templates = {
        'name': f"The user's name is {value}",
        'age': f"The user is {value} years old",
        'location': f"The user lives in {value}",
        'job': f"The user works as a {value}",
        'workplace': f"The user works at {value}",
        'pet_name': f"The user has a pet named {value}",
        'allergy': f"The user is allergic to {value}",
        'favorite_color': f"The user's favorite color is {value}",
        'hobby': f"The user enjoys {value}",
        'preference': f"The user likes {value}",
    }
    # Unknown fact types fall back to a generic "key = value" statement.
    statement = templates.get(fact_type, f"User fact: {fact_type} = {value}")
    metadata = {
        'type': 'declarative',
        'fact_type': fact_type,
        'fact_value': value,
        'source': source_memory[:200],  # truncated provenance excerpt
        'extracted_at': datetime.now().isoformat(),
        'user_id': user_id or 'unknown',
    }
    # The point structure is embedded later by the Cat's LLM/embedder.
    return {'content': statement, 'metadata': metadata}
def extract_all_facts(client: QdrantClient):
    """
    Scan consolidated episodic memories and mine them for structured facts.

    Returns the list of declarative-memory records (dicts) ready to be
    embedded and stored.
    """
    print("🔍 Scanning episodic memories for facts...")
    # Pull up to 1000 episodic points; payloads only, vectors not needed.
    records, _ = client.scroll(
        collection_name='episodic',
        limit=1000,
        with_payload=True,
        with_vectors=False
    )
    # Restrict to memories that survived consolidation.
    consolidated = [
        r for r in records
        if r.payload.get('metadata', {}).get('consolidated', False)
    ]
    print(f"📊 Found {len(consolidated)} consolidated memories to analyze")
    extracted = []
    by_type = {}
    for record in consolidated:
        text = record.payload.get('page_content', '')
        owner = record.payload.get('metadata', {}).get('user_id', 'unknown')
        found = extract_facts_from_text(text)
        if not found:
            continue
        print(f"\n✅ Extracted from: '{text[:60]}...'")
        for kind, val in found.items():
            print(f"{kind}: {val}")
            # One declarative record per extracted fact.
            extracted.append(create_declarative_memory(kind, val, text, owner))
            by_type.setdefault(kind, []).append(val)
    # Summary report.
    print("\n" + "=" * 70)
    print("EXTRACTION SUMMARY")
    print("=" * 70)
    print(f"Total facts extracted: {len(extracted)}")
    print(f"\nBy type:")
    for kind, vals in sorted(by_type.items()):
        print(f" {kind}: {len(vals)} facts")
        # Show at most three sample values per type.
        for v in vals[:3]:
            print(f" - {v}")
    return extracted
def store_facts_to_file(facts: list, filename: str = 'extracted_facts.json'):
    """Save extracted facts to a JSON file for human review.

    Args:
        facts: list of declarative-memory records (JSON-serializable dicts).
        filename: destination path for the JSON dump.
    """
    import json
    with open(filename, 'w') as f:
        json.dump(facts, f, indent=2)
    # Bug fix: the original message printed a literal "(unknown)" instead
    # of interpolating the destination filename.
    print(f"\n📄 Facts saved to {filename}")
def main():
    """Entry point: connect to Qdrant, extract facts, dump them for review."""
    banner = "=" * 70
    print(banner)
    print("DECLARATIVE MEMORY EXTRACTION")
    print(banner)
    # Plain HTTP client against the local Qdrant instance; gRPC disabled.
    client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=10, prefer_grpc=False)
    facts = extract_all_facts(client)
    # Guard clause: nothing to store when no facts were found.
    if not facts:
        print("\n⚠️ No facts extracted. Ensure memories are consolidated first.")
        return
    # Persist for manual review before anything is written back to Qdrant.
    store_facts_to_file(facts, 'extracted_facts.json')
    print("\n" + banner)
    print("NEXT STEPS:")
    print(banner)
    print("1. Review extracted_facts.json to verify accuracy")
    print("2. Facts need to be embedded and stored in Qdrant's declarative collection")
    print("3. This requires Cat's embedder (will implement in next step)")
    print("4. Once stored, test recall with direct questions")
    print(banner)


if __name__ == "__main__":
    main()