#!/usr/bin/env python3
"""
Declarative Memory Extraction

After consolidation keeps important episodic memories, this script:

1. Analyzes kept memories
2. Extracts structured facts (name, age, location, preferences, etc.)
3. Stores facts in declarative memory collection
4. Enables better retrieval for direct questions

This is the KEY to making Phase 2 actually useful.
"""
import re
import uuid
from datetime import datetime

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
# Qdrant connection settings (default local deployment).
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
# Fact extraction patterns
#
# Each fact type maps to an ordered list of regexes; for a given type the
# first pattern that matches wins.  Patterns are searched case-insensitively
# against the ORIGINAL text (see extract_facts_from_text); proper-noun fact
# types additionally require the captured value to start with an uppercase
# letter, which filters false positives such as reading "i'm tired" as a name.
EXTRACTION_PATTERNS = {
    'name': [
        r"(?:my name is|i'm|i am|call me)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)",
        r"(?:this is|i'm)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s*(?:speaking|here)?",
    ],
    'age': [
        r"i'?m\s+(\d{1,3})\s+years?\s+old",
        r"i'?m\s+(\d{1,3})",
    ],
    'location': [
        r"i live in\s+([A-Z][a-zA-Z\s,]+?)(?:\.|$|,)",
        r"i'?m (?:from|in)\s+([A-Z][a-zA-Z\s,]+?)(?:\.|$|,)",
    ],
    'job': [
        r"i work (?:as|at)\s+(?:a|an)?\s*([a-zA-Z\s]+?)(?:at|in|for|\.|$)",
        r"i'?m a\s+([a-zA-Z\s]+?)(?:at|in|for|\.|$)",
    ],
    'workplace': [
        r"(?:i work|employed) (?:at|for|in)\s+([A-Z][a-zA-Z\s&]+?)(?:\.|$|,)",
    ],
    'pet_name': [
        r"my (?:cat|dog|pet)'?s name is\s+([A-Z][a-z]+)",
    ],
    'allergy': [
        r"i'?m allergic to\s+([a-z]+)",
        r"i have (?:a|an) allergy to\s+([a-z]+)",
    ],
    'favorite_color': [
        r"my favorite colo(?:u)?r is\s+([a-z]+)",
        r"i love (?:the colo(?:u)?r )?\s*([a-z]+)",
    ],
    'hobby': [
        r"i love (?:playing|doing)\s+([a-z]+)",
        r"i enjoy\s+([a-z]+)",
        r"i'?m (?:learning|studying)\s+([a-zA-Z\s]+?)(?:\.|$|!)",
    ],
    'preference': [
        r"i (?:love|like|prefer)\s+([a-z\s]+)",
        r"i (?:hate|dislike)\s+([a-z\s]+)",
    ],
}

# Fact types whose values are proper nouns: the captured value must be
# capitalized in the source text to be trusted.
_CAPITALIZED_TYPES = {'name', 'pet_name', 'location', 'workplace'}


def extract_facts_from_text(text: str) -> dict:
    """Extract structured facts from a text using regex patterns.

    Args:
        text: Raw (original-case) memory content.

    Returns:
        Mapping of fact type (e.g. 'name', 'age') to the extracted value,
        at most one value per type (first matching pattern wins).

    Bug fixes vs. the previous version:
      * Patterns are now searched case-insensitively against the original
        text.  Previously, lowercase literal prefixes ("my name is") were
        searched against the original-case text for name facts and never
        matched normally capitalized sentences, while patterns with [A-Z]
        capture groups (location, workplace) were searched against the
        lowercased text and could never match at all.
      * Purely numeric values (two-digit ages like "35") are no longer
        rejected by the minimum-length check.
    """
    facts = {}

    for fact_type, patterns in EXTRACTION_PATTERNS.items():
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue

            value = match.group(1).strip().rstrip('.,!?')

            if fact_type in _CAPITALIZED_TYPES:
                # Proper-noun facts must be capitalized in the source text;
                # otherwise treat the match as a false positive and try the
                # next pattern.
                if not value[:1].isupper():
                    continue
            else:
                # Normalize non-proper-noun values to lowercase, matching
                # the output shape of the original text_lower-based search.
                value = value.lower()

            # Minimum viable fact; purely numeric values (ages) may be short.
            if len(value) > 2 or value.isdigit():
                facts[fact_type] = value
                break  # Use first match for this fact type

    return facts
def create_declarative_memory(fact_type: str, value: str, source_memory: str, user_id: str = None):
    """Build a declarative memory record for one extracted fact.

    Renders the fact as a natural-language statement and bundles it with
    provenance metadata.  Embedding happens later (handled by Cat's LLM),
    so this returns a plain dict rather than a Qdrant PointStruct.

    Args:
        fact_type: Category key such as 'name', 'age', 'allergy'.
        value: The extracted fact value.
        source_memory: Original memory text the fact came from.
        user_id: Owner of the fact; defaults to 'unknown' when falsy.
    """
    # Natural-language sentence templates, keyed by fact type.
    templates = {
        'name': f"The user's name is {value}",
        'age': f"The user is {value} years old",
        'location': f"The user lives in {value}",
        'job': f"The user works as a {value}",
        'workplace': f"The user works at {value}",
        'pet_name': f"The user has a pet named {value}",
        'allergy': f"The user is allergic to {value}",
        'favorite_color': f"The user's favorite color is {value}",
        'hobby': f"The user enjoys {value}",
        'preference': f"The user likes {value}",
    }

    # Unknown fact types fall back to a generic key/value statement.
    statement = templates.get(fact_type)
    if statement is None:
        statement = f"User fact: {fact_type} = {value}"

    metadata = {
        'type': 'declarative',
        'fact_type': fact_type,
        'fact_value': value,
        # Keep only a short provenance snippet of the source memory.
        'source': source_memory[:200],
        'extracted_at': datetime.now().isoformat(),
        'user_id': user_id or 'unknown',
    }

    return {'content': statement, 'metadata': metadata}
def extract_all_facts(client: QdrantClient):
    """
    Extract facts from all consolidated episodic memories.
    Returns list of declarative memory points to be stored.
    """

    print("🔍 Scanning episodic memories for facts...")

    # Pull episodic memories; payloads only, vectors are not needed here.
    records, _ = client.scroll(
        collection_name='episodic',
        with_payload=True,
        with_vectors=False,
        limit=1000,
    )

    # Restrict to memories that survived consolidation.
    consolidated = [
        r for r in records
        if r.payload.get('metadata', {}).get('consolidated', False)
    ]

    print(f"📊 Found {len(consolidated)} consolidated memories to analyze")

    all_facts = []
    facts_by_type = {}

    for record in consolidated:
        content = record.payload.get('page_content', '')
        user_id = record.payload.get('metadata', {}).get('user_id', 'unknown')

        # Run the regex extractors over this memory's text.
        extracted = extract_facts_from_text(content)
        if not extracted:
            continue

        print(f"\n✅ Extracted from: '{content[:60]}...'")
        for fact_type, value in extracted.items():
            print(f" → {fact_type}: {value}")

            # Wrap the fact as a declarative memory record.
            all_facts.append(
                create_declarative_memory(fact_type, value, content, user_id)
            )

            # Tally per-type values for the summary below.
            facts_by_type.setdefault(fact_type, []).append(value)

    # Summary banner.
    divider = "=" * 70
    print("\n" + divider)
    print("EXTRACTION SUMMARY")
    print(divider)
    print(f"Total facts extracted: {len(all_facts)}")
    print("\nBy type:")
    for fact_type, values in sorted(facts_by_type.items()):
        print(f" {fact_type}: {len(values)} facts")
        for val in values[:3]:
            print(f" - {val}")

    return all_facts
def store_facts_to_file(facts: list, filename: str = 'extracted_facts.json'):
    """Save extracted facts to a JSON file for human review.

    Args:
        facts: Declarative memory dicts produced by create_declarative_memory.
        filename: Destination path for the JSON dump.
    """
    import json
    # utf-8 keeps any non-ASCII fact values readable in the dump.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(facts, f, indent=2)
    # Fix: the confirmation message previously did not report the actual
    # filename the facts were written to.
    print(f"\n📄 Facts saved to {filename}")
def main():
    """Entry point: connect to Qdrant, extract facts, dump them for review."""
    banner = "=" * 70
    print(banner)
    print("DECLARATIVE MEMORY EXTRACTION")
    print(banner)

    # Connect to Qdrant over HTTP (gRPC disabled).
    client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=10, prefer_grpc=False)

    # Scan consolidated episodic memories for structured facts.
    facts = extract_all_facts(client)
    if not facts:
        print("\n⚠️ No facts extracted. Ensure memories are consolidated first.")
        return

    # Persist for human review before embedding/storing in Qdrant.
    store_facts_to_file(facts, 'extracted_facts.json')

    next_steps = (
        "1. Review extracted_facts.json to verify accuracy",
        "2. Facts need to be embedded and stored in Qdrant's declarative collection",
        "3. This requires Cat's embedder (will implement in next step)",
        "4. Once stored, test recall with direct questions",
    )
    print("\n" + banner)
    print("NEXT STEPS:")
    print(banner)
    for step in next_steps:
        print(step)
    print(banner)
# Run extraction only when executed as a script (not on import).
if __name__ == "__main__":
    main()