add: cheshire-cat configuration, tooling, tests, and documentation
Configuration:
- .env.example, .gitignore, compose.yml (main docker compose)
- docker-compose-amd.yml (ROCm), docker-compose-macos.yml
- start.sh, stop.sh convenience scripts
- LICENSE (Apache 2.0, from upstream Cheshire Cat)

Memory management utilities:
- analyze_consolidation.py, manual_consolidation.py, verify_consolidation.py
- check_memories.py, extract_declarative_facts.py, store_declarative_facts.py
- compare_systems.py (system comparison tool)
- benchmark_cat.py, streaming_benchmark.py, streaming_benchmark_v2.py

Test suite:
- quick_test.py, test_setup.py, test_setup_simple.py
- test_consolidation_direct.py, test_declarative_recall.py, test_recall.py
- test_end_to_end.py, test_full_pipeline.py
- test_phase2.py, test_phase2_comprehensive.py

Documentation:
- README.md, QUICK_START.txt, TEST_README.md, SETUP_COMPLETE.md
- PHASE2_IMPLEMENTATION_NOTES.md, PHASE2_TEST_RESULTS.md
- POST_OPTIMIZATION_ANALYSIS.md
This commit is contained in:
217
cheshire-cat/extract_declarative_facts.py
Executable file
217
cheshire-cat/extract_declarative_facts.py
Executable file
@@ -0,0 +1,217 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Declarative Memory Extraction
|
||||
|
||||
After consolidation keeps important episodic memories, this script:
|
||||
1. Analyzes kept memories
|
||||
2. Extracts structured facts (name, age, location, preferences, etc.)
|
||||
3. Stores facts in declarative memory collection
|
||||
4. Enables better retrieval for direct questions
|
||||
|
||||
This is the KEY to making Phase 2 actually useful.
|
||||
"""
|
||||
|
||||
import re
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import PointStruct
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
# Connection settings passed to QdrantClient in main().
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333

# Fact types whose captured value is a proper noun. Their patterns are run
# against the ORIGINAL text so the capitalisation requirement ([A-Z]...) is
# meaningful and the stored value keeps its case. Every other fact type is
# matched against a lower-cased copy of the text.
CASE_SENSITIVE_FACT_TYPES = frozenset({'name', 'pet_name', 'location', 'workplace'})

# Shortest value accepted as a fact (ages are exempt, see _is_viable_fact).
MIN_FACT_LENGTH = 3

# Fact extraction patterns.
#
# Conventions:
# - Patterns for CASE_SENSITIVE_FACT_TYPES wrap their trigger phrase in a
#   scoped ``(?i:...)`` group so "I live in" / "i live in" both match while
#   the capture group still demands a capitalised word.
# - All other patterns are written in lower case only, because they are
#   applied to lower-cased text.
# - Every pattern starts with ``\b`` so a trigger cannot begin mid-word.
# - Stop words that end a lazy capture (at/in/for) are anchored with
#   whitespace and ``\b`` so they cannot fire inside a word ("engINeer").
EXTRACTION_PATTERNS = {
    'name': [
        r"\b(?i:my name is|i'm|i am|call me)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)",
        r"\b(?i:this is|i'm)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)\s*(?:speaking|here)?",
    ],
    'age': [
        r"\bi'?m\s+(\d{1,3})\s+years?\s+old",
        r"\bi'?m\s+(\d{1,3})\b",
    ],
    'location': [
        r"\b(?i:i live in)\s+([A-Z][a-zA-Z\s]+?)(?:[.,!?]|$)",
        r"\b(?i:i'?m (?:from|in))\s+([A-Z][a-zA-Z\s]+?)(?:[.,!?]|$)",
    ],
    'job': [
        # "i work at ..." is a workplace, not a job title, so only "as" here.
        r"\bi work as\s+(?:an?\s+)?([a-z][a-z\s]*?)(?:\s+(?:at|in|for)\b|[.,!?]|$)",
        r"\bi'?m an?\s+([a-z][a-z\s]*?)(?:\s+(?:at|in|for)\b|[.,!?]|$)",
    ],
    'workplace': [
        r"\b(?i:(?:i work|employed) (?:at|for|in))\s+([A-Z][a-zA-Z\s&]+?)(?:[.,!?]|$)",
    ],
    'pet_name': [
        r"\b(?i:my (?:cat|dog|pet)'?s name is)\s+([A-Z][a-z]+)",
    ],
    'allergy': [
        r"\bi'?m allergic to\s+([a-z]+)",
        r"\bi have an? allergy to\s+([a-z]+)",
    ],
    'favorite_color': [
        r"\bmy favou?rite colou?r is\s+([a-z]+)",
        # "the colo(u)r" is mandatory; otherwise "i love pizza" would be
        # recorded as a favourite colour.
        r"\bi love the colou?r\s+([a-z]+)",
    ],
    'hobby': [
        r"\bi love (?:playing|doing)\s+([a-z]+)",
        r"\bi enjoy\s+([a-z]+)",
        r"\bi'?m (?:learning|studying)\s+([a-z][a-z\s]*?)(?:[.,!?]|$)",
    ],
    'preference': [
        r"\bi (?:love|like|prefer)\s+([a-z\s]+)",
    ],
    # Kept apart from 'preference' so a dislike is never stored as a like.
    'dislike': [
        r"\bi (?:hate|dislike)\s+([a-z\s]+)",
    ],
}


def _is_viable_fact(fact_type: str, value: str) -> bool:
    """Return True when *value* is worth storing for *fact_type*."""
    if fact_type == 'age':
        # Ages are 1-3 digits; the generic length floor would drop "25".
        return value.isdigit()
    return len(value) >= MIN_FACT_LENGTH


def extract_facts_from_text(text: str) -> dict:
    """Extract structured facts from a text using regex patterns.

    For each fact type in EXTRACTION_PATTERNS the patterns are tried in
    order and the first one that yields a viable value wins.

    Args:
        text: Raw memory text. ``None`` or an empty string yields no facts.

    Returns:
        Mapping of fact type to extracted value. Proper-noun facts (see
        CASE_SENSITIVE_FACT_TYPES) keep their original capitalisation; all
        other values are lower-case.
    """
    facts = {}
    if not text:
        return facts

    text_lower = text.lower()

    for fact_type, patterns in EXTRACTION_PATTERNS.items():
        haystack = text if fact_type in CASE_SENSITIVE_FACT_TYPES else text_lower
        for pattern in patterns:
            match = re.search(pattern, haystack)
            if not match:
                continue
            # Trim whitespace and trailing punctuation from the capture.
            value = match.group(1).strip().rstrip('.,!?').strip()
            if _is_viable_fact(fact_type, value):
                facts[fact_type] = value
                break  # Use first viable match

    return facts
|
||||
|
||||
|
||||
def create_declarative_memory(fact_type: str, value: str, source_memory: str, user_id: str = None) -> dict:
    """Create a declarative memory record for a single extracted fact.

    The record is a plain dict (no vector); embedding is expected to happen
    in a later step before it is written to Qdrant.

    Args:
        fact_type: Kind of fact (e.g. 'name', 'age'). Unknown types fall
            back to a generic "User fact: <type> = <value>" statement.
        value: Extracted value for the fact.
        source_memory: Text the fact was extracted from; only the first
            200 characters are kept. ``None`` is treated as empty.
        user_id: Owner of the fact; falsy values are stored as 'unknown'.

    Returns:
        Dict with 'content' (natural-language statement) and 'metadata'.
    """
    # Templates are filled lazily so only the selected one is formatted.
    fact_templates = {
        'name': "The user's name is {value}",
        'age': "The user is {value} years old",
        'location': "The user lives in {value}",
        'job': "The user works as {article} {value}",
        'workplace': "The user works at {value}",
        'pet_name': "The user has a pet named {value}",
        'allergy': "The user is allergic to {value}",
        'favorite_color': "The user's favorite color is {value}",
        'hobby': "The user enjoys {value}",
        'preference': "The user likes {value}",
        'dislike': "The user dislikes {value}",
    }

    template = fact_templates.get(fact_type)
    if template is None:
        fact_statement = f"User fact: {fact_type} = {value}"
    else:
        # Pick "a"/"an" from the first letter so "engineer" reads correctly.
        starts_with_vowel = str(value)[:1].lower() in ('a', 'e', 'i', 'o', 'u')
        fact_statement = template.format(
            value=value,
            article='an' if starts_with_vowel else 'a',
        )

    return {
        'content': fact_statement,
        'metadata': {
            'type': 'declarative',
            'fact_type': fact_type,
            'fact_value': value,
            # Guard against a missing source so slicing cannot raise.
            'source': (source_memory or '')[:200],
            'extracted_at': datetime.now().isoformat(),
            'user_id': user_id or 'unknown',
        },
    }
|
||||
|
||||
|
||||
def extract_all_facts(client: QdrantClient):
    """Extract facts from all consolidated episodic memories.

    Scrolls through the whole 'episodic' collection page by page, keeps the
    points whose payload metadata has a truthy 'consolidated' flag, runs
    regex extraction on each one and prints a summary.

    Args:
        client: Connected Qdrant client.

    Returns:
        List of declarative memory dicts (see create_declarative_memory)
        to be stored.
    """
    print("🔍 Scanning episodic memories for facts...")

    # scroll() returns one page plus the offset of the next page; keep
    # going until the offset is None so memories past the first 1000 are
    # not silently skipped.
    episodic = []
    offset = None
    while True:
        page, offset = client.scroll(
            collection_name='episodic',
            limit=1000,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        episodic.extend(page)
        if offset is None or not page:
            break

    # Only process consolidated memories; tolerate missing payload/metadata.
    consolidated = [
        point for point in episodic
        if ((point.payload or {}).get('metadata') or {}).get('consolidated', False)
    ]

    print(f"📊 Found {len(consolidated)} consolidated memories to analyze")

    all_facts = []
    facts_by_type = {}

    for memory in consolidated:
        payload = memory.payload or {}
        content = payload.get('page_content') or ''
        user_id = (payload.get('metadata') or {}).get('user_id') or 'unknown'

        facts = extract_facts_from_text(content)
        if not facts:
            continue

        print(f"\n✅ Extracted from: '{content[:60]}...'")
        for fact_type, value in facts.items():
            print(f" → {fact_type}: {value}")

            all_facts.append(
                create_declarative_memory(fact_type, value, content, user_id)
            )
            # Track for summary
            facts_by_type.setdefault(fact_type, []).append(value)

    # Summary
    print("\n" + "=" * 70)
    print("EXTRACTION SUMMARY")
    print("=" * 70)
    print(f"Total facts extracted: {len(all_facts)}")
    print("\nBy type:")
    for fact_type, values in sorted(facts_by_type.items()):
        print(f" {fact_type}: {len(values)} facts")
        # Show at most three sample values per type.
        for val in values[:3]:
            print(f" - {val}")

    return all_facts
|
||||
|
||||
|
||||
def store_facts_to_file(facts: list, filename: str = 'extracted_facts.json'):
    """Save extracted facts to a JSON file for review.

    The file is written as UTF-8 with non-ASCII characters left unescaped,
    so names and places in any script stay readable when a person opens it.

    Args:
        facts: JSON-serialisable list of fact dicts.
        filename: Destination path; overwritten if it already exists.
    """
    import json

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(facts, f, indent=2, ensure_ascii=False)
    print(f"\n📄 Facts saved to {filename}")
|
||||
|
||||
|
||||
def main():
    """Run the extraction pipeline: connect, extract, save, print next steps."""
    rule = "=" * 70

    print(rule)
    print("DECLARATIVE MEMORY EXTRACTION")
    print(rule)

    # Plain HTTP connection to Qdrant with a 10 second timeout.
    client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=10, prefer_grpc=False)

    facts = extract_all_facts(client)

    if facts:
        # Persist to disk so the facts can be reviewed before storage.
        store_facts_to_file(facts, 'extracted_facts.json')

        print("\n" + rule)
        print("NEXT STEPS:")
        print(rule)
        next_steps = (
            "1. Review extracted_facts.json to verify accuracy",
            "2. Facts need to be embedded and stored in Qdrant's declarative collection",
            "3. This requires Cat's embedder (will implement in next step)",
            "4. Once stored, test recall with direct questions",
        )
        for step in next_steps:
            print(step)
        print(rule)
    else:
        print("\n⚠️ No facts extracted. Ensure memories are consolidated first.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user