add: cheshire-cat configuration, tooling, tests, and documentation
Configuration: - .env.example, .gitignore, compose.yml (main docker compose) - docker-compose-amd.yml (ROCm), docker-compose-macos.yml - start.sh, stop.sh convenience scripts - LICENSE (Apache 2.0, from upstream Cheshire Cat) Memory management utilities: - analyze_consolidation.py, manual_consolidation.py, verify_consolidation.py - check_memories.py, extract_declarative_facts.py, store_declarative_facts.py - compare_systems.py (system comparison tool) - benchmark_cat.py, streaming_benchmark.py, streaming_benchmark_v2.py Test suite: - quick_test.py, test_setup.py, test_setup_simple.py - test_consolidation_direct.py, test_declarative_recall.py, test_recall.py - test_end_to_end.py, test_full_pipeline.py - test_phase2.py, test_phase2_comprehensive.py Documentation: - README.md, QUICK_START.txt, TEST_README.md, SETUP_COMPLETE.md - PHASE2_IMPLEMENTATION_NOTES.md, PHASE2_TEST_RESULTS.md - POST_OPTIMIZATION_ANALYSIS.md
This commit is contained in:
204
cheshire-cat/store_declarative_facts.py
Executable file
204
cheshire-cat/store_declarative_facts.py
Executable file
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Store extracted declarative facts into Qdrant's declarative memory collection.
|
||||
This enables direct retrieval for factual questions.
|
||||
|
||||
Uses sentence-transformers directly (same model Cat uses).
|
||||
"""
|
||||
|
||||
import json
|
||||
from qdrant_client import QdrantClient
|
||||
from uuid import uuid4
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
# Configuration
# Base URL handed to QdrantClient(url=...) when store_all_facts() connects.
QDRANT_URL = "http://localhost:6333"
# Default facts file passed to store_all_facts() by the __main__ entry point.
FACTS_FILE = "extracted_facts.json"

# Initialize embedder (same model as Cat uses)
# Starts as None; get_embedder() creates the model on first use and stores it
# here so later calls reuse the same instance.
embedder = None
|
||||
|
||||
def get_embedder():
    """Return the shared embedder, building it the first time it is needed.

    The instance is cached in the module-level ``embedder`` variable, so the
    model is only constructed once per process.
    """
    global embedder

    # Fast path: the model has already been created by an earlier call.
    if embedder is not None:
        return embedder

    print("🔧 Initializing sentence-transformers embedder...")
    # BAAI/bge-large-en-v1.5 produces 1024-dimensional vectors
    embedder = SentenceTransformer('BAAI/bge-large-en-v1.5')
    print("✅ Embedder ready\n")
    return embedder
|
||||
|
||||
def get_embedding(text: str) -> list:
    """
    Encode a piece of text into its embedding vector.

    Args:
        text: Text to embed

    Returns:
        The embedding as a plain Python list (the encoder's NumPy output
        converted with ``tolist()``)

    Raises:
        Exception: Any error raised while obtaining the embedder or encoding
            is reported on stdout and then re-raised unchanged.
    """
    try:
        model = get_embedder()
        encoded = model.encode(text, convert_to_numpy=True)
        return encoded.tolist()
    except Exception as err:
        # Report here for context, but let the caller decide how to recover.
        print(f"❌ Error generating embedding: {err}")
        raise
|
||||
|
||||
def store_fact_in_qdrant(client: QdrantClient, fact: dict) -> str:
    """
    Embed one fact and upsert it into the "declarative" collection.

    Args:
        client: Qdrant client instance
        fact: Fact dictionary with 'content' and 'metadata'; the metadata
            must provide 'source', 'extracted_at', 'fact_type', 'fact_value'
            and 'user_id'

    Returns:
        The newly generated point ID (a UUID4 string)

    Raises:
        Exception: Any failure (missing key, embedding error, Qdrant error)
            is reported on stdout and then re-raised unchanged.
    """
    try:
        text = fact['content']
        print(f" 🔄 Embedding: '{text}'")
        vector = get_embedding(text)

        # Every stored fact gets a fresh random identifier.
        new_id = str(uuid4())

        details = fact['metadata']
        record = {
            "id": new_id,
            "vector": vector,
            "payload": {
                # Core content (Cat standard)
                "page_content": text,
                # Metadata nested object (Cat requires this structure)
                "metadata": {
                    "source": details['source'],
                    "when": details['extracted_at'],
                    # Additional metadata for our tracking
                    "fact_type": details['fact_type'],
                    "fact_value": details['fact_value'],
                    "user_id": details['user_id'],
                },
            },
        }

        client.upsert(collection_name="declarative", points=[record])

        print(f" ✅ Stored with ID: {new_id}")
        return new_id

    except Exception as err:
        print(f" ❌ Error storing fact: {err}")
        raise
|
||||
|
||||
def _fact_field(payload, key, default=None):
    """Read a fact field from a stored point payload.

    store_fact_in_qdrant() writes 'fact_type' / 'fact_value' inside the nested
    payload['metadata'] object, so that location is checked first. A top-level
    lookup is kept as a fallback for payloads that carry the key directly.
    """
    if not isinstance(payload, dict):
        return default
    nested = payload.get('metadata')
    if isinstance(nested, dict) and nested.get(key) is not None:
        return nested[key]
    value = payload.get(key)
    return default if value is None else value


def store_all_facts(facts_file: str):
    """
    Load extracted facts and store them in Qdrant's declarative collection.

    The function prints its progress and returns None in every case. It stops
    early (after printing an error) when the facts file is missing or is not
    valid JSON, when Qdrant cannot be reached, or when the 'declarative'
    collection does not exist. Failures on individual facts are counted and
    reported in the summary without aborting the remaining facts.

    Args:
        facts_file: Path to JSON file with extracted facts
    """
    print("=" * 70)
    print("DECLARATIVE MEMORY STORAGE")
    print("=" * 70)

    # Load extracted facts
    print(f"📂 Loading facts from {facts_file}...")
    try:
        # JSON is read as UTF-8 explicitly rather than via the platform default.
        with open(facts_file, 'r', encoding='utf-8') as f:
            facts = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: {facts_file} not found. Run extract_declarative_facts.py first.")
        return
    except json.JSONDecodeError as e:
        print(f"❌ Error parsing JSON: {e}")
        return
    print(f"📊 Loaded {len(facts)} facts to store\n")

    # Connect to Qdrant
    print(f"🔌 Connecting to Qdrant at {QDRANT_URL}...")
    try:
        client = QdrantClient(url=QDRANT_URL)
        # Verify declarative collection exists
        collections = client.get_collections().collections
        if not any(c.name == "declarative" for c in collections):
            print("❌ Error: 'declarative' collection not found in Qdrant")
            return
        print("✅ Connected to Qdrant\n")
    except Exception as e:
        print(f"❌ Error connecting to Qdrant: {e}")
        return

    # Store each fact
    stored_count = 0
    failed_count = 0
    total = len(facts)

    for i, fact in enumerate(facts, 1):
        try:
            # Metadata access lives inside the try so that one malformed entry
            # is counted as a failure instead of aborting the whole batch.
            metadata = fact['metadata']
            fact_type = metadata['fact_type']
            fact_value = metadata['fact_value']

            print(f"[{i}/{total}] Storing {fact_type}: {fact_value}")

            store_fact_in_qdrant(client, fact)
            stored_count += 1
        except Exception as e:
            print(f" ❌ Failed: {e}")
            failed_count += 1

        print()

    # Summary
    print("=" * 70)
    print("STORAGE SUMMARY")
    print("=" * 70)
    print(f"✅ Successfully stored: {stored_count} facts")
    if failed_count > 0:
        print(f"❌ Failed to store: {failed_count} facts")
    print()

    # Verify storage
    print("🔍 Verifying storage...")
    try:
        # NOTE: this inspects only the first 10 points returned by scroll(),
        # so the count below is a sample, not a total for the collection.
        result = client.scroll(
            collection_name="declarative",
            limit=10,
            with_payload=True,
            with_vectors=False
        )

        # result[0] is the list of points; fact fields are nested under
        # payload['metadata'], which is where store_fact_in_qdrant puts them.
        declarative_facts = [
            p for p in result[0]
            if _fact_field(p.payload, 'fact_type') is not None
        ]

        print(f"📊 Found {len(declarative_facts)} declarative facts in Qdrant")

        if declarative_facts:
            print("\n📝 Sample stored facts:")
            for point in declarative_facts[:5]:
                fact_type = _fact_field(point.payload, 'fact_type', 'unknown')
                fact_value = _fact_field(point.payload, 'fact_value', 'unknown')
                print(f" - {fact_type}: {fact_value}")

    except Exception as e:
        print(f"❌ Error verifying storage: {e}")

    print()
    print("=" * 70)
    print("NEXT STEPS")
    print("=" * 70)
    print("1. Test recall by asking Miku factual questions")
    print("2. Example queries:")
    print(" - 'What is my favorite color?'")
    print(" - 'Where do I work?'")
    print(" - 'What are my hobbies?'")
    print("3. If recall still fails, check Cat's retrieval logic")
    print("=" * 70)
|
||||
|
||||
# Script entry point: when run directly, store the facts found in the default
# FACTS_FILE path.
if __name__ == "__main__":
    store_all_facts(FACTS_FILE)
|
||||
Reference in New Issue
Block a user