#!/usr/bin/env python3
"""
Store extracted declarative facts into Qdrant's declarative memory collection.

This enables direct retrieval for factual questions.
Uses sentence-transformers directly (same model Cat uses).
"""

import json
from uuid import uuid4

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

# Configuration
QDRANT_URL = "http://localhost:6333"
FACTS_FILE = "extracted_facts.json"

# Lazily created embedder (same model as Cat uses); see get_embedder().
embedder = None


def get_embedder():
    """Get or create the shared embedder instance.

    The model is loaded on the first call only and cached in the
    module-level ``embedder`` variable for subsequent calls.

    Returns:
        The cached SentenceTransformer instance.
    """
    global embedder
    if embedder is None:
        print("šŸ”§ Initializing sentence-transformers embedder...")
        # Use BAAI/bge-large-en-v1.5 which produces 1024-dimensional vectors
        embedder = SentenceTransformer('BAAI/bge-large-en-v1.5')
        print("āœ… Embedder ready\n")
    return embedder


def get_embedding(text: str) -> list:
    """
    Get embedding vector for text.

    Args:
        text: Text to embed

    Returns:
        Embedding vector (list of floats)

    Raises:
        Exception: Any error from loading the model or encoding is
            reported on stdout and then re-raised unchanged.
    """
    try:
        emb = get_embedder()
        vector = emb.encode(text, convert_to_numpy=True).tolist()
        return vector
    except Exception as e:
        print(f"āŒ Error generating embedding: {e}")
        raise


def store_fact_in_qdrant(client: QdrantClient, fact: dict) -> str:
    """
    Store a single fact in Qdrant's declarative collection.

    Args:
        client: Qdrant client instance
        fact: Fact dictionary with 'content' and 'metadata'. The metadata
            must provide 'source', 'extracted_at', 'fact_type',
            'fact_value' and 'user_id'.

    Returns:
        Point ID (string)

    Raises:
        Exception: Any error (missing key, embedding failure, Qdrant
            error) is reported on stdout and then re-raised unchanged.
    """
    try:
        # Get embedding for the fact content
        print(f" šŸ”„ Embedding: '{fact['content']}'")
        embedding = get_embedding(fact['content'])

        # Generate unique ID
        point_id = str(uuid4())

        # Store in declarative collection with Cat-compatible structure
        client.upsert(
            collection_name="declarative",
            points=[{
                "id": point_id,
                "vector": embedding,
                "payload": {
                    # Core content (Cat standard)
                    "page_content": fact['content'],
                    # Metadata nested object (Cat requires this structure)
                    "metadata": {
                        "source": fact['metadata']['source'],
                        "when": fact['metadata']['extracted_at'],
                        # Additional metadata for our tracking
                        "fact_type": fact['metadata']['fact_type'],
                        "fact_value": fact['metadata']['fact_value'],
                        "user_id": fact['metadata']['user_id'],
                    },
                },
            }],
        )

        print(f" āœ… Stored with ID: {point_id}")
        return point_id
    except Exception as e:
        print(f" āŒ Error storing fact: {e}")
        raise


def _fact_metadata(payload) -> dict:
    """Return the dict holding ``fact_type`` / ``fact_value`` for a payload.

    store_fact_in_qdrant() writes these fields inside the nested
    ``metadata`` object, so that is checked first. A payload that carries
    ``fact_type`` at the top level is still recognised. An empty dict is
    returned when the payload is missing or holds no fact information.
    """
    if not isinstance(payload, dict):
        return {}
    nested = payload.get('metadata')
    if isinstance(nested, dict) and nested.get('fact_type') is not None:
        return nested
    if payload.get('fact_type') is not None:
        return payload
    return {}


def store_all_facts(facts_file: str):
    """
    Load extracted facts and store them in Qdrant's declarative collection.

    Progress, a summary and a verification sample are printed to stdout.
    Problems with the facts file or the Qdrant connection are reported and
    the function returns early; a failure on one fact is counted and does
    not stop the remaining facts from being processed.

    Args:
        facts_file: Path to JSON file with extracted facts (a JSON list of
            fact dictionaries)
    """
    print("=" * 70)
    print("DECLARATIVE MEMORY STORAGE")
    print("=" * 70)

    # Load extracted facts
    print(f"šŸ“‚ Loading facts from {facts_file}...")
    try:
        # Explicit encoding so the result does not depend on the platform
        # default locale.
        with open(facts_file, 'r', encoding='utf-8') as f:
            facts = json.load(f)
    except FileNotFoundError:
        print(f"āŒ Error: {facts_file} not found. Run extract_declarative_facts.py first.")
        return
    except json.JSONDecodeError as e:
        print(f"āŒ Error parsing JSON: {e}")
        return

    if not isinstance(facts, list):
        print(f"āŒ Error: {facts_file} must contain a JSON list of facts")
        return
    print(f"šŸ“Š Loaded {len(facts)} facts to store\n")

    # Connect to Qdrant
    print(f"šŸ”Œ Connecting to Qdrant at {QDRANT_URL}...")
    try:
        client = QdrantClient(url=QDRANT_URL)

        # Verify declarative collection exists
        collections = client.get_collections().collections
        if not any(c.name == "declarative" for c in collections):
            print("āŒ Error: 'declarative' collection not found in Qdrant")
            return

        print("āœ… Connected to Qdrant\n")
    except Exception as e:
        print(f"āŒ Error connecting to Qdrant: {e}")
        return

    # Store each fact
    stored_count = 0
    failed_count = 0
    total = len(facts)

    for i, fact in enumerate(facts, 1):
        # A malformed entry must not abort the whole run: report it, count
        # it as failed and move on to the next fact.
        try:
            fact_type = fact['metadata']['fact_type']
            fact_value = fact['metadata']['fact_value']
        except (KeyError, TypeError) as e:
            print(f"[{i}/{total}] āŒ Skipping malformed fact: {e!r}")
            failed_count += 1
            print()
            continue

        print(f"[{i}/{total}] Storing {fact_type}: {fact_value}")
        try:
            store_fact_in_qdrant(client, fact)
            stored_count += 1
        except Exception as e:
            print(f" āŒ Failed: {e}")
            failed_count += 1
        print()

    # Summary
    print("=" * 70)
    print("STORAGE SUMMARY")
    print("=" * 70)
    print(f"āœ… Successfully stored: {stored_count} facts")
    if failed_count > 0:
        print(f"āŒ Failed to store: {failed_count} facts")
    print()

    # Verify storage
    print("šŸ” Verifying storage...")
    try:
        # scroll() returns (points, next_page_offset). Only the first page
        # of at most 10 points is inspected, so the count is a sample.
        points, _next_offset = client.scroll(
            collection_name="declarative",
            limit=10,
            with_payload=True,
            with_vectors=False,
        )

        # fact_type / fact_value live under payload["metadata"], which is
        # where store_fact_in_qdrant() writes them.
        declarative_facts = [
            meta
            for meta in (_fact_metadata(p.payload) for p in points)
            if meta
        ]

        print(f"šŸ“Š Found {len(declarative_facts)} declarative facts in Qdrant")

        if declarative_facts:
            print("\nšŸ“ Sample stored facts:")
            for meta in declarative_facts[:5]:
                fact_type = meta.get('fact_type', 'unknown')
                fact_value = meta.get('fact_value', 'unknown')
                print(f" - {fact_type}: {fact_value}")
    except Exception as e:
        print(f"āŒ Error verifying storage: {e}")

    print()
    print("=" * 70)
    print("NEXT STEPS")
    print("=" * 70)
    print("1. Test recall by asking Miku factual questions")
    print("2. Example queries:")
    print(" - 'What is my favorite color?'")
    print(" - 'Where do I work?'")
    print(" - 'What are my hobbies?'")
    print("3. If recall still fails, check Cat's retrieval logic")
    print("=" * 70)


if __name__ == "__main__":
    store_all_facts(FACTS_FILE)