miku-discord/cheshire-cat/store_declarative_facts.py

#!/usr/bin/env python3
"""
Store extracted declarative facts into Qdrant's declarative memory collection.
This enables direct retrieval for factual questions.

Uses sentence-transformers directly (same model Cat uses).
"""

import json
from qdrant_client import QdrantClient
from uuid import uuid4
from sentence_transformers import SentenceTransformer

# Configuration
# URL of the Qdrant instance; passed to QdrantClient(url=...) in store_all_facts()
QDRANT_URL = "http://localhost:6333"
# Default facts file, used when this module is run as a script
FACTS_FILE = "extracted_facts.json"

# Lazily-created embedder: stays None until get_embedder() loads the model
# (intended to be the same model as Cat uses)
embedder = None

def get_embedder():
    """Return the shared embedder, loading the model on first use.

    The instance is cached in the module-level ``embedder`` variable, so the
    model is only constructed by the first call; later calls return the
    cached object unchanged.

    Returns:
        The cached ``SentenceTransformer`` instance.
    """
    global embedder
    if embedder is not None:
        # An earlier call already loaded the model; reuse it.
        return embedder

    print("🔧 Initializing sentence-transformers embedder...")
    # BAAI/bge-large-en-v1.5 produces 1024-dimensional vectors
    embedder = SentenceTransformer('BAAI/bge-large-en-v1.5')
    print("✅ Embedder ready\n")
    return embedder

def get_embedding(text: str) -> list:
    """
    Encode text into an embedding vector using the shared embedder.

    Args:
        text: Text to embed

    Returns:
        The embedding converted to a plain Python list via ``tolist()``

    Raises:
        Exception: Any error raised while loading the embedder or encoding
            is reported on stdout and then re-raised unchanged.
    """
    try:
        # Encode to a numpy array first, then convert to a plain list.
        return get_embedder().encode(text, convert_to_numpy=True).tolist()
    except Exception as err:
        print(f"❌ Error generating embedding: {err}")
        raise

def store_fact_in_qdrant(client: QdrantClient, fact: dict) -> str:
    """
    Embed one fact and upsert it into the "declarative" collection.

    Args:
        client: Qdrant client instance
        fact: Fact dictionary with a 'content' string and a 'metadata' dict
            holding 'source', 'extracted_at', 'fact_type', 'fact_value'
            and 'user_id'

    Returns:
        The freshly generated point ID (a UUID4 string)

    Raises:
        Exception: Any failure (embedding, missing key, upsert) is reported
            on stdout and then re-raised unchanged.
    """
    try:
        content = fact['content']
        print(f"  🔄 Embedding: '{content}'")
        vector = get_embedding(content)

        # Every call creates a new random ID, so each call adds a new point.
        new_id = str(uuid4())

        meta = fact['metadata']
        payload = {
            # Core content (Cat standard)
            "page_content": content,
            # Cat requires metadata to live in a nested object
            "metadata": {
                "source": meta['source'],
                "when": meta['extracted_at'],
                # Extra fields kept for our own tracking
                "fact_type": meta['fact_type'],
                "fact_value": meta['fact_value'],
                "user_id": meta['user_id'],
            },
        }
        point = {"id": new_id, "vector": vector, "payload": payload}

        client.upsert(collection_name="declarative", points=[point])

        print(f"  ✅ Stored with ID: {new_id}")
        return new_id

    except Exception as err:
        print(f"  ❌ Error storing fact: {err}")
        raise

def store_all_facts(facts_file: str):
    """
    Load extracted facts and store them in Qdrant's declarative collection.

    Progress, a summary and a small verification sample are printed to
    stdout. The function returns early (after printing an error) when the
    facts file cannot be read or parsed, when its root is not a list, or
    when Qdrant is unreachable or has no "declarative" collection.
    Individual facts that are malformed or fail to store are counted as
    failures and do not stop the run.

    Args:
        facts_file: Path to JSON file with extracted facts (a JSON list of
            fact dictionaries)

    Returns:
        None
    """
    print("=" * 70)
    print("DECLARATIVE MEMORY STORAGE")
    print("=" * 70)

    # Load extracted facts
    print(f"📂 Loading facts from {facts_file}...")
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(facts_file, 'r', encoding='utf-8') as f:
            facts = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: {facts_file} not found. Run extract_declarative_facts.py first.")
        return
    except json.JSONDecodeError as e:
        print(f"❌ Error parsing JSON: {e}")
        return

    if not isinstance(facts, list):
        print(f"❌ Error: expected a JSON list of facts, got {type(facts).__name__}")
        return
    total = len(facts)
    print(f"📊 Loaded {total} facts to store\n")

    # Connect to Qdrant
    print(f"🔌 Connecting to Qdrant at {QDRANT_URL}...")
    try:
        client = QdrantClient(url=QDRANT_URL)
        # Verify declarative collection exists
        collections = client.get_collections().collections
        if not any(c.name == "declarative" for c in collections):
            print("❌ Error: 'declarative' collection not found in Qdrant")
            return
        print("✅ Connected to Qdrant\n")
    except Exception as e:
        print(f"❌ Error connecting to Qdrant: {e}")
        return

    # Store each fact
    stored_count = 0
    failed_count = 0

    for i, fact in enumerate(facts, 1):
        # A malformed entry must not abort the whole run: count it as a
        # failure and continue with the next fact.
        try:
            fact_type = fact['metadata']['fact_type']
            fact_value = fact['metadata']['fact_value']
        except (KeyError, TypeError) as e:
            print(f"[{i}/{total}] ❌ Skipping malformed fact: {e!r}")
            failed_count += 1
            print()
            continue

        print(f"[{i}/{total}] Storing {fact_type}: {fact_value}")

        try:
            store_fact_in_qdrant(client, fact)
            stored_count += 1
        except Exception as e:
            print(f"  ❌ Failed: {e}")
            failed_count += 1

        print()

    # Summary
    print("=" * 70)
    print("STORAGE SUMMARY")
    print("=" * 70)
    print(f"✅ Successfully stored: {stored_count} facts")
    if failed_count > 0:
        print(f"❌ Failed to store: {failed_count} facts")
    print()

    # Verify storage
    print("🔍 Verifying storage...")
    try:
        # NOTE: this only samples the first 10 points of the collection, so
        # the count below is a spot check, not a total.
        result = client.scroll(
            collection_name="declarative",
            limit=10,
            with_payload=True,
            with_vectors=False
        )

        declarative_facts = []
        for point in result[0]:
            payload = point.payload or {}
            # store_fact_in_qdrant writes fact_type/fact_value inside the
            # nested "metadata" object, so look there first; fall back to
            # the top level for points written with a flat layout.
            nested = payload.get('metadata')
            if not isinstance(nested, dict):
                nested = {}
            fact_type = nested.get('fact_type', payload.get('fact_type'))
            if fact_type is None:
                continue
            fact_value = nested.get(
                'fact_value', payload.get('fact_value', 'unknown')
            )
            declarative_facts.append((fact_type, fact_value))

        print(f"📊 Found {len(declarative_facts)} declarative facts in Qdrant")

        if declarative_facts:
            print("\n📝 Sample stored facts:")
            for fact_type, fact_value in declarative_facts[:5]:
                print(f"   - {fact_type}: {fact_value}")

    except Exception as e:
        print(f"❌ Error verifying storage: {e}")

    print()
    print("=" * 70)
    print("NEXT STEPS")
    print("=" * 70)
    print("1. Test recall by asking Miku factual questions")
    print("2. Example queries:")
    print("   - 'What is my favorite color?'")
    print("   - 'Where do I work?'")
    print("   - 'What are my hobbies?'")
    print("3. If recall still fails, check Cat's retrieval logic")
    print("=" * 70)

# Script entry point: store the facts from the default FACTS_FILE.
if __name__ == "__main__":
    store_all_facts(FACTS_FILE)