add: cheshire-cat configuration, tooling, tests, and documentation
Configuration:
- .env.example, .gitignore, compose.yml (main docker compose)
- docker-compose-amd.yml (ROCm), docker-compose-macos.yml
- start.sh, stop.sh convenience scripts
- LICENSE (Apache 2.0, from upstream Cheshire Cat)

Memory management utilities:
- analyze_consolidation.py, manual_consolidation.py, verify_consolidation.py
- check_memories.py, extract_declarative_facts.py, store_declarative_facts.py
- compare_systems.py (system comparison tool)
- benchmark_cat.py, streaming_benchmark.py, streaming_benchmark_v2.py

Test suite:
- quick_test.py, test_setup.py, test_setup_simple.py
- test_consolidation_direct.py, test_declarative_recall.py, test_recall.py
- test_end_to_end.py, test_full_pipeline.py
- test_phase2.py, test_phase2_comprehensive.py

Documentation:
- README.md, QUICK_START.txt, TEST_README.md, SETUP_COMPLETE.md
- PHASE2_IMPLEMENTATION_NOTES.md, PHASE2_TEST_RESULTS.md
- POST_OPTIMIZATION_ANALYSIS.md
This commit is contained in:
295
cheshire-cat/test_full_pipeline.py
Executable file
295
cheshire-cat/test_full_pipeline.py
Executable file
@@ -0,0 +1,295 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Full pipeline test for Phase 2 memory consolidation with declarative extraction.
|
||||
|
||||
Steps:
|
||||
1. Tell Miku 20 facts (mix of important and trivial)
|
||||
2. Run consolidation to delete trivial messages
|
||||
3. Extract facts from consolidated episodic memories
|
||||
4. Store facts in declarative memory
|
||||
5. Test recall with factual questions
|
||||
"""
|
||||
|
||||
import requests
|
||||
import time
|
||||
import sys
|
||||
|
||||
# Base URL of the HTTP API; send_message() posts to "<CAT_URL>/message".
CAT_URL = "http://localhost:1865"
# Identifier sent as "user_id" in the payload of every message.
USER_ID = "test_user_pipeline"
|
||||
|
||||
# Everything the test tells Miku, in sending order: the facts that should
# survive consolidation come first, the throwaway chatter comes last.
_IMPORTANT_FACTS = (
    "My name is Sarah Chen.",
    "I'm 28 years old.",
    "I live in Seattle, Washington.",
    "I work as a software engineer at Microsoft.",
    "My favorite color is forest green.",
    "I love playing piano. I've been practicing for 15 years.",
    "I'm learning Japanese! Currently at N3 level.",
    "I have a cat named Luna.",
    "I'm allergic to peanuts.",
    "I prefer cats over dogs, though I like both.",
    "My favorite food is ramen.",
    "I enjoy hiking on weekends.",
    "I graduated from UW in 2018.",
    "My birthday is March 15th.",
)

# Filler messages that consolidation is expected to delete.
_TRIVIAL_CHATTER = (
    "lol",
    "k",
    "haha",
    "brb",
    "nice",
    "cool",
)

TEST_MESSAGES = [*_IMPORTANT_FACTS, *_TRIVIAL_CHATTER]
|
||||
|
||||
# Recall probes: each entry pairs a question with the substring that must
# appear in the (lowercased) reply, plus a label for the kind of fact probed.
RECALL_TESTS = [
    {"question": question, "expected": expected, "fact_type": fact_type}
    for question, expected, fact_type in (
        ("What is my name?", "sarah", "name"),
        ("How old am I?", "28", "age"),
        ("Where do I live?", "seattle", "location"),
        ("What do I do for work?", "software engineer", "job"),
        ("What is my favorite color?", "forest green", "favorite_color"),
        ("What instruments do I play?", "piano", "hobby"),
        ("What language am I learning?", "japanese", "hobby"),
        ("What is my cat's name?", "luna", "pet_name"),
        ("What am I allergic to?", "peanut", "allergy"),
        ("Do I prefer cats or dogs?", "cat", "preference"),
    )
]
|
||||
|
||||
|
||||
def send_message(text: str) -> "dict | None":
    """Send one chat message to Miku and return the decoded JSON reply.

    The message is POSTed to ``{CAT_URL}/message`` together with ``USER_ID``,
    with a 30 second timeout.

    Args:
        text: Message text to send.

    Returns:
        The parsed JSON body of the response, or ``None`` when the request
        fails, the server answers with an HTTP error status, or the body is
        not valid JSON.  Failures are printed, not raised.
    """
    # The return annotation is quoted so the ``X | None`` spelling does not
    # have to be evaluated at definition time on older interpreters.
    try:
        response = requests.post(
            f"{CAT_URL}/message",
            json={"text": text, "user_id": USER_ID},
            timeout=30,
        )
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP-status errors;
        # ValueError covers a body that cannot be decoded as JSON.  Anything
        # else is a programming error and is allowed to surface.
        print(f" ❌ Error sending message: {e}")
        return None
|
||||
|
||||
|
||||
def trigger_consolidation() -> bool:
    """Ask Miku to consolidate memories by sending "consolidate now".

    Returns:
        True when send_message() produced a truthy reply, False when it did
        not or when an exception was raised along the way.
    """
    try:
        if not send_message("consolidate now"):
            return False
        print(" ✅ Consolidation triggered")
        return True
    except Exception as err:
        print(f" ❌ Error triggering consolidation: {err}")
        return False
|
||||
|
||||
|
||||
# Messages labelled TRIVIAL in the step 1 log; built once instead of on every
# loop iteration.
_TRIVIAL_MESSAGES = frozenset({"lol", "k", "haha", "brb", "nice", "cool"})


def _run_helper_script(script_name: str):
    """Run a helper script that lives next to this file and capture its output.

    The script is started with the interpreter running this test (falling
    back to ``python3`` if that is unknown) and is located relative to this
    file, so the test works inside a virtualenv and from any working
    directory.

    Args:
        script_name: File name of the helper script.

    Returns:
        The ``subprocess.CompletedProcess`` with text ``stdout``/``stderr``.
    """
    import subprocess
    from pathlib import Path

    script_path = Path(__file__).resolve().parent / script_name
    return subprocess.run(
        [sys.executable or "python3", str(script_path)],
        capture_output=True,
        text=True,
    )


def _tell_facts() -> int:
    """Send every entry of TEST_MESSAGES and return how many got a reply."""
    total = len(TEST_MESSAGES)
    successful_sends = 0

    for i, message in enumerate(TEST_MESSAGES, 1):
        msg_type = "TRIVIAL" if message in _TRIVIAL_MESSAGES else "IMPORTANT"
        print(f"[{i}/{total}] {msg_type}: {message}")

        if send_message(message):
            print(" ✅ Sent successfully")
            successful_sends += 1
        else:
            print(" ❌ Failed to send")

        time.sleep(1)  # Brief pause between messages

    return successful_sends


def _extract_and_store_facts() -> None:
    """Run the extraction and storage helper scripts; exit(1) if either fails."""
    print("Running extract_declarative_facts.py...")
    result = _run_helper_script("extract_declarative_facts.py")
    if result.returncode != 0:
        print(f"❌ Extraction failed: {result.stderr[:200]}")
        sys.exit(1)
    # The number of extracted facts is inferred from the script's own log.
    facts_count = result.stdout.count("✅ Extracted from:")
    print(f"✅ Extracted {facts_count} facts")

    print("\nRunning store_declarative_facts.py...")
    result = _run_helper_script("store_declarative_facts.py")
    if result.returncode != 0:
        print(f"❌ Storage failed: {result.stderr[:200]}")
        sys.exit(1)
    # Echo the script's own summary line when it printed one.
    stored_line = next(
        (line for line in result.stdout.split('\n') if "Successfully stored:" in line),
        None,
    )
    if stored_line is not None:
        print(f"✅ {stored_line.strip()}")
    else:
        print("✅ Facts stored")


def _run_recall_tests() -> list:
    """Ask every RECALL_TESTS question and return one result dict per question.

    Each result has the keys 'question', 'expected', 'success' and
    'response' (the reply text, or None when no reply was received).
    """
    total = len(RECALL_TESTS)
    results = []

    for i, test in enumerate(RECALL_TESTS, 1):
        question = test["question"]
        expected = test["expected"].lower()

        print(f"[{i}/{total}] {question}")
        print(f" Expected: {expected}")

        response = send_message(question)

        if response:
            # A missing or null 'content' counts as an empty answer rather
            # than crashing on .lower().
            content = response.get('content') or ''
            answer = content.lower()
            success = expected in answer

            if success:
                print(" ✅ RECALLED correctly")
            else:
                print(" ❌ NOT recalled")
                print(f" Response: {answer[:100]}...")

            results.append({
                'question': question,
                'expected': expected,
                'success': success,
                'response': content,
            })
        else:
            print(" ❌ ERROR - No response")
            results.append({
                'question': question,
                'expected': expected,
                'success': False,
                'response': None,
            })

        print()
        time.sleep(2)

    return results


def _print_summary(results: list) -> None:
    """Print the recall success rate, per-question details and final verdict."""
    total = len(results)
    successful_recalls = sum(1 for entry in results if entry['success'])
    # Guard against an empty test list instead of dividing by zero.
    success_rate = (successful_recalls / total) * 100 if total else 0.0

    print("=" * 80)
    print("FINAL RESULTS")
    print("=" * 80)

    print(f"\n📊 RECALL SUCCESS RATE: {successful_recalls}/{total} ({success_rate:.1f}%)\n")

    if success_rate == 100:
        print("🎉 PERFECT! All facts recalled correctly!")
    elif success_rate >= 80:
        print("✅ EXCELLENT! Most facts recalled correctly.")
    elif success_rate >= 50:
        print("⚠️ PARTIAL SUCCESS - Needs improvement.")
    else:
        print("❌ POOR PERFORMANCE - System needs significant fixes.")

    print("\nDetailed results:")
    print("-" * 80)

    for entry in results:
        status = "✅" if entry['success'] else "❌"
        print(f"{status} {entry['question']}")
        if not entry['success'] and entry['response']:
            print(f" Response: {entry['response'][:150]}...")

    print("\n" + "=" * 80)

    if success_rate == 100:
        print("✅ PHASE 2 COMPLETE AND READY FOR PRODUCTION!")
    elif success_rate >= 80:
        print("✅ PHASE 2 MOSTLY WORKING - Minor refinements needed")
    else:
        print("❌ PHASE 2 NEEDS MORE WORK")

    print("=" * 80)


def main():
    """Run the full Phase 2 pipeline test and print a report.

    Steps: send the test messages, trigger consolidation, run the fact
    extraction and storage helper scripts, then ask the recall questions and
    summarise how many expected answers appeared in the replies.

    The process exits with status 1 if consolidation cannot be triggered or
    a helper script fails.  Recall failures are only reported; they do not
    change the exit status.
    """
    print("=" * 80)
    print("PHASE 2 FULL PIPELINE TEST")
    print("=" * 80)
    print(f"Testing with user: {USER_ID}\n")

    # Step 1: Tell Miku the facts
    print("STEP 1: Telling Miku facts...")
    print("-" * 80)
    successful_sends = _tell_facts()
    print(f"\n✅ Successfully sent {successful_sends}/{len(TEST_MESSAGES)} messages\n")

    # Step 2: Trigger consolidation
    print("STEP 2: Triggering consolidation...")
    print("-" * 80)

    if not trigger_consolidation():
        print("❌ Failed to trigger consolidation")
        sys.exit(1)

    # Completion is not observed; the test simply waits a fixed 5 seconds.
    print("⏳ Waiting for consolidation to complete...")
    time.sleep(5)
    print("✅ Consolidation complete\n")

    # Step 3: Extract and store declarative facts
    print("STEP 3: Extracting and storing declarative facts...")
    print("-" * 80)
    _extract_and_store_facts()
    print()

    # Step 4: Test recall
    print("STEP 4: Testing declarative memory recall...")
    print("-" * 80)
    results = _run_recall_tests()

    # Final summary
    _print_summary(results)
|
||||
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user