#!/usr/bin/env python3
|
||
|
|
"""
Full pipeline test for Phase 2 memory consolidation with declarative extraction.

Steps:
1. Tell Miku 20 messages (14 important facts and 6 trivial messages)
2. Run consolidation to delete trivial messages
3. Extract facts from consolidated episodic memories
4. Store facts in declarative memory
5. Test recall with factual questions
"""
|
||
|
|
|
||
|
|
import subprocess
import sys
import time
from typing import Optional

import requests
|
||
|
|
|
||
|
|
# Base URL of the server under test; send_message posts to f"{CAT_URL}/message".
CAT_URL = "http://localhost:1865"
# Value sent as "user_id" in the JSON payload of every request.
USER_ID = "test_user_pipeline"
|
||
|
|
|
||
|
|
# Test messages to tell Miku, sent in this order: 14 important facts followed
# by 6 trivial messages.
TEST_MESSAGES = [
    # Important facts (should be remembered)
    "My name is Sarah Chen.",
    "I'm 28 years old.",
    "I live in Seattle, Washington.",
    "I work as a software engineer at Microsoft.",
    "My favorite color is forest green.",
    "I love playing piano. I've been practicing for 15 years.",
    "I'm learning Japanese! Currently at N3 level.",
    "I have a cat named Luna.",
    "I'm allergic to peanuts.",
    "I prefer cats over dogs, though I like both.",
    "My favorite food is ramen.",
    "I enjoy hiking on weekends.",
    "I graduated from UW in 2018.",
    "My birthday is March 15th.",

    # Trivial messages (should be deleted during consolidation)
    "lol",
    "k",
    "haha",
    "brb",
    "nice",
    "cool",
]
|
||
|
|
|
||
|
|
# Questions to test recall.
#
# Each entry has:
#   "question"  - text sent to Miku verbatim.
#   "expected"  - main() lowercases both this value and the reply, then checks
#                 that this value occurs as a substring of the reply.
#   "fact_type" - category label; not read by the code visible in this file.
RECALL_TESTS = [
    {
        "question": "What is my name?",
        "expected": "sarah",
        "fact_type": "name",
    },
    {
        "question": "How old am I?",
        "expected": "28",
        "fact_type": "age",
    },
    {
        "question": "Where do I live?",
        "expected": "seattle",
        "fact_type": "location",
    },
    {
        "question": "What do I do for work?",
        "expected": "software engineer",
        "fact_type": "job",
    },
    {
        "question": "What is my favorite color?",
        "expected": "forest green",
        "fact_type": "favorite_color",
    },
    {
        "question": "What instruments do I play?",
        "expected": "piano",
        "fact_type": "hobby",
    },
    {
        "question": "What language am I learning?",
        "expected": "japanese",
        "fact_type": "hobby",
    },
    {
        "question": "What is my cat's name?",
        "expected": "luna",
        "fact_type": "pet_name",
    },
    {
        "question": "What am I allergic to?",
        "expected": "peanut",
        "fact_type": "allergy",
    },
    {
        # NOTE(review): "cat" also matches a reply that merely mentions cats
        # (e.g. one that echoes the question), so this check is weak.
        "question": "Do I prefer cats or dogs?",
        "expected": "cat",
        "fact_type": "preference",
    },
]
|
||
|
|
|
||
|
|
|
||
|
|
def send_message(text: str) -> Optional[dict]:
    """Send a message to Miku.

    Posts ``text`` together with ``USER_ID`` as a JSON payload to
    ``{CAT_URL}/message`` and waits up to 30 seconds for the reply.

    Args:
        text: The message text to send.

    Returns:
        The decoded JSON body of the response, or ``None`` when the request
        failed, the server answered with an HTTP error status, or the body
        could not be decoded as JSON. Failures are printed, not raised.
    """
    try:
        response = requests.post(
            f"{CAT_URL}/message",
            json={"text": text, "user_id": USER_ID},
            timeout=30,
        )
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP-status errors;
        # ValueError covers a body that is not valid JSON.
        print(f" ❌ Error sending message: {e}")
        return None
|
||
|
|
|
||
|
|
|
||
|
|
def trigger_consolidation() -> bool:
    """Trigger memory consolidation.

    Sends the literal message "consolidate now" through ``send_message``.

    Returns:
        True when ``send_message`` returned a response (a confirmation line is
        printed), False when it returned ``None``.
    """
    # send_message reports its own failures and signals them with None, so no
    # extra exception handling is needed here. Comparing against None (rather
    # than truthiness) keeps an empty-but-valid JSON reply from being treated
    # as a failure.
    response = send_message("consolidate now")
    if response is None:
        return False

    print(" ✅ Consolidation triggered")
    return True
|
||
|
|
|
||
|
|
|
||
|
|
# Entries of TEST_MESSAGES that are labelled TRIVIAL in the step 1 log output.
_TRIVIAL_MESSAGES = frozenset({"lol", "k", "haha", "brb", "nice", "cool"})


def _print_banner(title: str) -> None:
    """Print ``title`` framed by two 80-character '=' rules."""
    print("=" * 80)
    print(title)
    print("=" * 80)


def _print_step(title: str) -> None:
    """Print a step heading followed by an 80-character '-' rule."""
    print(title)
    print("-" * 80)


def _send_test_messages() -> int:
    """Send every entry of TEST_MESSAGES and return how many got a response."""
    total = len(TEST_MESSAGES)
    delivered = 0

    for index, message in enumerate(TEST_MESSAGES, 1):
        label = "TRIVIAL" if message in _TRIVIAL_MESSAGES else "IMPORTANT"
        print(f"[{index}/{total}] {label}: {message}")

        if send_message(message) is not None:
            print(" ✅ Sent successfully")
            delivered += 1
        else:
            print(" ❌ Failed to send")

        time.sleep(1)  # Brief pause between messages

    return delivered


def _run_script(script_name: str) -> subprocess.CompletedProcess:
    """Run a helper script with the current interpreter, capturing its output.

    The script path is relative, so it is resolved against the current working
    directory.
    """
    # sys.executable keeps the child in the same interpreter/virtualenv as this
    # script; fall back to "python3" if the interpreter path is unavailable.
    interpreter = sys.executable or "python3"
    return subprocess.run(
        [interpreter, script_name],
        capture_output=True,
        text=True,
    )


def _extract_and_store_facts() -> None:
    """Run the extraction and storage scripts; exit with status 1 on failure."""
    print("Running extract_declarative_facts.py...")
    extraction = _run_script("extract_declarative_facts.py")
    if extraction.returncode != 0:
        print(f"❌ Extraction failed: {extraction.stderr[:200]}")
        sys.exit(1)

    # Count extracted facts from the marker lines in the script's output.
    facts_count = extraction.stdout.count("✅ Extracted from:")
    print(f"✅ Extracted {facts_count} facts")

    print("\nRunning store_declarative_facts.py...")
    storage = _run_script("store_declarative_facts.py")
    if storage.returncode != 0:
        print(f"❌ Storage failed: {storage.stderr[:200]}")
        sys.exit(1)

    # Echo the script's own summary line when it printed one.
    stored_line = next(
        (line for line in storage.stdout.splitlines()
         if "Successfully stored:" in line),
        None,
    )
    if stored_line is not None:
        print(f"✅ {stored_line.strip()}")
    else:
        print("✅ Facts stored")


def _run_recall_tests() -> list:
    """Ask every RECALL_TESTS question and return one result dict per question.

    Each result has the keys 'question', 'expected', 'success' and 'response'
    ('response' is None when no reply was received).
    """
    total = len(RECALL_TESTS)
    results = []

    for index, test in enumerate(RECALL_TESTS, 1):
        question = test["question"]
        expected = test["expected"].lower()

        print(f"[{index}/{total}] {question}")
        print(f" Expected: {expected}")

        response = send_message(question)

        if response is None:
            print(" ❌ ERROR - No response")
            results.append({
                'question': question,
                'expected': expected,
                'success': False,
                'response': None,
            })
        else:
            # Tolerate a reply that is not a dict or whose 'content' is
            # missing, null or not a string: treat it as an empty answer.
            raw = response.get('content') if isinstance(response, dict) else None
            content = raw if isinstance(raw, str) else ''
            answer = content.lower()
            success = expected in answer

            if success:
                print(" ✅ RECALLED correctly")
            else:
                print(" ❌ NOT recalled")
                print(f" Response: {answer[:100]}...")

            results.append({
                'question': question,
                'expected': expected,
                'success': success,
                'response': content,
            })

        print()
        time.sleep(2)

    return results


def _print_summary(results: list) -> None:
    """Print the recall success rate, a verdict and the per-question details."""
    total = len(RECALL_TESTS)
    successful_recalls = sum(1 for entry in results if entry['success'])
    # Guard against an empty RECALL_TESTS list (would divide by zero).
    success_rate = (successful_recalls / total) * 100 if total else 0.0
    all_recalled = total > 0 and successful_recalls == total

    _print_banner("FINAL RESULTS")
    print(f"\n📊 RECALL SUCCESS RATE: {successful_recalls}/{total} ({success_rate:.1f}%)\n")

    if all_recalled:
        print("🎉 PERFECT! All facts recalled correctly!")
    elif success_rate >= 80:
        print("✅ EXCELLENT! Most facts recalled correctly.")
    elif success_rate >= 50:
        print("⚠️ PARTIAL SUCCESS - Needs improvement.")
    else:
        print("❌ POOR PERFORMANCE - System needs significant fixes.")

    print("\nDetailed results:")
    print("-" * 80)

    for entry in results:
        status = "✅" if entry['success'] else "❌"
        print(f"{status} {entry['question']}")
        if not entry['success'] and entry['response']:
            print(f" Response: {entry['response'][:150]}...")

    print("\n" + "=" * 80)

    if all_recalled:
        print("✅ PHASE 2 COMPLETE AND READY FOR PRODUCTION!")
    elif success_rate >= 80:
        print("✅ PHASE 2 MOSTLY WORKING - Minor refinements needed")
    else:
        print("❌ PHASE 2 NEEDS MORE WORK")

    print("=" * 80)


def main():
    """Run the full Phase 2 pipeline test and print a report.

    Steps, as printed:
      1. Send every TEST_MESSAGES entry to Miku.
      2. Trigger consolidation, then wait a fixed 5 seconds.
      3. Run extract_declarative_facts.py and store_declarative_facts.py.
      4. Ask every RECALL_TESTS question and check the replies.

    Exits with status 1 if consolidation cannot be triggered or either helper
    script returns a non-zero exit code.
    """
    _print_banner("PHASE 2 FULL PIPELINE TEST")
    print(f"Testing with user: {USER_ID}\n")

    # Step 1: Tell Miku the facts
    _print_step("STEP 1: Telling Miku facts...")
    successful_sends = _send_test_messages()
    print(f"\n✅ Successfully sent {successful_sends}/{len(TEST_MESSAGES)} messages\n")

    # Step 2: Trigger consolidation
    _print_step("STEP 2: Triggering consolidation...")
    if not trigger_consolidation():
        print("❌ Failed to trigger consolidation")
        sys.exit(1)

    # NOTE: this is a fixed wait; no completion signal is checked before the
    # "complete" message is printed.
    print("⏳ Waiting for consolidation to complete...")
    time.sleep(5)
    print("✅ Consolidation complete\n")

    # Step 3: Extract and store declarative facts
    _print_step("STEP 3: Extracting and storing declarative facts...")
    _extract_and_store_facts()
    print()

    # Step 4: Test recall
    _print_step("STEP 4: Testing declarative memory recall...")
    results = _run_recall_tests()

    # Final summary
    _print_summary(results)
|
||
|
|
|
||
|
|
|
||
|
|
# Run the pipeline test only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|