feat: add Traefik proxy, custom chat template, improve Cheshire Cat memory
docker-compose.yml:
- Add Traefik proxy network + labels for miku.panel domain
- Connect miku-bot service to proxy network

llama-swap-config.yaml / llama-swap-rocm-config.yaml:
- Add --chat-template-file flag to disable Llama 3.1 built-in tool calling (was causing malformed responses)
- ROCm config: add Rocinante-X 12B model entry for comparison testing

cheshire-cat discord_bridge plugin:
- Increase declarative memory recall (k=3→10, threshold 0.7→0.5) for better factual retrieval
- Add agent_prompt_prefix hook to enforce factual accuracy from declarative memories
- Add before_agent_starts debug logging for memory inspection
- Add passthrough hooks for message/suffix pipeline
This commit is contained in:
@@ -99,6 +99,26 @@ def before_cat_stores_episodic_memory(doc, cat):
|
||||
return doc
|
||||
|
||||
|
||||
@hook(priority=80)
def before_cat_recalls_declarative_memories(declarative_recall_config, cat):
    """
    Loosen the declarative-memory recall settings for better factual retrieval.

    The Cat's defaults (k=3, threshold=0.7) are too restrictive for factual
    recall: k caps how many candidate memories come back, and the similarity
    threshold drops facts whose embedding scores are modest. Retrieving 10
    candidates at threshold 0.5 catches facts the defaults would miss.
    """
    # Looser settings: more candidates, more lenient similarity cutoff.
    overrides = {"k": 10, "threshold": 0.5}
    declarative_recall_config.update(overrides)

    print(f"🔧 [Discord Bridge] Adjusted declarative recall: k={declarative_recall_config['k']}, threshold={declarative_recall_config['threshold']}")

    return declarative_recall_config
|
||||
|
||||
|
||||
@hook(priority=50)
|
||||
def after_cat_recalls_memories(cat):
|
||||
"""
|
||||
@@ -119,6 +139,63 @@ def after_cat_recalls_memories(cat):
|
||||
|
||||
if declarative_memories:
|
||||
print(f"📚 [Discord Bridge] Recalled {len(declarative_memories)} declarative facts for user {cat.user_id}")
|
||||
# Show the actual facts for debugging
|
||||
for doc, score, *rest in declarative_memories[:3]: # Show top 3
|
||||
print(f" - [{score:.3f}] {doc.page_content[:80]}...")
|
||||
|
||||
|
||||
@hook(priority=100)
def agent_prompt_prefix(prefix, cat) -> str:
    """
    Prepend an explicit factual-accuracy directive to the Cat prompt prefix.

    The directive is placed BEFORE the regular personality text so the model
    treats recalled declarative memories as authoritative over its own guesses.
    """
    # Strong instruction about facts, ahead of the regular personality prefix.
    fact_directive = """You are Hatsune Miku, a cheerful virtual idol.

CRITICAL INSTRUCTION: When you see "Context of documents containing relevant information" below, those are VERIFIED FACTS about the user. You MUST use these facts when they are relevant to the user's question. Never guess or make up information that contradicts these facts."""

    return fact_directive + "\n\n" + prefix
|
||||
|
||||
|
||||
@hook(priority=100)
def before_agent_starts(agent_input, cat) -> dict:
    """
    Log the agent input for debugging, then pass it through unchanged.

    Now that the suffix template is fixed, declarative facts should appear
    naturally; this hook only inspects the pipeline, it never mutates it.
    """
    decl_text = agent_input.get('declarative_memory', '')
    epis_text = agent_input.get('episodic_memory', '')

    print(f"🔍 [Discord Bridge] before_agent_starts called")
    print(f"   input: {agent_input.get('input', '')[:80]}")
    print(f"   declarative_mem length: {len(decl_text)}")
    print(f"   episodic_mem length: {len(epis_text)}")
    if decl_text:
        print(f"   declarative_mem preview: {decl_text[:200]}")

    return agent_input
|
||||
|
||||
|
||||
@hook(priority=100)
def before_cat_sends_message(message: dict, cat) -> dict:
    """
    Pass the outgoing message through unchanged.

    This hook fires AFTER the LLM response, so it is too late to modify the
    prompt here; it is kept as a placeholder for potential post-processing.
    The real prompt work happens in before_agent_starts.
    """
    return message
|
||||
|
||||
|
||||
@hook(priority=10)
def agent_prompt_suffix(prompt_suffix, cat) -> str:
    """
    Return the prompt suffix untouched.

    The miku_personality plugin (priority=100) is the one that sets the suffix
    with memory placeholders; this low-priority passthrough runs first and is
    then overridden by that hook.
    """
    return prompt_suffix
|
||||
|
||||
|
||||
# Plugin metadata
|
||||
|
||||
@@ -134,7 +134,15 @@ services:
|
||||
networks:
|
||||
- default # Stay on default for llama-swap + cheshire-cat communication
|
||||
- miku-voice # Connect to voice network for RVC/TTS
|
||||
- proxy # Traefik proxy network for miku.panel
|
||||
restart: unless-stopped
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.miku.rule=Host(`miku.panel`)"
|
||||
- "traefik.http.routers.miku.entrypoints=websecure"
|
||||
- "traefik.http.routers.miku.tls=true"
|
||||
- "traefik.http.services.miku.loadbalancer.server.port=3939"
|
||||
- "traefik.docker.network=proxy"
|
||||
|
||||
# ========== Voice / STT ==========
|
||||
miku-stt:
|
||||
@@ -193,6 +201,9 @@ networks:
|
||||
miku-voice:
|
||||
external: true
|
||||
name: miku-voice-network
|
||||
proxy:
|
||||
name: proxy
|
||||
external: true
|
||||
|
||||
volumes:
|
||||
stt-models:
|
||||
|
||||
@@ -3,8 +3,9 @@
|
||||
|
||||
models:
|
||||
# Main text generation model (Llama 3.1 8B)
|
||||
# Custom chat template to disable built-in tool calling
|
||||
llama3.1:
|
||||
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
|
||||
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja
|
||||
ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
|
||||
swap: true # CRITICAL: Unload other models when loading this one
|
||||
aliases:
|
||||
|
||||
@@ -4,8 +4,9 @@
|
||||
|
||||
models:
|
||||
# Main text generation model (same name as NVIDIA for uniform switching)
|
||||
# Custom chat template to disable built-in tool calling
|
||||
llama3.1:
|
||||
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
|
||||
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja
|
||||
ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
|
||||
aliases:
|
||||
- llama3.1
|
||||
@@ -20,6 +21,14 @@ models:
|
||||
- evil-model
|
||||
- uncensored
|
||||
|
||||
# Rocinante-X 12B - larger creative/RP model for comparison testing
|
||||
rocinante:
|
||||
cmd: /app/llama-server --port ${PORT} --model /models/Rocinante-X-12B-v1b-Q5_K_M.gguf -ngl 99 -c 8192 --host 0.0.0.0 --no-warmup --flash-attn on
|
||||
ttl: 1800 # Unload after 30 minutes of inactivity
|
||||
aliases:
|
||||
- rocinante
|
||||
- rocinante-12b
|
||||
|
||||
# Japanese language model (Llama 3.1 Swallow - Japanese optimized)
|
||||
swallow:
|
||||
cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
|
||||
|
||||
Reference in New Issue
Block a user