diff --git a/cheshire-cat/cat/plugins/discord_bridge/discord_bridge.py b/cheshire-cat/cat/plugins/discord_bridge/discord_bridge.py index f6b665b..9aae808 100644 --- a/cheshire-cat/cat/plugins/discord_bridge/discord_bridge.py +++ b/cheshire-cat/cat/plugins/discord_bridge/discord_bridge.py @@ -99,6 +99,26 @@ def before_cat_stores_episodic_memory(doc, cat): return doc +@hook(priority=80) +def before_cat_recalls_declarative_memories(declarative_recall_config, cat): + """ + Increase k-value and lower threshold for better declarative memory retrieval. + + Default Cat settings (k=3, threshold=0.7) are too restrictive for factual recall. + We increase k to retrieve more candidates and lower threshold to catch facts + that might have lower similarity scores due to embedding model limitations. + """ + # Increase from k=3 to k=10 (retrieve more memories) + declarative_recall_config["k"] = 10 + + # Lower threshold from 0.7 to 0.5 (be more lenient with similarity scores) + declarative_recall_config["threshold"] = 0.5 + + print(f"🔧 [Discord Bridge] Adjusted declarative recall: k={declarative_recall_config['k']}, threshold={declarative_recall_config['threshold']}") + + return declarative_recall_config + + @hook(priority=50) def after_cat_recalls_memories(cat): """ @@ -119,6 +139,63 @@ def after_cat_recalls_memories(cat): if declarative_memories: print(f"📚 [Discord Bridge] Recalled {len(declarative_memories)} declarative facts for user {cat.user_id}") + # Show the actual facts for debugging + for doc, score, *rest in declarative_memories[:3]: # Show top 3 + print(f" - [{score:.3f}] {doc.page_content[:80]}...") + + +@hook(priority=100) +def agent_prompt_prefix(prefix, cat) -> str: + """ + Add explicit instruction to respect declarative facts. + This overrides the default Cat prefix to emphasize factual accuracy. + """ + # Add a strong instruction about facts BEFORE the regular personality + enhanced_prefix = f"""You are Hatsune Miku, a cheerful virtual idol. 
+ +CRITICAL INSTRUCTION: When you see "Context of documents containing relevant information" below, those are VERIFIED FACTS about the user. You MUST use these facts when they are relevant to the user's question. Never guess or make up information that contradicts these facts. + +{prefix}""" + +    return enhanced_prefix + + +@hook(priority=100) +def before_agent_starts(agent_input, cat) -> dict: +    """ +    Log the agent input for debugging. +    Now that the suffix template is fixed, declarative facts should appear naturally. +    """ +    declarative_mem = agent_input.get('declarative_memory', '') +    episodic_mem = agent_input.get('episodic_memory', '') + +    print(f"🔍 [Discord Bridge] before_agent_starts called") +    print(f"   input: {agent_input.get('input', '')[:80]}") +    print(f"   declarative_mem length: {len(declarative_mem)}") +    print(f"   episodic_mem length: {len(episodic_mem)}") +    if declarative_mem: +        print(f"   declarative_mem preview: {declarative_mem[:200]}") + +    return agent_input + + +@hook(priority=100) +def before_cat_sends_message(message: dict, cat) -> dict: +    """ +    This hook is called AFTER the LLM response, so it's too late to modify the prompt. +    Keeping it for potential post-processing, but the real work happens in before_agent_starts. +    """ +    return message + + +@hook(priority=10) +def agent_prompt_suffix(prompt_suffix, cat) -> str: +    """ +    Pass through the suffix unchanged. +    The miku_personality plugin (priority=100) sets the suffix with memory placeholders. +    The miku_personality hook (priority=100) runs first; this lower-priority hook runs after it and passes its suffix through unchanged. 
+ """ + return prompt_suffix # Plugin metadata diff --git a/docker-compose.yml b/docker-compose.yml index ac5a8c9..b91ee2d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -134,7 +134,15 @@ services: networks: - default # Stay on default for llama-swap + cheshire-cat communication - miku-voice # Connect to voice network for RVC/TTS + - proxy # Traefik proxy network for miku.panel restart: unless-stopped + labels: + - "traefik.enable=true" + - "traefik.http.routers.miku.rule=Host(`miku.panel`)" + - "traefik.http.routers.miku.entrypoints=websecure" + - "traefik.http.routers.miku.tls=true" + - "traefik.http.services.miku.loadbalancer.server.port=3939" + - "traefik.docker.network=proxy" # ========== Voice / STT ========== miku-stt: @@ -193,6 +201,9 @@ networks: miku-voice: external: true name: miku-voice-network + proxy: + name: proxy + external: true volumes: stt-models: diff --git a/llama-swap-config.yaml b/llama-swap-config.yaml index 07821c9..1e4b1d5 100644 --- a/llama-swap-config.yaml +++ b/llama-swap-config.yaml @@ -3,8 +3,9 @@ models: # Main text generation model (Llama 3.1 8B) + # Custom chat template to disable built-in tool calling llama3.1: - cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on + cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds) swap: true # CRITICAL: Unload other models when loading this one aliases: diff --git a/llama-swap-rocm-config.yaml b/llama-swap-rocm-config.yaml index 5327531..f52adfc 100644 --- a/llama-swap-rocm-config.yaml +++ b/llama-swap-rocm-config.yaml @@ -4,8 +4,9 @@ models: # Main text generation model (same name as NVIDIA for uniform switching) + # Custom chat template to disable built-in tool 
calling llama3.1: - cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on + cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds) aliases: - llama3.1 @@ -20,6 +21,14 @@ models: - evil-model - uncensored + # Rocinante-X 12B - larger creative/RP model for comparison testing + rocinante: + cmd: /app/llama-server --port ${PORT} --model /models/Rocinante-X-12B-v1b-Q5_K_M.gguf -ngl 99 -c 8192 --host 0.0.0.0 --no-warmup --flash-attn on + ttl: 1800 # Unload after 30 minutes of inactivity + aliases: + - rocinante + - rocinante-12b + # Japanese language model (Llama 3.1 Swallow - Japanese optimized) swallow: cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on