# miku-discord/llama-swap-rocm-config.yaml

# llama-swap configuration for AMD RX 6800 (ROCm)
# This manages automatic model switching and unloading for the secondary GPU
# Vision model stays on NVIDIA GPU - AMD only handles text models

models:
  # Primary text-generation model. The key matches the name used on the
  # NVIDIA side so switching between the two is uniform.
  llama3.1:
    # Starts llama-server with a custom chat template that disables the
    # model's built-in tool calling.
    # Folded scalar (>-): the lines below are joined with single spaces into
    # one command string, so keep every argument at the same indentation.
    cmd: >-
      /app/llama-server
      --port ${PORT}
      --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf
      -ngl 99
      -c 16384
      --host 0.0.0.0
      --no-warmup
      --flash-attn on
      --chat-template-file /app/llama31_notool_template.jinja
    # NOTE(review): the first alias repeats the model key; possibly
    # redundant - confirm before removing.
    aliases:
      - llama3.1
      - text-model
    # Idle timeout in seconds: unload after 30 minutes without activity.
    ttl: 1800

  # Evil/uncensored persona model. The key matches the name used on the
  # NVIDIA side so switching between the two is uniform.
  darkidol:
    # Folded scalar (>-): the lines below are joined with single spaces into
    # one command string, so keep every argument at the same indentation.
    cmd: >-
      /app/llama-server
      --port ${PORT}
      --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf
      -ngl 99
      -c 16384
      --host 0.0.0.0
      --no-warmup
      --flash-attn on
    # NOTE(review): the first alias repeats the model key; possibly
    # redundant - confirm before removing.
    aliases:
      - darkidol
      - evil-model
      - uncensored
    # Idle timeout in seconds: unload after 30 minutes without activity.
    ttl: 1800

  # Rocinante-X 12B: a larger creative/role-play model kept for comparison
  # testing.
  rocinante:
    # Folded scalar (>-): the lines below are joined with single spaces into
    # one command string, so keep every argument at the same indentation.
    # This entry passes -c 8192, whereas the 8B entries in this file pass
    # -c 16384.
    cmd: >-
      /app/llama-server
      --port ${PORT}
      --model /models/Rocinante-X-12B-v1b-Q5_K_M.gguf
      -ngl 99
      -c 8192
      --host 0.0.0.0
      --no-warmup
      --flash-attn on
    # NOTE(review): the first alias repeats the model key; possibly
    # redundant - confirm before removing.
    aliases:
      - rocinante
      - rocinante-12b
    # Idle timeout in seconds: unload after 30 minutes without activity.
    ttl: 1800

  # Japanese-language model: Llama 3.1 Swallow, a Japanese-optimized variant.
  swallow:
    # Folded scalar (>-): the lines below are joined with single spaces into
    # one command string, so keep every argument at the same indentation.
    cmd: >-
      /app/llama-server
      --port ${PORT}
      --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf
      -ngl 99
      -c 16384
      --host 0.0.0.0
      --no-warmup
      --flash-attn on
    # NOTE(review): the first alias repeats the model key; possibly
    # redundant - confirm before removing.
    aliases:
      - swallow
      - japanese
      - japanese-model
    # Idle timeout in seconds: unload after 30 minutes without activity.
    ttl: 1800

# Server configuration
# llama-swap will listen on this address
# Inside Docker, we bind to 0.0.0.0 to allow bot container to connect