llama-swap: use pre-built images (:cuda, :rocm) with GPU-specific flags
- Drop the custom Dockerfiles; docker-compose now uses the ghcr.io pre-built images, which ship llama-swap + llama-server with no pinned versions (always latest)
- NVIDIA GTX 1660 (6GB): add -fit off --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 to fix an OOM segfault caused by the new GPU-side KV cache default in llama.cpp b9014
- AMD RX 6800 (16GB): flags unchanged; the KV cache stays on the GPU for maximum speed
- Both hosts run llama-swap v211 + llama.cpp b9014 (2026-05-05)
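For reference, a minimal sketch of the kind of compose service this change switches to, assuming the upstream ghcr.io/mostlygeek/llama-swap images; the service name, mount paths, and port mapping below are illustrative rather than copied from this repo:

    # Illustrative compose excerpt: pre-built image instead of a custom Dockerfile
    # build. With nothing pinned beyond the :cuda tag, every pull tracks the
    # latest upstream build, as the message above notes.
    services:
      llama-swap:
        image: ghcr.io/mostlygeek/llama-swap:cuda   # NVIDIA host; the AMD host would use :rocm
        volumes:
          - ./config.yaml:/app/config.yaml          # the file patched below
          - ./models:/models
        ports:
          - "8080:8080"                             # llama-swap's default listen port
        deploy:
          resources:
            reservations:
              devices:
                - driver: nvidia
                  count: all
                  capabilities: [gpu]

The :rocm variant would swap the image tag and pass the AMD GPU devices through instead of the nvidia reservation; the exact device setup depends on the host and is not shown in this commit.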
@@ -5,7 +5,7 @@ models:
   # Main text generation model (Llama 3.1 8B)
   # Custom chat template to disable built-in tool calling
   llama3.1:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on --chat-template-file /app/llama31_notool_template.jinja
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0 --chat-template-file /app/llama31_notool_template.jinja
     ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds)
     swap: true # CRITICAL: Unload other models when loading this one
     aliases:
@@ -14,7 +14,7 @@ models:
 
   # Evil/Uncensored text generation model (DarkIdol-Llama 3.1 8B)
   darkidol:
-    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
     ttl: 1800 # Unload after 30 minutes of inactivity
     swap: true # CRITICAL: Unload other models when loading this one
     aliases:
@@ -24,7 +24,7 @@ models:
 
   # Japanese language model (Llama 3.1 Swallow - Japanese optimized)
   swallow:
-    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
     ttl: 1800 # Unload after 30 minutes of inactivity
     swap: true # CRITICAL: Unload other models when loading this one
     aliases:
@@ -34,7 +34,7 @@ models:
 
   # Vision/Multimodal model (MiniCPM-V-4.5 - supports images, video, and GIFs)
   vision:
-    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup --flash-attn on
+    cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 -fit off --no-warmup --flash-attn on --no-kv-offload --cache-type-k q4_0 --cache-type-v q4_0
     ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds)
     swap: true # CRITICAL: Unload text models before loading vision
     aliases:
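Since the same three flag additions repeat in every hunk above, here is a commented summary of the NVIDIA-side change; the --no-kv-offload and --cache-type-k/-v descriptions reflect llama.cpp's documented behavior, while the -fit off reading is taken from the commit message itself (b9014 is newer than anything verifiable here):

    # Flags added to every cmd on the NVIDIA host (GTX 1660, 6GB):
    #   -fit off             per the commit message, opts out of b9014's new
    #                        GPU-side KV cache default that caused the OOM segfault
    #   --no-kv-offload      keep the KV cache in system RAM instead of VRAM
    #   --cache-type-k q4_0  quantize the K cache (llama.cpp's default is f16)
    #   --cache-type-v q4_0  quantize the V cache; V-cache quantization requires
    #                        flash attention, which these entries already enable
    # Together the q4_0 cache types cut KV memory to roughly a quarter of f16,
    # at some quality cost. The ROCm config (RX 6800, 16GB) is untouched, so its
    # KV cache stays in VRAM at f16 for maximum speed.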