From 5b1163c7afd676a5256ba2e081d992f10202442d Mon Sep 17 00:00:00 2001 From: koko210Serve Date: Fri, 30 Jan 2026 21:35:07 +0200 Subject: [PATCH] Remove KV cache offloading and enable flash attention to increase performance --- llama-swap-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama-swap-config.yaml b/llama-swap-config.yaml index 2ea9a38..07821c9 100644 --- a/llama-swap-config.yaml +++ b/llama-swap-config.yaml @@ -4,7 +4,7 @@ models: # Main text generation model (Llama 3.1 8B) llama3.1: - cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup + cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-8B-Instruct-UD-Q4_K_XL.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on ttl: 1800 # Unload after 30 minutes of inactivity (1800 seconds) swap: true # CRITICAL: Unload other models when loading this one aliases: @@ -13,7 +13,7 @@ models: # Evil/Uncensored text generation model (DarkIdol-Llama 3.1 8B) darkidol: - cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup + cmd: /app/llama-server --port ${PORT} --model /models/DarkIdol-Llama-3.1-8B-Instruct-1.3-Uncensored_Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on ttl: 1800 # Unload after 30 minutes of inactivity swap: true # CRITICAL: Unload other models when loading this one aliases: @@ -23,7 +23,7 @@ models: # Japanese language model (Llama 3.1 Swallow - Japanese optimized) swallow: - cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -nkvo -c 16384 --host 0.0.0.0 --no-warmup + cmd: /app/llama-server --port ${PORT} --model /models/Llama-3.1-Swallow-8B-Instruct-v0.5-Q4_K_M.gguf -ngl 99 -c 16384 --host 0.0.0.0 --no-warmup --flash-attn on ttl: 1800 # Unload after 30 minutes of inactivity swap: true # 
CRITICAL: Unload other models when loading this one aliases: @@ -33,7 +33,7 @@ models: # Vision/Multimodal model (MiniCPM-V-4.5 - supports images, video, and GIFs) vision: - cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup + cmd: /app/llama-server --port ${PORT} --model /models/MiniCPM-V-4_5-Q3_K_S.gguf --mmproj /models/MiniCPM-V-4_5-mmproj-f16.gguf -ngl 99 -c 4096 --host 0.0.0.0 --no-warmup --flash-attn on ttl: 900 # Vision model used less frequently, shorter TTL (15 minutes = 900 seconds) swap: true # CRITICAL: Unload text models before loading vision aliases: