Add dual GPU support with web UI selector

Features:
- Built custom ROCm container for AMD RX 6800 GPU
- Added GPU selection toggle in web UI (NVIDIA/AMD)
- Unified model names across both GPUs for seamless switching
- Vision model always uses NVIDIA GPU (optimal performance)
- Text models (llama3.1, darkidol) can use either GPU
- Added /gpu-status and /gpu-select API endpoints (example calls sketched after this list)
- Implemented GPU state persistence in memory/gpu_state.json
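
A minimal sketch of how the two endpoints might be exercised from a client, assuming the bot API is reachable on localhost:8080 and that /gpu-select accepts a JSON body with a "gpu" field; the port, HTTP methods, and payload shape are assumptions, not taken from this commit:

    import requests

    BOT_API = "http://localhost:8080"  # assumed host/port; depends on how bot/api.py is served

    # Read the currently selected GPU (persisted in memory/gpu_state.json)
    print(requests.get(f"{BOT_API}/gpu-status").json())

    # Switch text models to the AMD RX 6800 (payload shape is a guess)
    requests.post(f"{BOT_API}/gpu-select", json={"gpu": "amd"})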

Technical details:
- Multi-stage Dockerfile.llamaswap-rocm with ROCm 6.2.4
- llama.cpp compiled with GGML_HIP=ON for gfx1030 (RX 6800)
- Proper GPU permissions without root (groups 187/989)
- AMD container on port 8091, NVIDIA on port 8090
- Updated bot/utils/llm.py with get_current_gpu_url() and get_vision_gpu_url() (sketched after this list)
- Modified bot/utils/image_handling.py to always use NVIDIA for vision
- Enhanced web UI with GPU selector button (blue=NVIDIA, red=AMD)
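
One plausible shape for the llm.py helpers named above, assuming the NVIDIA endpoint defaults to port 8090 and the AMD endpoint to port 8091 as listed, and that memory/gpu_state.json stores a single "selected_gpu" key; the state-file schema and exact signatures are assumptions, not copied from the commit:

    import json
    from pathlib import Path

    # Assumed defaults matching the ports above; the real values live in bot/globals.py
    LLAMA_URL = "http://localhost:8090"      # NVIDIA llama-swap
    LLAMA_AMD_URL = "http://localhost:8091"  # AMD llama-swap (ROCm)
    GPU_STATE_FILE = Path("memory/gpu_state.json")  # assumed path relative to the bot directory

    def get_current_gpu_url() -> str:
        """Return the endpoint for text models based on the persisted web-UI selection."""
        try:
            state = json.loads(GPU_STATE_FILE.read_text())
        except (FileNotFoundError, json.JSONDecodeError):
            state = {}
        # Assumed schema: {"selected_gpu": "nvidia" | "amd"}
        return LLAMA_AMD_URL if state.get("selected_gpu") == "amd" else LLAMA_URL

    def get_vision_gpu_url() -> str:
        """Vision always runs on the NVIDIA GPU, regardless of the selector."""
        return LLAMA_URL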

Files modified:
- docker-compose.yml (added llama-swap-amd service)
- bot/globals.py (added LLAMA_AMD_URL)
- bot/api.py (added GPU selection endpoints and helper function)
- bot/utils/llm.py (GPU routing for text models)
- bot/utils/image_handling.py (GPU routing for vision models)
- bot/static/index.html (GPU selector UI)
- llama-swap-rocm-config.yaml (unified model names)

New files:
- Dockerfile.llamaswap-rocm
- bot/memory/gpu_state.json
- bot/utils/gpu_router.py (load balancing utility)
- setup-dual-gpu.sh (setup verification script)
- DUAL_GPU_*.md (documentation files)
commit 1fc3d74a5b (parent ed5994ec78), 2026-01-09 00:03:59 +02:00
21 changed files with 2836 additions and 13 deletions

New file: bot/utils/gpu_router.py (191 lines)
"""
GPU Router Utility for Dual GPU Setup
Manages routing between NVIDIA and AMD GPUs for model inference
"""
import os
import random
import logging
from typing import Optional, Literal
import globals
logger = logging.getLogger(__name__)
# Model to GPU mapping
MODEL_TO_GPU = {
# NVIDIA models (primary GPU)
"llama3.1": globals.LLAMA_URL,
"text-model": globals.LLAMA_URL,
"darkidol": globals.LLAMA_URL,
"evil-model": globals.LLAMA_URL,
"uncensored": globals.LLAMA_URL,
"vision": globals.LLAMA_URL,
"vision-model": globals.LLAMA_URL,
"minicpm": globals.LLAMA_URL,
# AMD models (secondary GPU - RX 6800)
"llama3.1-amd": globals.LLAMA_AMD_URL,
"text-model-amd": globals.LLAMA_AMD_URL,
"amd-text": globals.LLAMA_AMD_URL,
"darkidol-amd": globals.LLAMA_AMD_URL,
"evil-model-amd": globals.LLAMA_AMD_URL,
"uncensored-amd": globals.LLAMA_AMD_URL,
"moondream-amd": globals.LLAMA_AMD_URL,
"vision-amd": globals.LLAMA_AMD_URL,
"moondream": globals.LLAMA_AMD_URL,
}
# Configuration
PREFER_AMD_GPU = os.getenv("PREFER_AMD_GPU", "false").lower() == "true"
AMD_MODELS_ENABLED = os.getenv("AMD_MODELS_ENABLED", "true").lower() == "true"
def get_endpoint_for_model(model_name: str) -> str:
"""
Get the correct llama-swap endpoint for a model.
Args:
model_name: Name or alias of the model
Returns:
URL of the llama-swap endpoint (either NVIDIA or AMD)
"""
endpoint = MODEL_TO_GPU.get(model_name, globals.LLAMA_URL)
# If AMD models are disabled, use NVIDIA for AMD models too
if not AMD_MODELS_ENABLED and endpoint == globals.LLAMA_AMD_URL:
logger.warning(f"AMD GPU disabled, routing {model_name} to NVIDIA GPU")
# Map AMD model name to NVIDIA equivalent
nvidia_model = model_name.replace("-amd", "")
endpoint = globals.LLAMA_URL
return endpoint
def is_amd_model(model_name: str) -> bool:
"""
Check if a model runs on the AMD GPU.
Args:
model_name: Name or alias of the model
Returns:
True if model runs on AMD GPU, False otherwise
"""
return model_name.endswith("-amd") or model_name in ["moondream", "moondream-amd", "vision-amd"]
def get_llama_url_with_load_balancing(
prefer_amd: bool = False,
task_type: Literal["text", "vision", "evil"] = "text"
) -> tuple[str, str]:
"""
Get llama URL with optional load balancing between GPUs.
Returns both URL and recommended model name.
Args:
prefer_amd: If True, prefer AMD GPU when possible
task_type: Type of task (text, vision, or evil)
Returns:
Tuple of (url, model_name)
"""
if not AMD_MODELS_ENABLED:
# AMD disabled, use NVIDIA only
if task_type == "evil":
return globals.LLAMA_URL, "darkidol"
elif task_type == "vision":
return globals.LLAMA_URL, "vision"
else:
return globals.LLAMA_URL, "llama3.1"
# AMD enabled - implement load balancing
use_amd = prefer_amd or PREFER_AMD_GPU or (random.random() < 0.5)
if task_type == "evil":
# Evil/uncensored models
if use_amd:
return globals.LLAMA_AMD_URL, "darkidol-amd"
else:
return globals.LLAMA_URL, "darkidol"
elif task_type == "vision":
# Vision models - MiniCPM on NVIDIA, Moondream on AMD
if use_amd:
return globals.LLAMA_AMD_URL, "moondream-amd"
else:
return globals.LLAMA_URL, "vision"
else:
# Text generation - round robin between GPUs
if use_amd:
return globals.LLAMA_AMD_URL, "llama3.1-amd"
else:
return globals.LLAMA_URL, "llama3.1"
def get_vision_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]:
"""
Get the appropriate vision model based on GPU preference.
Args:
prefer_amd: If True, use AMD GPU vision model
Returns:
Tuple of (url, model_name)
"""
if prefer_amd and AMD_MODELS_ENABLED:
return globals.LLAMA_AMD_URL, "moondream-amd"
else:
return globals.LLAMA_URL, "vision"
def get_text_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]:
"""
Get the appropriate text model based on GPU preference.
Args:
prefer_amd: If True, use AMD GPU text model
Returns:
Tuple of (url, model_name)
"""
if prefer_amd and AMD_MODELS_ENABLED:
return globals.LLAMA_AMD_URL, "llama3.1-amd"
else:
return globals.LLAMA_URL, "llama3.1"
def log_gpu_routing(model_name: str, endpoint: str, task_type: str = "inference"):
"""
Log GPU routing decision for debugging.
Args:
model_name: Name of the model being used
endpoint: URL endpoint being used
task_type: Type of task being performed
"""
gpu_type = "AMD RX 6800" if endpoint == globals.LLAMA_AMD_URL else "NVIDIA"
logger.info(f"[GPU Router] {task_type} - Using {model_name} on {gpu_type} ({endpoint})")
# Example usage in bot code:
"""
# Simple routing by model name
url = get_endpoint_for_model("llama3.1-amd")
# Load balanced routing
url, model = get_llama_url_with_load_balancing(task_type="text")
response = requests.post(
f"{url}/v1/chat/completions",
json={"model": model, ...}
)
# Vision model with GPU preference
url, model = get_vision_model_for_gpu(prefer_amd=True)
# With logging
url = get_endpoint_for_model("darkidol-amd")
log_gpu_routing("darkidol-amd", url, "evil mode generation")
"""