Add dual GPU support with web UI selector

Features:
- Built custom ROCm container for AMD RX 6800 GPU
- Added GPU selection toggle in web UI (NVIDIA/AMD)
- Unified model names across both GPUs for seamless switching
- Vision model always uses NVIDIA GPU (optimal performance)
- Text models (llama3.1, darkidol) can use either GPU
- Added /gpu-status and /gpu-select API endpoints (example calls sketched after this list)
- Implemented GPU state persistence in memory/gpu_state.json
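
A minimal sketch of how the two endpoints might be exercised from a client, assuming the bot API is reachable on localhost:8080 and that /gpu-select accepts a JSON body with a "gpu" field; the port, HTTP methods, and payload shape are assumptions, not taken from this commit:

    import requests

    BOT_API = "http://localhost:8080"  # assumed host/port; depends on how bot/api.py is served

    # Read the currently selected GPU (persisted in memory/gpu_state.json)
    print(requests.get(f"{BOT_API}/gpu-status").json())

    # Switch text models to the AMD RX 6800 (payload shape is a guess)
    requests.post(f"{BOT_API}/gpu-select", json={"gpu": "amd"})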

Technical details:
- Multi-stage Dockerfile.llamaswap-rocm with ROCm 6.2.4
- llama.cpp compiled with GGML_HIP=ON for gfx1030 (RX 6800)
- Proper GPU permissions without root (groups 187/989)
- AMD container on port 8091, NVIDIA on port 8090
- Updated bot/utils/llm.py with get_current_gpu_url() and get_vision_gpu_url() (sketched after this list)
- Modified bot/utils/image_handling.py to always use NVIDIA for vision
- Enhanced web UI with GPU selector button (blue=NVIDIA, red=AMD)
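
One plausible shape for the llm.py helpers named above, assuming the NVIDIA endpoint defaults to port 8090 and the AMD endpoint to port 8091 as listed, and that memory/gpu_state.json stores a single "selected_gpu" key; the state-file schema and exact signatures are assumptions, not copied from the commit:

    import json
    from pathlib import Path

    # Assumed defaults matching the ports above; the real values live in bot/globals.py
    LLAMA_URL = "http://localhost:8090"      # NVIDIA llama-swap
    LLAMA_AMD_URL = "http://localhost:8091"  # AMD llama-swap (ROCm)
    GPU_STATE_FILE = Path("memory/gpu_state.json")  # assumed path relative to the bot directory

    def get_current_gpu_url() -> str:
        """Return the endpoint for text models based on the persisted web-UI selection."""
        try:
            state = json.loads(GPU_STATE_FILE.read_text())
        except (FileNotFoundError, json.JSONDecodeError):
            state = {}
        # Assumed schema: {"selected_gpu": "nvidia" | "amd"}
        return LLAMA_AMD_URL if state.get("selected_gpu") == "amd" else LLAMA_URL

    def get_vision_gpu_url() -> str:
        """Vision always runs on the NVIDIA GPU, regardless of the selector."""
        return LLAMA_URL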

Files modified:
- docker-compose.yml (added llama-swap-amd service)
- bot/globals.py (added LLAMA_AMD_URL)
- bot/api.py (added GPU selection endpoints and helper function)
- bot/utils/llm.py (GPU routing for text models)
- bot/utils/image_handling.py (GPU routing for vision models)
- bot/static/index.html (GPU selector UI)
- llama-swap-rocm-config.yaml (unified model names)

New files:
- Dockerfile.llamaswap-rocm
- bot/memory/gpu_state.json
- bot/utils/gpu_router.py (load balancing utility)
- setup-dual-gpu.sh (setup verification script)
- DUAL_GPU_*.md (documentation files)
commit 1fc3d74a5b (parent ed5994ec78), 2026-01-09 00:03:59 +02:00
21 changed files with 2836 additions and 13 deletions

New file: bot/utils/gpu_router.py (191 lines)
"""
GPU Router Utility for Dual GPU Setup
Manages routing between NVIDIA and AMD GPUs for model inference
"""
import os
import random
import logging
from typing import Optional, Literal
import globals
logger = logging.getLogger(__name__)
# Model to GPU mapping
MODEL_TO_GPU = {
# NVIDIA models (primary GPU)
"llama3.1": globals.LLAMA_URL,
"text-model": globals.LLAMA_URL,
"darkidol": globals.LLAMA_URL,
"evil-model": globals.LLAMA_URL,
"uncensored": globals.LLAMA_URL,
"vision": globals.LLAMA_URL,
"vision-model": globals.LLAMA_URL,
"minicpm": globals.LLAMA_URL,
# AMD models (secondary GPU - RX 6800)
"llama3.1-amd": globals.LLAMA_AMD_URL,
"text-model-amd": globals.LLAMA_AMD_URL,
"amd-text": globals.LLAMA_AMD_URL,
"darkidol-amd": globals.LLAMA_AMD_URL,
"evil-model-amd": globals.LLAMA_AMD_URL,
"uncensored-amd": globals.LLAMA_AMD_URL,
"moondream-amd": globals.LLAMA_AMD_URL,
"vision-amd": globals.LLAMA_AMD_URL,
"moondream": globals.LLAMA_AMD_URL,
}
# Configuration
PREFER_AMD_GPU = os.getenv("PREFER_AMD_GPU", "false").lower() == "true"
AMD_MODELS_ENABLED = os.getenv("AMD_MODELS_ENABLED", "true").lower() == "true"
def get_endpoint_for_model(model_name: str) -> str:
"""
Get the correct llama-swap endpoint for a model.
Args:
model_name: Name or alias of the model
Returns:
URL of the llama-swap endpoint (either NVIDIA or AMD)
"""
endpoint = MODEL_TO_GPU.get(model_name, globals.LLAMA_URL)
# If AMD models are disabled, use NVIDIA for AMD models too
if not AMD_MODELS_ENABLED and endpoint == globals.LLAMA_AMD_URL:
logger.warning(f"AMD GPU disabled, routing {model_name} to NVIDIA GPU")
# Map AMD model name to NVIDIA equivalent
nvidia_model = model_name.replace("-amd", "")
endpoint = globals.LLAMA_URL
return endpoint
def is_amd_model(model_name: str) -> bool:
"""
Check if a model runs on the AMD GPU.
Args:
model_name: Name or alias of the model
Returns:
True if model runs on AMD GPU, False otherwise
"""
return model_name.endswith("-amd") or model_name in ["moondream", "moondream-amd", "vision-amd"]
def get_llama_url_with_load_balancing(
prefer_amd: bool = False,
task_type: Literal["text", "vision", "evil"] = "text"
) -> tuple[str, str]:
"""
Get llama URL with optional load balancing between GPUs.
Returns both URL and recommended model name.
Args:
prefer_amd: If True, prefer AMD GPU when possible
task_type: Type of task (text, vision, or evil)
Returns:
Tuple of (url, model_name)
"""
if not AMD_MODELS_ENABLED:
# AMD disabled, use NVIDIA only
if task_type == "evil":
return globals.LLAMA_URL, "darkidol"
elif task_type == "vision":
return globals.LLAMA_URL, "vision"
else:
return globals.LLAMA_URL, "llama3.1"
# AMD enabled - implement load balancing
use_amd = prefer_amd or PREFER_AMD_GPU or (random.random() < 0.5)
if task_type == "evil":
# Evil/uncensored models
if use_amd:
return globals.LLAMA_AMD_URL, "darkidol-amd"
else:
return globals.LLAMA_URL, "darkidol"
elif task_type == "vision":
# Vision models - MiniCPM on NVIDIA, Moondream on AMD
if use_amd:
return globals.LLAMA_AMD_URL, "moondream-amd"
else:
return globals.LLAMA_URL, "vision"
else:
# Text generation - round robin between GPUs
if use_amd:
return globals.LLAMA_AMD_URL, "llama3.1-amd"
else:
return globals.LLAMA_URL, "llama3.1"
def get_vision_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]:
"""
Get the appropriate vision model based on GPU preference.
Args:
prefer_amd: If True, use AMD GPU vision model
Returns:
Tuple of (url, model_name)
"""
if prefer_amd and AMD_MODELS_ENABLED:
return globals.LLAMA_AMD_URL, "moondream-amd"
else:
return globals.LLAMA_URL, "vision"
def get_text_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]:
"""
Get the appropriate text model based on GPU preference.
Args:
prefer_amd: If True, use AMD GPU text model
Returns:
Tuple of (url, model_name)
"""
if prefer_amd and AMD_MODELS_ENABLED:
return globals.LLAMA_AMD_URL, "llama3.1-amd"
else:
return globals.LLAMA_URL, "llama3.1"
def log_gpu_routing(model_name: str, endpoint: str, task_type: str = "inference"):
"""
Log GPU routing decision for debugging.
Args:
model_name: Name of the model being used
endpoint: URL endpoint being used
task_type: Type of task being performed
"""
gpu_type = "AMD RX 6800" if endpoint == globals.LLAMA_AMD_URL else "NVIDIA"
logger.info(f"[GPU Router] {task_type} - Using {model_name} on {gpu_type} ({endpoint})")
# Example usage in bot code:
"""
# Simple routing by model name
url = get_endpoint_for_model("llama3.1-amd")
# Load balanced routing
url, model = get_llama_url_with_load_balancing(task_type="text")
response = requests.post(
f"{url}/v1/chat/completions",
json={"model": model, ...}
)
# Vision model with GPU preference
url, model = get_vision_model_for_gpu(prefer_amd=True)
# With logging
url = get_endpoint_for_model("darkidol-amd")
log_gpu_routing("darkidol-amd", url, "evil mode generation")
"""