""" GPU Router Utility for Dual GPU Setup Manages routing between NVIDIA and AMD GPUs for model inference """ import os import random import logging from typing import Optional, Literal import globals logger = logging.getLogger(__name__) # Model to GPU mapping MODEL_TO_GPU = { # NVIDIA models (primary GPU) "llama3.1": globals.LLAMA_URL, "text-model": globals.LLAMA_URL, "darkidol": globals.LLAMA_URL, "evil-model": globals.LLAMA_URL, "uncensored": globals.LLAMA_URL, "vision": globals.LLAMA_URL, "vision-model": globals.LLAMA_URL, "minicpm": globals.LLAMA_URL, # AMD models (secondary GPU - RX 6800) "llama3.1-amd": globals.LLAMA_AMD_URL, "text-model-amd": globals.LLAMA_AMD_URL, "amd-text": globals.LLAMA_AMD_URL, "darkidol-amd": globals.LLAMA_AMD_URL, "evil-model-amd": globals.LLAMA_AMD_URL, "uncensored-amd": globals.LLAMA_AMD_URL, "moondream-amd": globals.LLAMA_AMD_URL, "vision-amd": globals.LLAMA_AMD_URL, "moondream": globals.LLAMA_AMD_URL, } # Configuration PREFER_AMD_GPU = os.getenv("PREFER_AMD_GPU", "false").lower() == "true" AMD_MODELS_ENABLED = os.getenv("AMD_MODELS_ENABLED", "true").lower() == "true" def get_endpoint_for_model(model_name: str) -> str: """ Get the correct llama-swap endpoint for a model. Args: model_name: Name or alias of the model Returns: URL of the llama-swap endpoint (either NVIDIA or AMD) """ endpoint = MODEL_TO_GPU.get(model_name, globals.LLAMA_URL) # If AMD models are disabled, use NVIDIA for AMD models too if not AMD_MODELS_ENABLED and endpoint == globals.LLAMA_AMD_URL: logger.warning(f"AMD GPU disabled, routing {model_name} to NVIDIA GPU") # Map AMD model name to NVIDIA equivalent nvidia_model = model_name.replace("-amd", "") endpoint = globals.LLAMA_URL return endpoint def is_amd_model(model_name: str) -> bool: """ Check if a model runs on the AMD GPU. Args: model_name: Name or alias of the model Returns: True if model runs on AMD GPU, False otherwise """ return model_name.endswith("-amd") or model_name in ["moondream", "moondream-amd", "vision-amd"] def get_llama_url_with_load_balancing( prefer_amd: bool = False, task_type: Literal["text", "vision", "evil"] = "text" ) -> tuple[str, str]: """ Get llama URL with optional load balancing between GPUs. Returns both URL and recommended model name. Args: prefer_amd: If True, prefer AMD GPU when possible task_type: Type of task (text, vision, or evil) Returns: Tuple of (url, model_name) """ if not AMD_MODELS_ENABLED: # AMD disabled, use NVIDIA only if task_type == "evil": return globals.LLAMA_URL, "darkidol" elif task_type == "vision": return globals.LLAMA_URL, "vision" else: return globals.LLAMA_URL, "llama3.1" # AMD enabled - implement load balancing use_amd = prefer_amd or PREFER_AMD_GPU or (random.random() < 0.5) if task_type == "evil": # Evil/uncensored models if use_amd: return globals.LLAMA_AMD_URL, "darkidol-amd" else: return globals.LLAMA_URL, "darkidol" elif task_type == "vision": # Vision models - MiniCPM on NVIDIA, Moondream on AMD if use_amd: return globals.LLAMA_AMD_URL, "moondream-amd" else: return globals.LLAMA_URL, "vision" else: # Text generation - round robin between GPUs if use_amd: return globals.LLAMA_AMD_URL, "llama3.1-amd" else: return globals.LLAMA_URL, "llama3.1" def get_vision_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]: """ Get the appropriate vision model based on GPU preference. 

def get_vision_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]:
    """
    Get the appropriate vision model based on GPU preference.

    Args:
        prefer_amd: If True, use AMD GPU vision model

    Returns:
        Tuple of (url, model_name)
    """
    if prefer_amd and AMD_MODELS_ENABLED:
        return globals.LLAMA_AMD_URL, "moondream-amd"
    else:
        return globals.LLAMA_URL, "vision"


def get_text_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]:
    """
    Get the appropriate text model based on GPU preference.

    Args:
        prefer_amd: If True, use AMD GPU text model

    Returns:
        Tuple of (url, model_name)
    """
    if prefer_amd and AMD_MODELS_ENABLED:
        return globals.LLAMA_AMD_URL, "llama3.1-amd"
    else:
        return globals.LLAMA_URL, "llama3.1"


def log_gpu_routing(model_name: str, endpoint: str, task_type: str = "inference"):
    """
    Log GPU routing decision for debugging.

    Args:
        model_name: Name of the model being used
        endpoint: URL endpoint being used
        task_type: Type of task being performed
    """
    gpu_type = "AMD RX 6800" if endpoint == globals.LLAMA_AMD_URL else "NVIDIA"
    logger.info(f"[GPU Router] {task_type} - Using {model_name} on {gpu_type} ({endpoint})")


# Example usage in bot code:
"""
# Simple routing by model name
url = get_endpoint_for_model("llama3.1-amd")

# Load balanced routing
url, model = get_llama_url_with_load_balancing(task_type="text")
response = requests.post(
    f"{url}/v1/chat/completions",
    json={"model": model, ...}
)

# Vision model with GPU preference
url, model = get_vision_model_for_gpu(prefer_amd=True)

# With logging
url = get_endpoint_for_model("darkidol-amd")
log_gpu_routing("darkidol-amd", url, "evil mode generation")
"""
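
# A minimal runnable demo: a sketch of exercising the routing helpers from
# the command line. It assumes the project's `globals` module defines
# LLAMA_URL and LLAMA_AMD_URL (as the import above implies) and makes no
# actual inference requests; it only logs routing decisions.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Name-based routing, including the AMD-disabled fallback path
    for name in ("llama3.1", "llama3.1-amd", "moondream"):
        url = get_endpoint_for_model(name)
        log_gpu_routing(name, url, "demo")

    # Load-balanced routing for each supported task type
    for task in ("text", "vision", "evil"):
        url, model = get_llama_url_with_load_balancing(task_type=task)
        log_gpu_routing(model, url, task)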