"""
GPU Router Utility for Dual GPU Setup

Manages routing between NVIDIA and AMD GPUs for model inference.
"""

import os
import random
import logging
from typing import Literal

import globals

logger = logging.getLogger(__name__)

# Model to GPU mapping
MODEL_TO_GPU = {
    # NVIDIA models (primary GPU)
    "llama3.1": globals.LLAMA_URL,
    "text-model": globals.LLAMA_URL,
    "darkidol": globals.LLAMA_URL,
    "evil-model": globals.LLAMA_URL,
    "uncensored": globals.LLAMA_URL,
    "vision": globals.LLAMA_URL,
    "vision-model": globals.LLAMA_URL,
    "minicpm": globals.LLAMA_URL,

    # AMD models (secondary GPU - RX 6800)
    "llama3.1-amd": globals.LLAMA_AMD_URL,
    "text-model-amd": globals.LLAMA_AMD_URL,
    "amd-text": globals.LLAMA_AMD_URL,
    "darkidol-amd": globals.LLAMA_AMD_URL,
    "evil-model-amd": globals.LLAMA_AMD_URL,
    "uncensored-amd": globals.LLAMA_AMD_URL,
    "moondream-amd": globals.LLAMA_AMD_URL,
    "vision-amd": globals.LLAMA_AMD_URL,
    "moondream": globals.LLAMA_AMD_URL,
}

# Configuration
PREFER_AMD_GPU = os.getenv("PREFER_AMD_GPU", "false").lower() == "true"
AMD_MODELS_ENABLED = os.getenv("AMD_MODELS_ENABLED", "true").lower() == "true"
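
# Example environment configuration (a sketch; the variable names are the ones read
# above, the values shown are hypothetical):
#   PREFER_AMD_GPU=true       -> the load balancer below favors the AMD RX 6800
#   AMD_MODELS_ENABLED=false  -> every "-amd" alias falls back to the NVIDIA endpoint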


def get_endpoint_for_model(model_name: str) -> str:
    """
    Get the correct llama-swap endpoint for a model.

    Args:
        model_name: Name or alias of the model

    Returns:
        URL of the llama-swap endpoint (either NVIDIA or AMD)
    """
    endpoint = MODEL_TO_GPU.get(model_name, globals.LLAMA_URL)

    # If AMD models are disabled, fall back to the NVIDIA endpoint for AMD aliases too
    if not AMD_MODELS_ENABLED and endpoint == globals.LLAMA_AMD_URL:
        # Map the AMD alias to its NVIDIA equivalent so the fallback shows up in the log
        nvidia_model = model_name.replace("-amd", "")
        logger.warning(f"AMD GPU disabled, routing {model_name} to NVIDIA GPU as {nvidia_model}")
        endpoint = globals.LLAMA_URL

    return endpoint
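
# Illustrative routing with default settings (hypothetical calls; the URLs come from globals):
#   get_endpoint_for_model("llama3.1")   -> globals.LLAMA_URL      (NVIDIA)
#   get_endpoint_for_model("moondream")  -> globals.LLAMA_AMD_URL  (AMD RX 6800)
#   get_endpoint_for_model("unknown")    -> globals.LLAMA_URL      (default fallback)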


def is_amd_model(model_name: str) -> bool:
    """
    Check if a model runs on the AMD GPU.

    Args:
        model_name: Name or alias of the model

    Returns:
        True if model runs on AMD GPU, False otherwise
    """
    return model_name.endswith("-amd") or model_name in ["moondream", "moondream-amd", "vision-amd"]
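
# Expected behaviour of is_amd_model, for reference (hypothetical quick checks):
#   is_amd_model("llama3.1-amd") -> True
#   is_amd_model("moondream")    -> True   (bare alias, but mapped to the AMD endpoint above)
#   is_amd_model("llama3.1")     -> False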


def get_llama_url_with_load_balancing(
    prefer_amd: bool = False,
    task_type: Literal["text", "vision", "evil"] = "text"
) -> tuple[str, str]:
    """
    Get a llama-swap URL with optional load balancing between GPUs.
    Returns both the URL and the recommended model name.

    Args:
        prefer_amd: If True, prefer the AMD GPU when possible
        task_type: Type of task (text, vision, or evil)

    Returns:
        Tuple of (url, model_name)
    """
    if not AMD_MODELS_ENABLED:
        # AMD disabled, use NVIDIA only
        if task_type == "evil":
            return globals.LLAMA_URL, "darkidol"
        elif task_type == "vision":
            return globals.LLAMA_URL, "vision"
        else:
            return globals.LLAMA_URL, "llama3.1"

    # AMD enabled - pick a GPU for this request
    use_amd = prefer_amd or PREFER_AMD_GPU or (random.random() < 0.5)

    if task_type == "evil":
        # Evil/uncensored models
        if use_amd:
            return globals.LLAMA_AMD_URL, "darkidol-amd"
        else:
            return globals.LLAMA_URL, "darkidol"

    elif task_type == "vision":
        # Vision models - MiniCPM on NVIDIA, Moondream on AMD
        if use_amd:
            return globals.LLAMA_AMD_URL, "moondream-amd"
        else:
            return globals.LLAMA_URL, "vision"

    else:
        # Text generation - random 50/50 split between GPUs (not a strict round robin)
        if use_amd:
            return globals.LLAMA_AMD_URL, "llama3.1-amd"
        else:
            return globals.LLAMA_URL, "llama3.1"
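
# Illustrative results (hypothetical calls; when AMD is enabled the pick depends on a coin flip):
#   get_llama_url_with_load_balancing(task_type="evil")   -> (LLAMA_URL, "darkidol") or (LLAMA_AMD_URL, "darkidol-amd")
#   get_llama_url_with_load_balancing(task_type="vision") -> (LLAMA_URL, "vision")   or (LLAMA_AMD_URL, "moondream-amd")
#   Passing prefer_amd=True always selects the AMD RX 6800 while it is enabled.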


def get_vision_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]:
    """
    Get the appropriate vision model based on GPU preference.

    Args:
        prefer_amd: If True, use the AMD GPU vision model

    Returns:
        Tuple of (url, model_name)
    """
    if prefer_amd and AMD_MODELS_ENABLED:
        return globals.LLAMA_AMD_URL, "moondream-amd"
    else:
        return globals.LLAMA_URL, "vision"


def get_text_model_for_gpu(prefer_amd: bool = False) -> tuple[str, str]:
    """
    Get the appropriate text model based on GPU preference.

    Args:
        prefer_amd: If True, use the AMD GPU text model

    Returns:
        Tuple of (url, model_name)
    """
    if prefer_amd and AMD_MODELS_ENABLED:
        return globals.LLAMA_AMD_URL, "llama3.1-amd"
    else:
        return globals.LLAMA_URL, "llama3.1"


def log_gpu_routing(model_name: str, endpoint: str, task_type: str = "inference"):
    """
    Log a GPU routing decision for debugging.

    Args:
        model_name: Name of the model being used
        endpoint: URL endpoint being used
        task_type: Type of task being performed
    """
    gpu_type = "AMD RX 6800" if endpoint == globals.LLAMA_AMD_URL else "NVIDIA"
    logger.info(f"[GPU Router] {task_type} - Using {model_name} on {gpu_type} ({endpoint})")
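
# Example log line produced by the call above (the endpoint value is hypothetical):
#   [GPU Router] vision - Using moondream-amd on AMD RX 6800 (<globals.LLAMA_AMD_URL>)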


# Example usage in bot code:
"""
import requests

# Simple routing by model name
url = get_endpoint_for_model("llama3.1-amd")

# Load balanced routing
url, model = get_llama_url_with_load_balancing(task_type="text")
response = requests.post(
    f"{url}/v1/chat/completions",
    json={"model": model, ...}
)

# Vision model with GPU preference
url, model = get_vision_model_for_gpu(prefer_amd=True)

# With logging
url = get_endpoint_for_model("darkidol-amd")
log_gpu_routing("darkidol-amd", url, "evil mode generation")
"""
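

# A minimal smoke test, assuming the local `globals` module defines LLAMA_URL and
# LLAMA_AMD_URL; it only prints routing decisions and makes no network calls.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Route a few representative aliases through the static mapping
    for name in ("llama3.1", "llama3.1-amd", "moondream", "unknown-model"):
        endpoint = get_endpoint_for_model(name)
        log_gpu_routing(name, endpoint, task_type="smoke-test")

    # Exercise the load balancer once per task type
    for task in ("text", "vision", "evil"):
        url, model = get_llama_url_with_load_balancing(task_type=task)
        log_gpu_routing(model, url, task_type=task)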