miku-discord/bot/utils/gpu_preload.py

"""
GPU Model Preloading Utility
Preloads models on AMD GPU to take advantage of 16GB VRAM
"""
import aiohttp
import asyncio
import json
import globals

async def preload_amd_models():
    """
    Preload both text and vision models on AMD GPU
    Since AMD RX 6800 has 16GB VRAM, we can keep both loaded simultaneously
    """
    print("🔧 Preloading models on AMD GPU...")

    # Preload text model
    try:
        async with aiohttp.ClientSession() as session:
            payload = {
                "model": "llama3.1",
                "messages": [{"role": "user", "content": "Hi"}],
                "max_tokens": 1
            }
            async with session.post(
                f"{globals.LLAMA_AMD_URL}/v1/chat/completions",
                json=payload,
                timeout=aiohttp.ClientTimeout(total=60)
            ) as response:
                if response.status == 200:
                    print("✅ Text model (llama3.1) preloaded on AMD GPU")
                else:
                    print(f"⚠️ Text model preload returned status {response.status}")
    except Exception as e:
        print(f"⚠️ Failed to preload text model on AMD: {e}")

    # Preload vision model
    try:
        async with aiohttp.ClientSession() as session:
            # Create a minimal test image (1x1 white pixel)
            import base64
            test_image = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg=="

            payload = {
                "model": "vision",
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "What do you see?"},
                            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{test_image}"}}
                        ]
                    }
                ],
                "max_tokens": 1
            }
            async with session.post(
                f"{globals.LLAMA_AMD_URL}/v1/chat/completions",
                json=payload,
                timeout=aiohttp.ClientTimeout(total=120)
            ) as response:
                if response.status == 200:
                    print("✅ Vision model preloaded on AMD GPU")
                else:
                    print(f"⚠️ Vision model preload returned status {response.status}")
    except Exception as e:
        print(f"⚠️ Failed to preload vision model on AMD: {e}")

    print("✅ AMD GPU preload complete - both models ready")