- miku-stt: switch PyTorch CUDA -> CPU-only (~2.5 GB savings)
  - Silero VAD already runs on CPU via ONNX (onnx=True), CUDA PyTorch was waste
  - faster-whisper/CTranslate2 uses CUDA directly, no PyTorch GPU needed
  - torch+torchaudio layer: 3.3 GB -> 796 MB; total image 9+ GB -> 6.83 GB
  - Tested: Silero VAD loads (ONNX), Whisper loads on cuda, server ready
- llama-swap-rocm: add root .dockerignore to fix 31 GB build context (sketch below)
  - Dockerfile clones all sources from git, never COPYs from context
  - 19 GB of GGUF model files were being transferred on every build
  - Now excludes everything (*), near-zero context transfer
- anime-face-detector: add .dockerignore to exclude accumulated outputs (sketch below)
  - api/outputs/ (56 accumulated detection files) no longer baked into image
  - api/__pycache__/ and images/ also excluded
  - .gitignore: remove .dockerignore exclusion so these files are tracked
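A minimal sketch of the two .dockerignore files described above, reconstructed from the bullet points rather than copied from the repos:

    # llama-swap-rocm/.dockerignore - the Dockerfile clones all sources from git,
    # so nothing from the 31 GB build context is needed
    *

    # anime-face-detector/.dockerignore - keep accumulated outputs and caches
    # out of the build context and image
    api/outputs
    api/__pycache__
    images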
# RealtimeSTT Container
# Uses Faster-Whisper with CUDA for GPU-accelerated inference
# Includes Silero VAD (ONNX, CPU-only) for robust voice detection
#
# Updated per RealtimeSTT PR #295:
# - CUDA 12.8.1 (latest stable)
# - PyTorch CPU-only (for Silero VAD tensor ops only - saves ~2.3 GB)
# - Faster-Whisper/CTranslate2 uses CUDA directly, no PyTorch GPU needed
# - Ubuntu 24.04 base

FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04

# Prevent interactive prompts during build
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1

# Set working directory
WORKDIR /app

# Install system dependencies (Ubuntu 24.04 has Python 3.12 by default)
RUN apt-get update && apt-get install -y \
    python3-pip \
    ffmpeg \
    libsndfile1 \
    libportaudio2 \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch CPU-only (for Silero VAD tensor ops - GPU transcription uses CTranslate2 directly)
COPY requirements-gpu-torch.txt .
RUN python3 -m pip install --break-system-packages --no-cache-dir -r requirements-gpu-torch.txt

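# requirements-gpu-torch.txt is not shown in this view; a minimal sketch of a
# CPU-only pin (an assumption about its contents, not the actual file) would
# point pip at the PyTorch CPU wheel index:
#   --index-url https://download.pytorch.org/whl/cpu
#   torch
#   torchaudio
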
# Copy and install other Python dependencies
COPY requirements.txt .
RUN python3 -m pip install --break-system-packages --no-cache-dir -r requirements.txt

# Copy application code
COPY stt_server.py .

# Create the Hugging Face cache directory (models are downloaded here on first run)
RUN mkdir -p /root/.cache/huggingface

# Expose WebSocket port
EXPOSE 8766

# Health check - open a TCP socket to verify the port is listening
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD python3 -c "import socket; s=socket.socket(); s.settimeout(2); s.connect(('localhost', 8766)); s.close()" || exit 1

# Run the server
CMD ["python3", "stt_server.py"]