Files
miku-discord/stt-realtime/Dockerfile

59 lines
1.6 KiB
Docker

# syntax=docker/dockerfile:1
# RealtimeSTT Container
# Uses Faster-Whisper with CUDA for GPU-accelerated inference
# Includes dual VAD (WebRTC + Silero) for robust voice detection
FROM nvidia/cuda:12.6.2-cudnn-runtime-ubuntu22.04

# Build-time only: suppress interactive apt prompts. ARG (not ENV) so the
# variable does not leak into the runtime environment of the final image.
ARG DEBIAN_FRONTEND=noninteractive

# Unbuffered stdout/stderr so server logs reach `docker logs` immediately.
ENV PYTHONUNBUFFERED=1

# Set working directory (created automatically if missing)
WORKDIR /app

# System dependencies (alphabetical for diffability):
#   build-essential / python3.11-dev / portaudio19-dev - toolchain + headers
#     for wheels that compile at install time (e.g. PyAudio, webrtcvad)
#   ffmpeg      - audio decoding used by faster-whisper
#   libsndfile1 - soundfile backend
#   libportaudio2 - PortAudio runtime for PyAudio
#   git / curl  - pip VCS installs and model/asset fetches
# --no-install-recommends keeps the image lean; apt lists are removed in the
# same layer so they never persist into the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    ffmpeg \
    git \
    libportaudio2 \
    libsndfile1 \
    portaudio19-dev \
    python3.11 \
    python3.11-dev \
    python3.11-venv \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip (--no-cache-dir so the pip cache is not baked into the layer)
RUN python3.11 -m pip install --no-cache-dir --upgrade pip

# Copy requirements first (for Docker layer caching): this layer is reused
# until requirements.txt itself changes, even when application code changes.
COPY requirements.txt .

# Install Python dependencies
RUN python3.11 -m pip install --no-cache-dir -r requirements.txt

# Install PyTorch with CUDA 12.1 wheels (forward-compatible with the CUDA 12.6
# runtime in the base image). Deliberately AFTER requirements.txt so the
# CUDA build overrides any CPU-only torch a transitive dependency pulled in.
RUN python3.11 -m pip install --no-cache-dir \
    torch==2.5.1+cu121 \
    torchaudio==2.5.1+cu121 \
    --index-url https://download.pytorch.org/whl/cu121

# Copy application code (after all dependency layers so code edits do not
# invalidate the expensive install layers above)
COPY stt_server.py .

# Pre-create the Hugging Face cache dir; Whisper models are downloaded here
# on first run (mount a volume at /root/.cache/huggingface to persist them).
# NOTE(review): image runs as root, which is why this path resolves; moving
# to a non-root USER would require relocating the cache (e.g. via HF_HOME)
# and updating any volume mounts - confirm before changing.
RUN mkdir -p /root/.cache/huggingface

# Expose WebSocket port (documentation only; publish with -p at run time)
EXPOSE 8766

# Health check - open a TCP connection to the WebSocket port using stdlib
# Python (no curl/netcat required in the image). The long start period
# covers the initial model download on first boot.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
CMD python3.11 -c "import socket; s=socket.socket(); s.settimeout(2); s.connect(('localhost', 8766)); s.close()" || exit 1

# Exec-form CMD: the server runs as PID 1 and receives SIGTERM on `docker stop`
CMD ["python3.11", "stt_server.py"]