# syntax=docker/dockerfile:1
# RealtimeSTT Container
# Uses Faster-Whisper with CUDA for GPU-accelerated inference
# Includes Silero VAD (ONNX, CPU-only) for robust voice detection
#
# Updated per RealtimeSTT PR #295:
# - CUDA 12.8.1 (latest stable)
# - PyTorch CPU-only (for Silero VAD tensor ops only - saves ~2.3 GB)
# - Faster-Whisper/CTranslate2 uses CUDA directly, no PyTorch GPU needed
# - Ubuntu 24.04 base
FROM nvidia/cuda:12.8.1-cudnn-runtime-ubuntu24.04

# Build-time only: suppress interactive apt prompts. Declared as ARG (not
# ENV) so it does not leak into the runtime environment of the container.
ARG DEBIAN_FRONTEND=noninteractive

# Unbuffered stdout/stderr so server logs appear in `docker logs` immediately
ENV PYTHONUNBUFFERED=1

# Set working directory
WORKDIR /app

# Install system dependencies (Ubuntu 24.04 ships Python 3.12 by default).
# --no-install-recommends keeps the image lean; the apt lists are removed
# in the same layer so they never persist in the image. Packages sorted
# alphabetically for diffability.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    ffmpeg \
    git \
    libportaudio2 \
    libsndfile1 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch CPU-only in its own layer (large download, changes rarely,
# so it stays cached while app requirements churn). GPU transcription goes
# through CTranslate2 directly - no PyTorch CUDA build needed.
# --break-system-packages is required on Ubuntu 24.04 (PEP 668 managed env).
COPY requirements-gpu-torch.txt .
RUN python3 -m pip install --break-system-packages --no-cache-dir -r requirements-gpu-torch.txt

# Copy and install other Python dependencies
COPY requirements.txt .
RUN python3 -m pip install --break-system-packages --no-cache-dir -r requirements.txt

# Application code last: it changes most often, so it invalidates the
# fewest cached layers.
COPY stt_server.py .

# Hugging Face cache directory (models are downloaded here on first run;
# mount a volume at this path to persist them across container recreation)
RUN mkdir -p /root/.cache/huggingface

# WebSocket port (documentation only - publish with -p/--publish at run time)
EXPOSE 8766

# Health check: open a TCP connection to the WebSocket port from inside the
# container using pure Python (no netcat/curl dependency in the image).
# The long start-period accounts for the initial model download.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD python3 -c "import socket; s=socket.socket(); s.settimeout(2); s.connect(('localhost', 8766)); s.close()" || exit 1

# Exec form so the server runs as PID 1 and receives SIGTERM on `docker stop`.
# NOTE(review): the container runs as root - the HF cache lives under /root
# and some GPU runtimes need it; consider a dedicated user + HF_HOME later.
CMD ["python3", "stt_server.py"]