Files
miku-discord/stt-realtime/Dockerfile

59 lines
1.6 KiB
Docker

# syntax=docker/dockerfile:1
# RealtimeSTT Container
# Uses Faster-Whisper with CUDA for GPU-accelerated inference
# Includes dual VAD (WebRTC + Silero) for robust voice detection
FROM nvidia/cuda:12.6.2-cudnn-runtime-ubuntu22.04

# Build-time only: suppress interactive apt prompts. ARG (not ENV) so the
# variable does not leak into the runtime environment of the final image.
ARG DEBIAN_FRONTEND=noninteractive

# Unbuffered stdout/stderr so server logs reach `docker logs` immediately.
ENV PYTHONUNBUFFERED=1

# Set working directory (created automatically if missing)
WORKDIR /app

# System dependencies (alphabetical for diffability):
#   build-essential / python3.11-dev / portaudio19-dev - toolchain + headers
#     for wheels that compile at install time (e.g. PyAudio, webrtcvad)
#   ffmpeg      - audio decoding used by faster-whisper
#   libsndfile1 - soundfile backend
#   libportaudio2 - PortAudio runtime for PyAudio
#   git / curl  - pip VCS installs and model/asset fetches
# --no-install-recommends keeps the image lean; apt lists are removed in the
# same layer so they never persist into the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    ffmpeg \
    git \
    libportaudio2 \
    libsndfile1 \
    portaudio19-dev \
    python3.11 \
    python3.11-dev \
    python3.11-venv \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip (--no-cache-dir so the pip cache is not baked into the layer)
RUN python3.11 -m pip install --no-cache-dir --upgrade pip

# Copy requirements first (for Docker layer caching): this layer is reused
# until requirements.txt itself changes, even when application code changes.
COPY requirements.txt .

# Install Python dependencies
RUN python3.11 -m pip install --no-cache-dir -r requirements.txt

# Install PyTorch with CUDA 12.1 wheels (forward-compatible with the CUDA 12.6
# runtime in the base image). Deliberately AFTER requirements.txt so the
# CUDA build overrides any CPU-only torch a transitive dependency pulled in.
RUN python3.11 -m pip install --no-cache-dir \
    torch==2.5.1+cu121 \
    torchaudio==2.5.1+cu121 \
    --index-url https://download.pytorch.org/whl/cu121

# Copy application code (after all dependency layers so code edits do not
# invalidate the expensive install layers above)
COPY stt_server.py .

# Pre-create the Hugging Face cache dir; Whisper models are downloaded here
# on first run (mount a volume at /root/.cache/huggingface to persist them).
# NOTE(review): image runs as root, which is why this path resolves; moving
# to a non-root USER would require relocating the cache (e.g. via HF_HOME)
# and updating any volume mounts - confirm before changing.
RUN mkdir -p /root/.cache/huggingface

# Expose WebSocket port (documentation only; publish with -p at run time)
EXPOSE 8766

# Health check - open a TCP connection to the WebSocket port using stdlib
# Python (no curl/netcat required in the image). The long start period
# covers the initial model download on first boot.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
CMD python3.11 -c "import socket; s=socket.socket(); s.settimeout(2); s.connect(('localhost', 8766)); s.close()" || exit 1

# Exec-form CMD: the server runs as PID 1 and receives SIGTERM on `docker stop`
CMD ["python3.11", "stt_server.py"]