Files
miku-discord/soprano_to_rvc/soprano/soprano/server.py
koko210Serve 8ca716029e add: absorb soprano_to_rvc as regular subdirectory
Voice conversion pipeline (Soprano TTS → RVC) with Docker support.
Previously tracked as bare gitlink; removed .git/ directories and
absorbed into main repo for unified tracking.

Includes: Soprano TTS, RVC WebUI integration, Docker configs,
WebSocket API, and benchmark scripts.
Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index).
287 files (3.1GB of ML weights properly excluded via gitignore).
2026-03-04 00:24:53 +02:00

48 lines
1.4 KiB
Python

import base64
import io
import json
from typing import Generator
import numpy as np
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from scipy.io.wavfile import write
from torch import Tensor
from soprano.tts import SopranoTTS
# Load the TTS model once at import time so the first request does not
# pay the model-initialization cost.
tts = SopranoTTS(cache_size_mb=100)  # 100 MB inference cache (PEP 8: no spaces around kwarg '=')

app = FastAPI(title="Soprano TTS API")
def _tensor_to_wav_bytes(tensor: Tensor) -> bytes:
"""
Convert a 1D fp32 torch tensor to a WAV byte stream.
"""
# convert to int16
audio_int16 = (np.clip(tensor.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
wav_io = io.BytesIO()
write(wav_io, 32000, audio_int16) # 32kHz sample rate
wav_io.seek(0)
return wav_io.read()
@app.post("/v1/audio/speech")
async def create_speech(payload: dict):
"""
Minimal implementation of OpenAI's Speech endpoint.
Fields:
- input: string - text to synthesize
- model, voice, etc. are accepted but ignored.
- response_format: str - ignored, only support wav.
"""
text = payload.get("input")
if not isinstance(text, str) or not text.strip():
raise HTTPException(status_code=400, detail="`input` field must be a non-empty string.")
audio_tensor = tts.infer(text)
wav_bytes = _tensor_to_wav_bytes(audio_tensor)
return Response(content=wav_bytes, media_type="audio/wav", headers={"Content-Disposition": 'attachment; filename="speech.wav"'})