Voice conversion pipeline (Soprano TTS → RVC) with Docker support. Previously tracked as bare gitlink; removed .git/ directories and absorbed into main repo for unified tracking. Includes: Soprano TTS, RVC WebUI integration, Docker configs, WebSocket API, and benchmark scripts. Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index). 287 files (3.1GB of ML weights properly excluded via gitignore).
1868 lines
77 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Soprano + RVC Streaming API
|
||
Integrated pipeline: Soprano TTS → RVC Voice Conversion → HTTP Stream
|
||
Closely follows gui_v1.py logic for buffer management and processing
|
||
"""
|
||
import os
|
||
import sys
|
||
import json
|
||
import struct
|
||
import asyncio
|
||
import threading
|
||
import time
|
||
import uuid
|
||
import logging
|
||
from queue import Queue, Empty
|
||
from typing import Optional, Dict
|
||
from dataclasses import dataclass
|
||
|
||
# Apply torch.load patch for PyTorch 2.5+ compatibility with fairseq models
|
||
try:
|
||
import rvc_torch_patch
|
||
except ImportError:
|
||
pass # Patch not available (bare metal setup)
|
||
|
||
# Add soprano and RVC to path
|
||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'soprano'))
|
||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'Retrieval-based-Voice-Conversion-WebUI'))
|
||
|
||
import numpy as np
|
||
import torch
|
||
import torch.nn.functional as F
|
||
import torchaudio.transforms as tat
|
||
import zmq # For Soprano server communication
|
||
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
||
from fastapi.responses import StreamingResponse
|
||
from pydantic import BaseModel
|
||
|
||
# Soprano runs in separate process, don't import it here
|
||
# from soprano import SopranoTTS
|
||
from infer.lib import rtrvc as rvc_for_realtime
|
||
from configs.config import Config as RVCConfig
|
||
from tools.torchgate import TorchGate
|
||
from multiprocessing import Queue as MPQueue, cpu_count
|
||
|
||
# Setup logging
logging.basicConfig(
    level=logging.DEBUG,  # Enable DEBUG to see RVC internal errors
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# FastAPI application serving the HTTP/WebSocket streaming endpoints.
app = FastAPI(title="Soprano + RVC Streaming API")

# Add CORS middleware to allow WebSocket connections from any origin
from fastapi.middleware.cors import CORSMiddleware

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global state
# Process-wide singleton pipeline; created by startup code not visible in
# this chunk. pipeline_ready is flipped once warmup completes.
pipeline: Optional['SopranoRVCPipeline'] = None
pipeline_ready = False  # Flag to indicate warmup complete
|
||
|
||
@dataclass
class PipelineConfig:
    """Configuration matching gui_v1.py parameters"""
    # Path to the RVC voice model weights (.pth) and its retrieval index.
    pth_path: str
    index_path: str
    # Pitch shift applied by RVC (0 = unchanged; presumably semitones — as in gui_v1.py).
    pitch: int = 0
    # Formant shift passed straight through to the RVC model.
    formant: float = 0.0
    # Feature-retrieval blend ratio handed to the RVC index (0..1).
    index_rate: float = 0.3
    # Values < 1 enable RMS envelope mixing in process_through_rvc().
    rms_mix_rate: float = 0.25
    block_time: float = 0.20  # 0.20s blocks = 9600 samples, reduces CPU overhead
    crossfade_time: float = 0.05  # Proportional to block_time
    extra_time: float = 1.8  # Reduced from 2.5s (minimum safe value per ChatGPT)
    # Worker count for the multiprocessing queues used by the "harvest" f0 method.
    n_cpu: int = 4
    f0method: str = "rmvpe"  # CPU-bound but stable; GPU methods need kernel compilation
    # Soprano TTS output rate and final stream output rate, in Hz.
    soprano_sample_rate: int = 32000
    rvc_sample_rate: int = 48000
    # Phase-vocoder crossfade; False means plain SOLA fade windows.
    use_pv: bool = False
    # Input / output spectral-gating noise reduction (TorchGate) toggles.
    I_noise_reduce: bool = False
    O_noise_reduce: bool = False
|
||
|
||
class TextRequest(BaseModel):
    """Request body for text-to-speech endpoints: the raw text to synthesize."""
    text: str
|
||
|
||
def write_wav_header(sample_rate=48000, channels=1, bits_per_sample=16):
    """
    Build a 44-byte RIFF/WAVE header for a PCM stream.

    Both the RIFF size and the data-chunk size are written as 0xFFFFFFFF
    ("unknown length") so the header can prefix an open-ended live stream.

    Args:
        sample_rate: samples per second per channel.
        channels: number of interleaved channels.
        bits_per_sample: bit depth of each sample.

    Returns:
        bytes: the complete header, ready to send before raw PCM frames.
    """
    bytes_per_second = sample_rate * channels * bits_per_sample // 8
    frame_size = channels * bits_per_sample // 8

    # 16-byte PCM "fmt " chunk payload.
    fmt_payload = struct.pack(
        '<HHIIHH',
        1,                # audio format tag: 1 = PCM
        channels,
        sample_rate,
        bytes_per_second,
        frame_size,
        bits_per_sample,
    )

    return b''.join([
        b'RIFF',
        struct.pack('<I', 0xFFFFFFFF),      # RIFF size: unknown (streaming)
        b'WAVE',
        b'fmt ',
        struct.pack('<I', len(fmt_payload)),  # fmt chunk length (16)
        fmt_payload,
        b'data',
        struct.pack('<I', 0xFFFFFFFF),      # data size: unknown (streaming)
    ])
|
||
|
||
class SopranoRVCPipeline:
|
||
"""
|
||
Integrated Soprano → RVC pipeline
|
||
Based on gui_v1.py audio processing logic
|
||
"""
|
||
|
||
    def __init__(self, config: PipelineConfig):
        """
        Build the pipeline: load the RVC config and model, allocate the
        streaming buffers, and connect to the external Soprano TTS server.

        Args:
            config: PipelineConfig with model paths and timing parameters.

        Side effects: temporarily chdir()s into the RVC checkout (restored in
        a finally), opens a ZMQ REQ socket, and loads models onto the device
        selected by RVCConfig.
        """
        self.config = config

        # Change to RVC directory for config loading
        # (RVCConfig resolves its files relative to the working directory).
        rvc_dir = os.path.join(os.path.dirname(__file__), 'Retrieval-based-Voice-Conversion-WebUI')
        original_dir = os.getcwd()
        os.chdir(rvc_dir)

        try:
            self.rvc_config = RVCConfig()
            self.device = self.rvc_config.device
        finally:
            os.chdir(original_dir)

        logger.info(f"Using device: {self.device}")
        logger.info(f"Half precision: {self.rvc_config.is_half}")

        # Queues for multiprocessing (harvest f0 method)
        self.inp_q = MPQueue()
        self.opt_q = MPQueue()

        # Client management
        # text_queue feeds soprano_generator_worker; client_queues maps
        # client id -> per-client PCM queue, guarded by client_queues_lock.
        self.text_queue = Queue(maxsize=50)
        self.client_queues: Dict[str, Queue] = {}
        self.client_queues_lock = threading.Lock()
        self.running = False

        # Async pipeline: Queue between Soprano generator and RVC processor
        self.soprano_chunk_queue = Queue(maxsize=100)  # Large buffer - allow Soprano to run at full speed
        self.current_job_id = None
        self.soprano_done = threading.Event()  # Signal when Soprano finishes generating

        # Diagnostic logging flag
        self._log_gpu_state = os.environ.get('LOG_GPU_STATE', '0') == '1'

        # ZMQ connection to Soprano server (running on GTX 1660)
        self.zmq_context = zmq.Context()
        self.soprano_socket = self.zmq_context.socket(zmq.REQ)
        soprano_server = os.environ.get('SOPRANO_SERVER', 'tcp://localhost:5555')
        self.soprano_socket.connect(soprano_server)
        logger.info(f"✓ Connected to Soprano server at {soprano_server}")

        # Store paths for later use
        # (cwd was already restored above, so this is the caller's directory).
        self.original_dir = os.getcwd()
        self.rvc_base_dir = os.path.join(os.path.dirname(__file__), "Retrieval-based-Voice-Conversion-WebUI")

        # Initialize models
        # Soprano runs in separate process (GTX 1660)
        # self._initialize_soprano()  # Removed - using ZMQ instead
        self._initialize_rvc()
        self._initialize_buffers()

        logger.info("Pipeline initialization complete!")
|
||
|
||
def _call_soprano_server(self, text):
|
||
"""Call Soprano server via ZMQ and get audio"""
|
||
import uuid
|
||
job_id = str(uuid.uuid4())
|
||
|
||
request = {
|
||
'job_id': job_id,
|
||
'text': text
|
||
}
|
||
|
||
logger.debug(f"[Soprano] Sending request to server: {text[:50]}...")
|
||
self.soprano_socket.send_json(request)
|
||
|
||
logger.debug(f"[Soprano] Waiting for response...")
|
||
response = self.soprano_socket.recv_json()
|
||
|
||
if 'error' in response:
|
||
raise Exception(f"Soprano server error: {response['error']}")
|
||
|
||
# Convert back to numpy array
|
||
audio = np.array(response['audio'], dtype=np.float32)
|
||
logger.debug(f"[Soprano] Received {len(audio)} samples ({response['audio_duration']:.2f}s)")
|
||
|
||
return audio
|
||
|
||
    def _initialize_soprano(self):
        """Initialize Soprano TTS model"""
        # NOTE(review): dead code path. The module-level `from soprano import
        # SopranoTTS` is commented out (Soprano now runs in a separate process
        # behind ZMQ), so calling this method would raise NameError. Kept only
        # for the single-process bare-metal setup — confirm before reusing.
        logger.info("Loading Soprano TTS model...")
        # Force Soprano to use the same device as RVC
        device_str = 'cuda' if 'cuda' in str(self.device) else str(self.device)
        self.soprano = SopranoTTS(
            backend='lmdeploy',  # Using lmdeploy backend (proven faster)
            device=device_str,
            cache_size_mb=1000,  # Maxing out cache (plenty of VRAM available)
            decoder_batch_size=4  # Higher batching to maximize throughput
        )
        logger.info(f"✓ Soprano TTS loaded on {self.soprano.device} using {self.soprano.backend} backend")
|
||
|
||
    def _initialize_rvc(self):
        """
        Initialize RVC model - follows gui_v1.py start_vc()

        Loads HuBERT + the synthesizer via rvc_for_realtime.RVC, verifies the
        load actually succeeded (RVC's __init__ swallows exceptions), and
        applies inference-only torch settings. Runs with the working
        directory switched to the RVC checkout so its relative asset paths
        resolve; the original directory is always restored.

        Raises:
            FileNotFoundError: if the HuBERT checkpoint is missing.
            RuntimeError: if RVC's model/net_g attributes were not set.
        """
        logger.info("Loading RVC model...")
        logger.info(f" Model: {self.config.pth_path}")
        logger.info(f" Index: {self.config.index_path}")

        # Enable debug logging for RVC module
        rvc_logger = logging.getLogger('infer.lib.rtrvc')
        rvc_logger.setLevel(logging.DEBUG)

        torch.cuda.empty_cache()

        # Change to RVC directory so relative paths work (assets/hubert/hubert_base.pt)
        original_dir = os.getcwd()
        rvc_base_dir = os.path.join(os.path.dirname(__file__), "Retrieval-based-Voice-Conversion-WebUI")
        os.chdir(rvc_base_dir)
        logger.info(f" Working directory: {os.getcwd()}")

        try:
            # Check if HuBERT exists
            hubert_path = os.path.join(os.getcwd(), "assets/hubert/hubert_base.pt")
            if not os.path.exists(hubert_path):
                raise FileNotFoundError(f"HuBERT model not found at: {hubert_path}")
            logger.info(f" HuBERT path verified: {hubert_path}")

            # Convert paths to be relative to RVC directory
            # Config paths are relative to soprano_to_rvc dir, need to make them relative to RVC dir
            pth_path_abs = os.path.join(original_dir, self.config.pth_path)
            index_path_abs = os.path.join(original_dir, self.config.index_path)

            # Make them relative to RVC directory
            pth_path_rel = os.path.relpath(pth_path_abs, rvc_base_dir)
            index_path_rel = os.path.relpath(index_path_abs, rvc_base_dir)

            logger.info(f" Adjusted model path: {pth_path_rel}")
            logger.info(f" Adjusted index path: {index_path_rel}")

            # Positional signature mirrors gui_v1.py's RVC(...) call; the
            # trailing None is the last positional argument that call passes.
            self.rvc = rvc_for_realtime.RVC(
                self.config.pitch,
                self.config.formant,
                pth_path_rel,
                index_path_rel,
                self.config.index_rate,
                self.config.n_cpu,
                self.inp_q,
                self.opt_q,
                self.rvc_config,
                None
            )

            # Verify critical attributes were set
            if not hasattr(self.rvc, 'model') or self.rvc.model is None:
                # The RVC __init__ catches exceptions silently, so model might not be set
                raise RuntimeError(
                    "RVC model attribute not set - HuBERT model failed to load. "
                    "Check that assets/hubert/hubert_base.pt is accessible and valid."
                )
            if not hasattr(self.rvc, 'net_g') or self.rvc.net_g is None:
                raise RuntimeError("RVC net_g attribute not set - Synthesizer failed to load")
            if not hasattr(self.rvc, 'version'):
                self.rvc.version = 'v2'  # Default to v2 for modern models
                logger.info(" Version attribute not set by RVC, defaulting to v2")

            # Get target sample rate
            rvc_sr = getattr(self.rvc, 'tgt_sr', self.config.rvc_sample_rate)
            logger.info(f"✓ RVC model loaded (version: {self.rvc.version}, target SR: {rvc_sr}Hz)")

            # Apply torch optimizations for inference performance
            # NOTE: these are process-global settings, not scoped to RVC.
            logger.info("Applying torch inference optimizations...")
            torch.set_grad_enabled(False)  # Disable gradient computation (inference only)
            torch.backends.cudnn.benchmark = True  # Auto-tune convolution algorithms
            torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 for matrix ops
            torch.backends.cudnn.allow_tf32 = True  # Allow TF32 for cuDNN ops

            # Ensure RVC models are in FP16 if half precision is enabled
            if self.rvc_config.is_half:
                logger.info("Ensuring RVC models are in FP16...")
                self.rvc.model = self.rvc.model.half()
                self.rvc.net_g = self.rvc.net_g.half()

            logger.info("✓ Torch optimizations applied")
        except Exception as e:
            logger.error(f"Failed to initialize RVC: {e}")
            logger.error(f"Current directory: {os.getcwd()}")
            logger.error(f"Files in assets/hubert: {os.listdir('assets/hubert') if os.path.exists('assets/hubert') else 'DIR NOT FOUND'}")
            raise
        finally:
            # Always restore original directory
            os.chdir(original_dir)
|
||
|
||
    def _initialize_buffers(self):
        """
        Initialize all buffers, windows, and resamplers
        Closely follows gui_v1.py buffer setup logic

        Everything is sized in multiples of self.zc (target_sr // 100, i.e.
        10 ms of audio at the output rate) so block/crossfade/extra windows
        stay frame-aligned, matching gui_v1.py.
        """
        logger.info("Initializing buffers and processing pipeline...")

        # Sample rates
        self.soprano_sr = self.config.soprano_sample_rate  # 32000 Hz
        self.target_sr = self.config.rvc_sample_rate  # 48000 Hz (output)

        # Zero-crossing rate calculation
        # ("zc" naming comes from gui_v1.py; effectively 10 ms at target_sr.)
        self.zc = self.target_sr // 100  # 480 samples

        # Block frames (following gui_v1.py logic)
        self.block_frame = (
            int(np.round(self.config.block_time * self.target_sr / self.zc)) * self.zc
        )
        # Same block duration expressed at the 16 kHz RVC input rate
        # (160 samples per zc period of 480 at 48 kHz).
        self.block_frame_16k = 160 * self.block_frame // self.zc

        # Crossfade frames
        self.crossfade_frame = (
            int(np.round(self.config.crossfade_time * self.target_sr / self.zc)) * self.zc
        )
        self.sola_buffer_frame = min(self.crossfade_frame, 4 * self.zc)
        self.sola_search_frame = self.zc

        # Extra frames for processing
        # (leading context RVC sees but that is skipped in the output).
        self.extra_frame = (
            int(np.round(self.config.extra_time * self.target_sr / self.zc)) * self.zc
        )

        logger.info(f" Block frame: {self.block_frame} samples ({self.config.block_time}s)")
        logger.info(f" Crossfade: {self.crossfade_frame} samples ({self.config.crossfade_time}s)")
        logger.info(f" Extra time: {self.extra_frame} samples ({self.config.extra_time}s)")

        # Input buffers (following gui_v1.py)
        # Sliding 48 kHz context window: [extra | crossfade | search | block].
        self.input_wav = torch.zeros(
            self.extra_frame + self.crossfade_frame + self.sola_search_frame + self.block_frame,
            device=self.device,
            dtype=torch.float32,
        )

        self.input_wav_denoise = self.input_wav.clone()

        # Same window resampled to 16 kHz for RVC inference.
        self.input_wav_res = torch.zeros(
            160 * self.input_wav.shape[0] // self.zc,
            device=self.device,
            dtype=torch.float32,
        )

        # RMS buffer for threshold detection
        self.rms_buffer = np.zeros(4 * self.zc, dtype='float32')

        # SOLA buffers
        self.sola_buffer = torch.zeros(
            self.sola_buffer_frame, device=self.device, dtype=torch.float32
        )
        self.nr_buffer = self.sola_buffer.clone()
        self.output_buffer = self.input_wav.clone()

        # Processing parameters
        # Both are expressed in zc-sized frames, as rvc.infer() expects.
        self.skip_head = self.extra_frame // self.zc
        self.return_length = (
            self.block_frame + self.sola_buffer_frame + self.sola_search_frame
        ) // self.zc

        # Fade windows for SOLA crossfading (sin^2 ramp; fade-out is its complement).
        self.fade_in_window = (
            torch.sin(
                0.5 * np.pi * torch.linspace(
                    0.0, 1.0, steps=self.sola_buffer_frame,
                    device=self.device, dtype=torch.float32
                )
            ) ** 2
        )
        self.fade_out_window = 1 - self.fade_in_window

        # Resamplers
        # Soprano (32kHz) → Target (48kHz) for buffer accumulation
        self.resampler_soprano_to_48k = tat.Resample(
            orig_freq=self.soprano_sr,
            new_freq=self.target_sr,
            dtype=torch.float32
        ).to(self.device)

        # Target (48kHz) → RVC input (16kHz)
        self.resampler_48k_to_16k = tat.Resample(
            orig_freq=self.target_sr,
            new_freq=16000,
            dtype=torch.float32
        ).to(self.device)

        # Soprano (32kHz) → RVC input (16kHz) - DEPRECATED, keeping for compatibility
        # (process_soprano_chunk still uses this one for the 16 kHz leg.)
        self.resampler_soprano_to_16k = tat.Resample(
            orig_freq=self.soprano_sr,
            new_freq=16000,
            dtype=torch.float32
        ).to(self.device)

        # RVC output (tgt_sr) → Target output (48kHz); None when already 48 kHz.
        self.rvc_tgt_sr = getattr(self.rvc, 'tgt_sr', self.target_sr)
        if self.rvc_tgt_sr != self.target_sr:
            self.resampler_rvc_to_target = tat.Resample(
                orig_freq=self.rvc_tgt_sr,
                new_freq=self.target_sr,
                dtype=torch.float32
            ).to(self.device)
        else:
            self.resampler_rvc_to_target = None

        # Noise gate (if enabled); self.tg only exists when a gate is requested.
        if self.config.I_noise_reduce or self.config.O_noise_reduce:
            self.tg = TorchGate(
                sr=self.target_sr,
                n_fft=4 * self.zc,
                prop_decrease=0.9
            ).to(self.device)

        logger.info("✓ Buffers initialized")
|
||
|
||
def process_soprano_chunk(self, soprano_chunk: torch.Tensor) -> torch.Tensor:
|
||
"""
|
||
Process a Soprano output chunk through the buffer pipeline
|
||
Soprano outputs at 32kHz mono, we resample to 48kHz
|
||
Returns the chunk at 48kHz
|
||
"""
|
||
# Convert to tensor if needed
|
||
if not torch.is_tensor(soprano_chunk):
|
||
soprano_chunk = torch.from_numpy(soprano_chunk)
|
||
|
||
# Ensure chunk is on the correct device
|
||
soprano_chunk = soprano_chunk.to(self.device)
|
||
|
||
# Dual resampling strategy: resample from 32kHz to both 48kHz and 16kHz in parallel
|
||
# This avoids the 48kHz→16kHz step while keeping both buffers we need:
|
||
# - 48kHz for RMS mixing and final output buffers
|
||
# - 16kHz for RVC inference
|
||
chunk_48k = self.resampler_soprano_to_48k(soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]
|
||
chunk_16k = self.resampler_soprano_to_16k(soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]
|
||
|
||
# Return both for parallel buffer management
|
||
return chunk_48k, chunk_16k
|
||
|
||
def accumulate_and_process_block(self, chunk_48k: torch.Tensor, chunk_16k: torch.Tensor) -> bool:
|
||
"""
|
||
Accumulate samples in both 48kHz and 16kHz domains and slide buffers when we have enough
|
||
Returns True if we accumulated a full block and should process through RVC
|
||
|
||
Args:
|
||
chunk_48k: Audio chunk at 48kHz (for RMS mixing buffer)
|
||
chunk_16k: Audio chunk at 16kHz (for RVC inference buffer)
|
||
"""
|
||
# Accumulation buffers for both sample rates
|
||
if not hasattr(self, 'accumulation_buffer_48k'):
|
||
self.accumulation_buffer_48k = torch.tensor([], device=self.device, dtype=torch.float32)
|
||
if not hasattr(self, 'accumulation_buffer_16k'):
|
||
self.accumulation_buffer_16k = torch.tensor([], device=self.device, dtype=torch.float32)
|
||
|
||
self.accumulation_buffer_48k = torch.cat([self.accumulation_buffer_48k, chunk_48k])
|
||
self.accumulation_buffer_16k = torch.cat([self.accumulation_buffer_16k, chunk_16k])
|
||
|
||
# Check if we have accumulated enough samples for a block (at 48kHz rate)
|
||
# We need block_frame samples (12000 at 48kHz = 0.25s)
|
||
if self.accumulation_buffer_48k.shape[0] >= self.block_frame:
|
||
# Take exactly block_frame samples from 48kHz buffer
|
||
chunk_size = self.block_frame
|
||
block_data_48k = self.accumulation_buffer_48k[:chunk_size]
|
||
self.accumulation_buffer_48k = self.accumulation_buffer_48k[chunk_size:]
|
||
|
||
# Take corresponding samples from 16kHz buffer (same time duration)
|
||
block_data_16k = self.accumulation_buffer_16k[:self.block_frame_16k]
|
||
self.accumulation_buffer_16k = self.accumulation_buffer_16k[self.block_frame_16k:]
|
||
|
||
# Slide the 48kHz input buffer (for RMS mixing)
|
||
self.input_wav[:-chunk_size] = self.input_wav[chunk_size:].clone()
|
||
self.input_wav[-chunk_size:] = block_data_48k
|
||
|
||
# Slide the 16kHz RVC input buffer (for inference)
|
||
self.input_wav_res[:-self.block_frame_16k] = self.input_wav_res[self.block_frame_16k:].clone()
|
||
self.input_wav_res[-self.block_frame_16k:] = block_data_16k
|
||
|
||
return True # Ready to process through RVC
|
||
|
||
return False # Need more samples
|
||
|
||
def flush_buffers(self) -> list:
|
||
"""
|
||
Flush any remaining audio in accumulation buffers by padding to block size.
|
||
Returns list of RVC output chunks for any remaining audio.
|
||
|
||
Call this at the end of synthesis to ensure no audio is cut off.
|
||
"""
|
||
output_chunks = []
|
||
|
||
if not hasattr(self, 'accumulation_buffer_48k') or not hasattr(self, 'accumulation_buffer_16k'):
|
||
return output_chunks
|
||
|
||
# Check if there's remaining audio in the buffers
|
||
remaining_48k = self.accumulation_buffer_48k.shape[0]
|
||
remaining_16k = self.accumulation_buffer_16k.shape[0]
|
||
|
||
if remaining_48k > 0:
|
||
logger.debug(f"[Flush] Processing {remaining_48k} remaining samples @ 48kHz")
|
||
|
||
# Pad to block size with silence
|
||
pad_size_48k = self.block_frame - remaining_48k
|
||
pad_size_16k = self.block_frame_16k - remaining_16k
|
||
|
||
if pad_size_48k > 0:
|
||
padding_48k = torch.zeros(pad_size_48k, device=self.device, dtype=torch.float32)
|
||
padding_16k = torch.zeros(pad_size_16k, device=self.device, dtype=torch.float32)
|
||
|
||
chunk_48k = torch.cat([self.accumulation_buffer_48k, padding_48k])
|
||
chunk_16k = torch.cat([self.accumulation_buffer_16k, padding_16k])
|
||
else:
|
||
chunk_48k = self.accumulation_buffer_48k[:self.block_frame]
|
||
chunk_16k = self.accumulation_buffer_16k[:self.block_frame_16k]
|
||
|
||
# Process the final block
|
||
if self.accumulate_and_process_block(chunk_48k, chunk_16k):
|
||
rvc_output = self.process_through_rvc()
|
||
|
||
# Trim the padding from output (only keep the actual audio)
|
||
# Calculate how many output samples correspond to input samples
|
||
output_samples = int(remaining_48k) # Same rate (48kHz)
|
||
if output_samples > 0 and output_samples < len(rvc_output):
|
||
rvc_output = rvc_output[:output_samples]
|
||
|
||
output_chunks.append(rvc_output)
|
||
logger.debug(f"[Flush] Generated {len(rvc_output)} output samples")
|
||
|
||
# Clear the buffers
|
||
self.accumulation_buffer_48k = torch.tensor([], device=self.device, dtype=torch.float32)
|
||
self.accumulation_buffer_16k = torch.tensor([], device=self.device, dtype=torch.float32)
|
||
|
||
return output_chunks
|
||
|
||
    def process_through_rvc(self) -> Optional[torch.Tensor]:
        """
        Process accumulated buffer through RVC
        Returns RVC output chunk ready for streaming

        Pipeline: rvc.infer() on the 16 kHz window → resample to 48 kHz if
        needed → optional output noise gate → RMS envelope mixing → SOLA
        offset search + crossfade against the previous block's tail.
        Mutates self.sola_buffer (and self.output_buffer when the output
        noise gate is on) as cross-block state.
        """
        # GPU state logging (for diagnostic purposes)
        if hasattr(self, '_log_gpu_state') and self._log_gpu_state:
            mem_before = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0
            reserved_before = torch.cuda.memory_reserved() / 1024**3 if torch.cuda.is_available() else 0

        # Change to RVC directory for inference (needs to access assets/rmvpe/rmvpe.pt etc)
        current_dir = os.getcwd()
        os.chdir(self.rvc_base_dir)

        try:
            # RVC inference
            rvc_start = time.time()
            infer_wav = self.rvc.infer(
                self.input_wav_res,
                self.block_frame_16k,
                self.skip_head,
                self.return_length,
                self.config.f0method,
            )
            rvc_duration = time.time() - rvc_start

            # Log RVC inference time if slow
            if rvc_duration > 0.200:  # >200ms is slow
                logger.warning(f"[RVC] Slow inference: {rvc_duration*1000:.1f}ms")
                if hasattr(self, '_log_gpu_state') and self._log_gpu_state:
                    mem_after = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0
                    reserved_after = torch.cuda.memory_reserved() / 1024**3 if torch.cuda.is_available() else 0
                    logger.warning(f"[RVC] GPU mem: {mem_before:.2f}GB -> {mem_after:.2f}GB (reserved: {reserved_before:.2f} -> {reserved_after:.2f})")
        finally:
            # Restore original directory
            os.chdir(current_dir)

        # Convert to tensor if needed
        if not torch.is_tensor(infer_wav):
            infer_wav = torch.from_numpy(infer_wav).to(self.device)

        # Resample RVC output to target SR if needed
        if self.resampler_rvc_to_target is not None:
            infer_wav = self.resampler_rvc_to_target(infer_wav)

        # Output noise reduction (optional)
        # Keeps a sliding copy of recent output as the gate's noise reference.
        if self.config.O_noise_reduce:
            self.output_buffer[:-self.block_frame] = self.output_buffer[self.block_frame:].clone()
            self.output_buffer[-self.block_frame:] = infer_wav[-self.block_frame:]
            infer_wav = self.tg(infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)).squeeze(0)

        # Volume envelope mixing (RMS matching)
        # Scales the converted audio toward the loudness contour of the input;
        # rms_mix_rate = 1 would disable this entirely.
        if self.config.rms_mix_rate < 1:
            if self.config.I_noise_reduce:
                input_wav = self.input_wav_denoise[self.extra_frame:]
            else:
                input_wav = self.input_wav[self.extra_frame:]

            # Calculate RMS for input and output
            import librosa
            rms1 = librosa.feature.rms(
                y=input_wav[:infer_wav.shape[0]].cpu().numpy(),
                frame_length=4 * self.zc,
                hop_length=self.zc,
            )
            rms1 = torch.from_numpy(rms1).to(self.device)
            # Stretch the per-frame RMS curve to one value per sample.
            rms1 = F.interpolate(
                rms1.unsqueeze(0),
                size=infer_wav.shape[0] + 1,
                mode='linear',
                align_corners=True,
            )[0, 0, :-1]

            rms2 = librosa.feature.rms(
                y=infer_wav[:].cpu().numpy(),
                frame_length=4 * self.zc,
                hop_length=self.zc,
            )
            rms2 = torch.from_numpy(rms2).to(self.device)
            rms2 = F.interpolate(
                rms2.unsqueeze(0),
                size=infer_wav.shape[0] + 1,
                mode='linear',
                align_corners=True,
            )[0, 0, :-1]
            # Floor the denominator to avoid division blow-ups in silence.
            rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3)

            infer_wav *= torch.pow(rms1 / rms2, torch.tensor(1 - self.config.rms_mix_rate))

        # SOLA algorithm for seamless crossfading (from gui_v1.py)
        # Find the offset in the search window that best correlates with the
        # previous block's tail (normalized cross-correlation via conv1d).
        conv_input = infer_wav[None, None, :self.sola_buffer_frame + self.sola_search_frame]
        cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
        cor_den = torch.sqrt(
            F.conv1d(
                conv_input ** 2,
                torch.ones(1, 1, self.sola_buffer_frame, device=self.device)
            ) + 1e-8
        )

        sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
        infer_wav = infer_wav[sola_offset:]

        # Apply crossfade
        if not self.config.use_pv:  # Standard fade (phase vocoder disabled)
            infer_wav[:self.sola_buffer_frame] *= self.fade_in_window
            infer_wav[:self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window

        # Update SOLA buffer for next iteration
        self.sola_buffer[:] = infer_wav[self.block_frame:self.block_frame + self.sola_buffer_frame]

        # Return the block ready for output
        return infer_wav[:self.block_frame]
|
||
|
||
    def soprano_generator_worker(self):
        """
        Thread 1: Continuously generates Soprano chunks and puts them in queue
        Runs in parallel with RVC processor - NO BLOCKING!

        Loop: pull a text job from self.text_queue, fetch the full waveform
        from the Soprano server over ZMQ, slice it into ~0.5 s chunks and
        push them onto self.soprano_chunk_queue for rvc_processor_worker.
        Sets self.soprano_done when a job's chunks are all queued (or on
        error). Exits when self.running goes False.
        """
        logger.info("Soprano generator worker started")

        while self.running:
            try:
                # Get text job from queue
                # (short timeout so the loop can re-check self.running).
                try:
                    job = self.text_queue.get(timeout=0.1)
                    job_id = job['job_id']
                    text = job['text']
                    self.current_job_id = job_id

                    logger.info(f"[Soprano] Processing job {job_id[:8]}: {text[:50]}...")
                    start_time = time.time()

                    # Clear the done flag
                    self.soprano_done.clear()

                    # Get audio from Soprano server (GTX 1660) via ZMQ
                    soprano_audio = self._call_soprano_server(text)

                    # Convert to tensor and split into chunks for queue
                    soprano_audio_tensor = torch.from_numpy(soprano_audio).to(self.device).float()

                    # Split into chunks of ~0.5s each for queue processing
                    chunk_size = 16000  # 0.5s @ 32kHz
                    num_chunks = (len(soprano_audio_tensor) + chunk_size - 1) // chunk_size

                    chunk_count = 0
                    chunk_times = []
                    queue_wait_times = []

                    for i in range(num_chunks):
                        chunk_count += 1
                        chunk_start = i * chunk_size
                        chunk_end = min((i + 1) * chunk_size, len(soprano_audio_tensor))
                        # Chunks cross the queue as numpy arrays, not tensors.
                        soprano_chunk = soprano_audio_tensor[chunk_start:chunk_end].cpu().numpy()

                        chunk_times.append(time.time())

                        # Put chunk in queue for RVC processor
                        # Measure if this blocks (queue full)
                        queue_put_start = time.time()
                        self.soprano_chunk_queue.put({
                            'job_id': job_id,
                            'chunk': soprano_chunk,
                            'chunk_num': chunk_count,
                            'timestamp': time.time()
                        })
                        queue_wait = time.time() - queue_put_start
                        queue_wait_times.append(queue_wait)

                        if queue_wait > 0.01:  # Log if queue put took >10ms (blocking)
                            logger.warning(f"[Soprano] Chunk {chunk_count} BLOCKED {queue_wait*1000:.1f}ms on queue.put() (queue full!)")

                        logger.debug(f"[Soprano] Queued chunk {chunk_count} for RVC (queue wait: {queue_wait*1000:.1f}ms)")

                    # Signal that Soprano is done generating for this job
                    self.soprano_done.set()
                    elapsed = time.time() - start_time
                    total_queue_wait = sum(queue_wait_times)
                    max_queue_wait = max(queue_wait_times) if queue_wait_times else 0

                    logger.info(f"[Soprano] Job {job_id[:8]} complete: {chunk_count} chunks in {elapsed:.2f}s")
                    logger.info(f"[Soprano] Total queue wait: {total_queue_wait:.2f}s ({total_queue_wait/elapsed*100:.1f}% of time)")
                    logger.info(f"[Soprano] Max queue wait: {max_queue_wait*1000:.1f}ms")
                    if total_queue_wait > 1.0:
                        logger.warning(f"[Soprano] ⚠️ BOTTLENECK: Spent {total_queue_wait:.2f}s blocked on queue! RVC too slow to drain.")

                except Empty:
                    # No job pending — loop back and re-check self.running.
                    continue

            except Exception as e:
                # Keep the worker alive; mark the job done so the consumer
                # side doesn't wait forever on a failed job.
                logger.error(f"[Soprano] Error: {e}", exc_info=True)
                self.soprano_done.set()
|
||
|
||
    def rvc_processor_worker(self):
        """
        Thread 2: Continuously pulls Soprano chunks from queue and processes through RVC
        Runs in parallel with Soprano generator - NO BLOCKING!

        For each queued chunk: dual-resample, drain as many full blocks as it
        yields through process_through_rvc(), convert to 16-bit PCM, and fan
        the bytes out to every registered client queue. Per-job timing stats
        are reported via _log_job_stats(). Exits when self.running goes False.
        """
        logger.info("RVC processor worker started")

        # Timing diagnostics
        job_start_time = None
        current_job_id = None
        block_times = []
        processing_times = []

        while self.running:
            try:
                # Get Soprano chunk from queue (with timeout to check running flag)
                try:
                    item = self.soprano_chunk_queue.get(timeout=0.1)
                    job_id = item['job_id']
                    soprano_chunk = item['chunk']
                    chunk_num = item['chunk_num']
                    chunk_timestamp = item['timestamp']

                    # Track job timing
                    if current_job_id != job_id:
                        # New job started
                        if current_job_id is not None and len(block_times) > 0:
                            # Log previous job stats
                            self._log_job_stats(current_job_id, job_start_time, block_times, processing_times)

                        # GPU state logging between jobs
                        if self._log_gpu_state and torch.cuda.is_available():
                            torch.cuda.synchronize()
                            mem_alloc = torch.cuda.memory_allocated() / 1024**3
                            mem_reserved = torch.cuda.memory_reserved() / 1024**3
                            logger.info(f"[GPU] Between jobs: {mem_alloc:.2f}GB allocated, {mem_reserved:.2f}GB reserved")

                        # Reset per-job accumulators.
                        current_job_id = job_id
                        job_start_time = time.time()
                        block_times = []
                        processing_times = []
                        logger.info(f"[RVC] Starting job {job_id[:8]}")

                        # GPU state at job start
                        if self._log_gpu_state and torch.cuda.is_available():
                            mem_alloc = torch.cuda.memory_allocated() / 1024**3
                            mem_reserved = torch.cuda.memory_reserved() / 1024**3
                            logger.info(f"[GPU] Job start: {mem_alloc:.2f}GB allocated, {mem_reserved:.2f}GB reserved")

                    queue_wait = time.time() - chunk_timestamp
                    logger.debug(f"[RVC] Processing chunk {chunk_num} (queue wait: {queue_wait*1000:.1f}ms)")

                    # Process and accumulate chunks (dual resampling: 48kHz + 16kHz)
                    chunk_48k, chunk_16k = self.process_soprano_chunk(soprano_chunk)

                    # IMPORTANT: Drain accumulation buffer - one Soprano chunk may fill multiple blocks!
                    blocks_in_this_chunk = 0
                    while self.accumulate_and_process_block(chunk_48k, chunk_16k):
                        blocks_in_this_chunk += 1
                        block_start = time.time()

                        # Process through RVC now that we have a full block
                        try:
                            processing_start = time.time()
                            rvc_output = self.process_through_rvc()
                            processing_duration = time.time() - processing_start
                            processing_times.append(processing_duration)

                            # Convert to PCM and broadcast
                            # (float [-1, 1] → little-endian signed 16-bit bytes).
                            pcm_data = (rvc_output.cpu().numpy() * 32767).clip(-32768, 32767).astype('int16').tobytes()

                            # Broadcast to all clients
                            with self.client_queues_lock:
                                dead_clients = []
                                for client_id, queue in list(self.client_queues.items()):
                                    try:
                                        queue.put(pcm_data, timeout=0.5)
                                    except Exception as e:
                                        logger.debug(f"Client {client_id[:8]} queue put failed: {e}")
                                        # A persistently full queue means the client
                                        # stopped draining — drop it below.
                                        if queue.full():
                                            dead_clients.append(client_id)

                                for client_id in dead_clients:
                                    del self.client_queues[client_id]
                                    logger.warning(f"Client {client_id[:8]} removed (queue full)")

                            block_times.append(time.time() - block_start)
                            logger.debug(f"[RVC] Block processed in {processing_duration*1000:.1f}ms")

                        except Exception as e:
                            logger.error(f"[RVC] Processing error: {e}", exc_info=True)

                        # After first block, give empty tensors for next iteration
                        # (the drain loop only re-checks what is already buffered).
                        chunk_48k = torch.tensor([], device=self.device, dtype=torch.float32)
                        chunk_16k = torch.tensor([], device=self.device, dtype=torch.float32)

                    logger.debug(f"[RVC] Chunk {chunk_num} produced {blocks_in_this_chunk} blocks")

                except Empty:
                    # Check if Soprano is done and queue is empty
                    if self.soprano_done.is_set() and self.soprano_chunk_queue.empty():
                        if current_job_id is not None and len(block_times) > 0:
                            # Log final job stats
                            self._log_job_stats(current_job_id, job_start_time, block_times, processing_times)
                            current_job_id = None
                    continue

            except Exception as e:
                logger.error(f"[RVC] Error: {e}", exc_info=True)
|
||
|
||
def _log_job_stats(self, job_id: str, start_time: float, block_times: list, processing_times: list) -> None:
    """Log timing statistics for a completed synthesis job.

    Called by the RVC processor worker once a job's queue drains.

    Args:
        job_id: UUID string of the job; only the first 8 chars are logged.
        start_time: time.time() timestamp captured when the job started.
        block_times: per-block wall-clock durations; its length is used as
            the number of blocks produced.
        processing_times: per-block RVC inference durations in seconds.
    """
    elapsed = time.time() - start_time
    # Each block corresponds to a fixed slice of output audio (block_time seconds).
    total_audio_duration = len(block_times) * self.config.block_time
    # >1.0 means the pipeline produced audio faster than real time.
    realtime_factor = total_audio_duration / elapsed if elapsed > 0 else 0
    avg_processing = sum(processing_times) / len(processing_times) if processing_times else 0

    logger.info(f"""
====== ASYNC PIPELINE PERFORMANCE ======
Job: {job_id[:8]}
Total elapsed: {elapsed:.2f}s
Blocks processed: {len(block_times)}
Audio duration: {total_audio_duration:.2f}s
Realtime factor: {realtime_factor:.2f}x
Avg RVC processing: {avg_processing*1000:.1f}ms per block
========================================
""")

    # Also append to a scratch file so stats survive log rotation / are easy to grep.
    with open('/tmp/soprano_timing_debug.txt', 'a') as f:
        f.write(f"\n=== ASYNC PIPELINE JOB {job_id[:8]} ===\n")
        f.write(f"Blocks: {len(block_times)}, Elapsed: {elapsed:.2f}s\n")
        f.write(f"Audio: {total_audio_duration:.2f}s, Realtime: {realtime_factor:.2f}x\n")
        f.write(f"Avg RVC: {avg_processing*1000:.1f}ms\n")
        f.write(f"==================\n\n")
def broadcast_worker(self):
    """
    DEPRECATED entry point kept for backward compatibility.

    The old synchronous implementation has been replaced: this method now
    simply launches soprano_generator_worker() and rvc_processor_worker()
    as parallel daemon threads, then idles until self.running is cleared.
    """
    logger.info("Starting async pipeline workers...")
    self.running = True

    # Spawn generator and processor side by side (order matters: Soprano first).
    workers = [
        threading.Thread(target=self.soprano_generator_worker, daemon=True, name="Soprano-Generator"),
        threading.Thread(target=self.rvc_processor_worker, daemon=True, name="RVC-Processor"),
    ]
    for worker in workers:
        worker.start()

    logger.info("✓ Async pipeline workers started (Soprano + RVC running in parallel)")

    # Park this thread until stop() flips the flag.
    while self.running:
        time.sleep(0.1)

    logger.info("Broadcast worker stopped")
def start(self):
    """Launch broadcast_worker() on a background daemon thread."""
    threading.Thread(target=self.broadcast_worker, daemon=True).start()
    logger.info("Pipeline started")
def stop(self):
    """Signal every worker loop to exit by clearing the running flag."""
    self.running = False
    logger.info("Pipeline stopped")
@app.on_event("startup")
|
||
async def startup_event():
|
||
"""Initialize pipeline on startup"""
|
||
global pipeline, pipeline_ready
|
||
|
||
logger.info("="*60)
|
||
logger.info("Soprano + RVC Streaming API")
|
||
logger.info("="*60)
|
||
|
||
# Load configuration
|
||
config_path = os.path.join(os.path.dirname(__file__), "soprano_rvc_config.json")
|
||
|
||
if not os.path.exists(config_path):
|
||
logger.error(f"Configuration file not found: {config_path}")
|
||
raise FileNotFoundError(f"Config file missing: {config_path}")
|
||
|
||
with open(config_path, 'r') as f:
|
||
config_dict = json.load(f)
|
||
|
||
config = PipelineConfig(**config_dict)
|
||
|
||
logger.info("Initializing Soprano + RVC pipeline...")
|
||
logger.info("This may take 15-20 seconds...")
|
||
|
||
# Initialize pipeline (blocks until models loaded)
|
||
pipeline = SopranoRVCPipeline(config)
|
||
pipeline.start()
|
||
|
||
logger.info("="*60)
|
||
logger.info("✓ Pipeline ready! API accepting requests on port 8765")
|
||
logger.info("="*60)
|
||
|
||
# Warmup: Generate a test phrase to initialize all components
|
||
logger.info("🔥 Warming up pipeline with test generation...")
|
||
try:
|
||
# Text sized to generate ~1-2 seconds of audio (roughly 16 blocks @ 0.2s each = 3.2s)
|
||
warmup_text = "Hello, I'm Miku. Voice system initialized and ready."
|
||
warmup_audio = pipeline._call_soprano_server(warmup_text)
|
||
warmup_tensor = torch.from_numpy(warmup_audio).to(pipeline.device).float()
|
||
|
||
# Process all chunks through the full pipeline to ensure everything is warmed up
|
||
chunk_size = 3200 # 0.1s @ 32kHz
|
||
blocks_generated = 0
|
||
|
||
for i in range(0, len(warmup_tensor), chunk_size):
|
||
chunk = warmup_tensor[i:min(i + chunk_size, len(warmup_tensor))].cpu().numpy()
|
||
chunk_48k, chunk_16k = pipeline.process_soprano_chunk(chunk)
|
||
|
||
# Keep draining blocks
|
||
while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k):
|
||
_ = pipeline.process_through_rvc()
|
||
blocks_generated += 1
|
||
# After first block, subsequent chunks are empty
|
||
chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
|
||
chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
|
||
|
||
# Flush any remaining audio
|
||
flush_chunks = pipeline.flush_buffers()
|
||
blocks_generated += len(flush_chunks)
|
||
|
||
if blocks_generated > 0:
|
||
logger.info(f"✅ Warmup complete! Generated {blocks_generated} audio blocks. Pipeline is hot and ready.")
|
||
else:
|
||
logger.warning("⚠ Warmup didn't generate blocks, but pipeline initialized.")
|
||
|
||
pipeline_ready = True
|
||
|
||
except Exception as e:
|
||
logger.error(f"❌ Warmup failed: {e}", exc_info=True)
|
||
logger.warning("Pipeline will still accept requests, but first generation may be slow.")
|
||
pipeline_ready = True # Allow connections anyway
|
||
|
||
@app.on_event("shutdown")
|
||
async def shutdown_event():
|
||
"""Cleanup on shutdown"""
|
||
global pipeline
|
||
if pipeline:
|
||
pipeline.stop()
|
||
logger.info("Pipeline shutdown complete")
|
||
|
||
@app.post("/api/speak")
|
||
async def speak(request: TextRequest):
|
||
"""Queue text for synthesis"""
|
||
if pipeline is None:
|
||
return {"error": "Pipeline not initialized"}
|
||
|
||
job_id = str(uuid.uuid4())
|
||
pipeline.text_queue.put({'job_id': job_id, 'text': request.text})
|
||
|
||
return {
|
||
"status": "queued",
|
||
"message": "Text queued for synthesis",
|
||
"job_id": job_id,
|
||
"queue_size": pipeline.text_queue.qsize()
|
||
}
|
||
|
||
@app.post("/api/speak_to_file")
|
||
async def speak_to_file(request: TextRequest):
|
||
"""Synthesize text and save directly to WAV file (for testing)"""
|
||
if pipeline is None:
|
||
return {"error": "Pipeline not initialized"}
|
||
|
||
import wave
|
||
import tempfile
|
||
import os
|
||
|
||
job_id = str(uuid.uuid4())
|
||
output_path = f"/tmp/soprano_rvc_{job_id[:8]}.wav"
|
||
|
||
try:
|
||
logger.info(f"File synthesis job {job_id[:8]}: {request.text[:50]}...")
|
||
|
||
# Collect all audio chunks
|
||
audio_chunks = []
|
||
stream = pipeline.soprano.infer_stream(request.text, chunk_size=10)
|
||
|
||
chunk_count = 0
|
||
soprano_chunk_count = 0
|
||
|
||
for soprano_chunk in stream:
|
||
soprano_chunk_count += 1
|
||
# Debug logging (can be commented out for production)
|
||
# logger.info(f" Soprano chunk {soprano_chunk_count}: shape={soprano_chunk.shape if hasattr(soprano_chunk, 'shape') else len(soprano_chunk)}")
|
||
|
||
# Process and accumulate chunks (dual resampling)
|
||
chunk_48k, chunk_16k = pipeline.process_soprano_chunk(soprano_chunk)
|
||
# logger.info(f" After dual resample: 48k={chunk_48k.shape}, 16k={chunk_16k.shape}")
|
||
|
||
# IMPORTANT: Drain the accumulation buffer - one Soprano chunk may fill multiple blocks!
|
||
# Keep processing blocks until we don't have enough samples left
|
||
while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k):
|
||
chunk_count += 1
|
||
# logger.info(f" Block {chunk_count} ready - processing through RVC")
|
||
# logger.info(f" input_wav buffer: shape={pipeline.input_wav.shape}, mean={pipeline.input_wav.abs().mean():.6f}")
|
||
|
||
# Process through RVC
|
||
rvc_output = pipeline.process_through_rvc()
|
||
# logger.info(f" RVC output: shape={rvc_output.shape}, mean={rvc_output.abs().mean():.6f}, max={rvc_output.abs().max():.6f}")
|
||
audio_chunks.append(rvc_output.cpu().numpy())
|
||
|
||
# After first block, both chunks should be empty tensors for subsequent iterations
|
||
chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
|
||
chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
|
||
|
||
# CRITICAL: Flush any remaining audio in buffers
|
||
flush_chunks = pipeline.flush_buffers()
|
||
for rvc_output in flush_chunks:
|
||
audio_chunks.append(rvc_output.cpu().numpy())
|
||
|
||
# Concatenate all chunks
|
||
if audio_chunks:
|
||
import numpy as np
|
||
full_audio = np.concatenate(audio_chunks)
|
||
|
||
# Write to WAV file
|
||
with wave.open(output_path, 'wb') as wav_file:
|
||
wav_file.setnchannels(1) # Mono
|
||
wav_file.setsampwidth(2) # 16-bit
|
||
wav_file.setframerate(pipeline.config.rvc_sample_rate)
|
||
|
||
# Convert to int16 PCM
|
||
pcm_data = (full_audio * 32767).clip(-32768, 32767).astype('int16')
|
||
wav_file.writeframes(pcm_data.tobytes())
|
||
|
||
logger.info(f"File synthesis complete: {output_path} ({chunk_count} chunks, {len(full_audio)/pipeline.config.rvc_sample_rate:.2f}s)")
|
||
|
||
return {
|
||
"status": "complete",
|
||
"output_file": output_path,
|
||
"chunks": chunk_count,
|
||
"duration_seconds": len(full_audio) / pipeline.config.rvc_sample_rate
|
||
}
|
||
else:
|
||
return {"error": "No audio generated"}
|
||
|
||
except Exception as e:
|
||
logger.error(f"File synthesis error: {e}", exc_info=True)
|
||
return {"error": str(e)}
|
||
|
||
@app.post("/api/speak_soprano_only")
|
||
async def speak_soprano_only(request: TextRequest):
|
||
"""Synthesize with Soprano only (no RVC) for comparison"""
|
||
if pipeline is None:
|
||
return {"error": "Pipeline not initialized"}
|
||
|
||
import wave
|
||
import numpy as np
|
||
|
||
job_id = str(uuid.uuid4())
|
||
output_path = f"/tmp/soprano_only_{job_id[:8]}.wav"
|
||
|
||
try:
|
||
logger.info(f"Soprano-only synthesis: {request.text[:50]}...")
|
||
|
||
# Generate with Soprano
|
||
audio = pipeline.soprano.infer(request.text)
|
||
|
||
# Write to WAV file at Soprano's native sample rate
|
||
with wave.open(output_path, 'wb') as wav_file:
|
||
wav_file.setnchannels(1)
|
||
wav_file.setsampwidth(2)
|
||
wav_file.setframerate(pipeline.config.soprano_sample_rate) # 32kHz
|
||
|
||
# Convert to int16 PCM
|
||
if torch.is_tensor(audio):
|
||
audio = audio.cpu().numpy()
|
||
pcm_data = (audio * 32767).clip(-32768, 32767).astype('int16')
|
||
wav_file.writeframes(pcm_data.tobytes())
|
||
|
||
logger.info(f"Soprano-only complete: {output_path}")
|
||
|
||
return {
|
||
"status": "complete",
|
||
"output_file": output_path,
|
||
"sample_rate": pipeline.config.soprano_sample_rate,
|
||
"duration_seconds": len(audio) / pipeline.config.soprano_sample_rate
|
||
}
|
||
|
||
except Exception as e:
|
||
logger.error(f"Soprano-only synthesis error: {e}", exc_info=True)
|
||
return {"error": str(e)}
|
||
|
||
@app.post("/api/debug_pre_rvc")
|
||
async def debug_pre_rvc(request: TextRequest):
|
||
"""Debug endpoint: Save reconstructed audio BEFORE RVC processing"""
|
||
if pipeline is None:
|
||
return {"error": "Pipeline not initialized"}
|
||
|
||
import wave
|
||
import numpy as np
|
||
|
||
job_id = str(uuid.uuid4())
|
||
output_48k = f"/tmp/pre_rvc_48k_{job_id[:8]}.wav"
|
||
output_16k = f"/tmp/pre_rvc_16k_{job_id[:8]}.wav"
|
||
|
||
try:
|
||
logger.info(f"Pre-RVC debug: {request.text[:50]}...")
|
||
|
||
# Collect all input_wav states (what goes INTO RVC)
|
||
input_wav_chunks_48k = []
|
||
input_wav_chunks_16k = []
|
||
|
||
stream = pipeline.soprano.infer_stream(request.text, chunk_size=10)
|
||
|
||
chunk_count = 0
|
||
for soprano_chunk in stream:
|
||
chunk_48k, chunk_16k = pipeline.process_soprano_chunk(soprano_chunk)
|
||
|
||
while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k):
|
||
chunk_count += 1
|
||
|
||
# SAVE the input_wav buffer BEFORE RVC processes it
|
||
# This is the reconstructed audio from accumulated Soprano chunks
|
||
# Take the relevant portion (last block_frame + extra)
|
||
pre_rvc_48k = pipeline.input_wav[pipeline.extra_frame:pipeline.extra_frame + pipeline.block_frame].clone()
|
||
|
||
# For 16kHz, just take the last block worth of samples from input_wav_res
|
||
pre_rvc_16k = pipeline.input_wav_res[-pipeline.block_frame_16k:].clone()
|
||
|
||
input_wav_chunks_48k.append(pre_rvc_48k.cpu().numpy())
|
||
input_wav_chunks_16k.append(pre_rvc_16k.cpu().numpy())
|
||
|
||
# Still process through RVC for timing
|
||
pipeline.process_through_rvc()
|
||
|
||
chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
|
||
chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
|
||
|
||
# Save 48kHz version (reconstructed audio before RVC)
|
||
if input_wav_chunks_48k:
|
||
full_audio_48k = np.concatenate(input_wav_chunks_48k)
|
||
with wave.open(output_48k, 'wb') as wav_file:
|
||
wav_file.setnchannels(1)
|
||
wav_file.setsampwidth(2)
|
||
wav_file.setframerate(pipeline.config.rvc_sample_rate)
|
||
pcm_data = (full_audio_48k * 32767).clip(-32768, 32767).astype('int16')
|
||
wav_file.writeframes(pcm_data.tobytes())
|
||
|
||
# Save 16kHz version (what RVC actually receives)
|
||
if input_wav_chunks_16k:
|
||
full_audio_16k = np.concatenate(input_wav_chunks_16k)
|
||
with wave.open(output_16k, 'wb') as wav_file:
|
||
wav_file.setnchannels(1)
|
||
wav_file.setsampwidth(2)
|
||
wav_file.setframerate(16000) # RVC input sample rate
|
||
pcm_data = (full_audio_16k * 32767).clip(-32768, 32767).astype('int16')
|
||
wav_file.writeframes(pcm_data.tobytes())
|
||
|
||
logger.info(f"Pre-RVC debug complete: {output_48k}, {output_16k}")
|
||
|
||
return {
|
||
"status": "complete",
|
||
"output_48k": output_48k,
|
||
"output_16k": output_16k,
|
||
"chunks": chunk_count,
|
||
"info": "48k is reconstructed audio, 16k is what RVC receives"
|
||
}
|
||
|
||
except Exception as e:
|
||
logger.error(f"Pre-RVC debug error: {e}", exc_info=True)
|
||
return {"error": str(e)}
|
||
|
||
|
||
@app.get("/api/stream/continuous")
|
||
async def stream_continuous():
|
||
"""Continuous audio stream endpoint"""
|
||
if pipeline is None:
|
||
return {"error": "Pipeline not initialized"}
|
||
|
||
client_id = str(uuid.uuid4())
|
||
|
||
# Smaller queue for tighter flow control (20s buffer = 80 blocks at 0.25s each)
|
||
with pipeline.client_queues_lock:
|
||
pipeline.client_queues[client_id] = Queue(maxsize=80)
|
||
|
||
logger.info(f"Client {client_id[:8]} connected")
|
||
|
||
async def generate():
|
||
try:
|
||
# Send WAV header
|
||
yield write_wav_header(
|
||
sample_rate=pipeline.config.rvc_sample_rate,
|
||
channels=1,
|
||
bits_per_sample=16
|
||
)
|
||
|
||
# Wait for initial buffer (help VLC establish timing)
|
||
# Wait for at least 1-2 blocks before starting stream
|
||
initial_buffer_blocks = 2
|
||
buffered_chunks = []
|
||
while len(buffered_chunks) < initial_buffer_blocks:
|
||
try:
|
||
chunk = await asyncio.get_event_loop().run_in_executor(
|
||
None,
|
||
lambda: pipeline.client_queues[client_id].get(timeout=5.0)
|
||
)
|
||
buffered_chunks.append(chunk)
|
||
except Exception:
|
||
# Timeout waiting for initial buffer - start anyway
|
||
break
|
||
|
||
# Send initial buffer
|
||
for chunk in buffered_chunks:
|
||
yield chunk
|
||
|
||
# Stream remaining audio chunks
|
||
while True:
|
||
try:
|
||
chunk = await asyncio.get_event_loop().run_in_executor(
|
||
None,
|
||
lambda: pipeline.client_queues[client_id].get(timeout=0.1)
|
||
)
|
||
yield chunk
|
||
except Empty:
|
||
await asyncio.sleep(0.01)
|
||
continue
|
||
except Exception as e:
|
||
logger.error(f"Client {client_id[:8]} stream error: {e}")
|
||
break
|
||
|
||
finally:
|
||
with pipeline.client_queues_lock:
|
||
if client_id in pipeline.client_queues:
|
||
del pipeline.client_queues[client_id]
|
||
logger.info(f"Client {client_id[:8]} disconnected")
|
||
|
||
return StreamingResponse(
|
||
generate(),
|
||
media_type="audio/wav",
|
||
headers={
|
||
"Cache-Control": "no-cache",
|
||
"Connection": "keep-alive",
|
||
}
|
||
)
|
||
|
||
@app.get("/api/test_soprano_stream")
|
||
async def test_soprano_stream(text: str = "This is a test of raw Soprano TTS performance without RVC processing."):
|
||
"""Test Soprano TTS performance in isolation - streams raw Soprano output without RVC"""
|
||
if pipeline is None:
|
||
return {"error": "Pipeline not initialized"}
|
||
|
||
import time
|
||
import numpy as np
|
||
|
||
logger.info(f"Testing Soprano-only streaming: {text[:50]}...")
|
||
|
||
async def generate():
|
||
# Send WAV header (Soprano outputs at 32kHz)
|
||
yield write_wav_header(
|
||
sample_rate=32000,
|
||
channels=1,
|
||
bits_per_sample=16
|
||
)
|
||
|
||
# Track timing
|
||
start_time = time.time()
|
||
chunk_count = 0
|
||
total_samples = 0
|
||
|
||
# Stream directly from Soprano
|
||
stream = pipeline.soprano.infer_stream(text, chunk_size=10)
|
||
|
||
for soprano_chunk in stream:
|
||
chunk_count += 1
|
||
|
||
# Convert to int16 PCM
|
||
if not torch.is_tensor(soprano_chunk):
|
||
soprano_chunk = torch.from_numpy(soprano_chunk)
|
||
|
||
pcm_data = (soprano_chunk.cpu().numpy() * 32767).clip(-32768, 32767).astype('int16').tobytes()
|
||
total_samples += len(soprano_chunk)
|
||
|
||
yield pcm_data
|
||
|
||
# Log performance
|
||
elapsed = time.time() - start_time
|
||
audio_duration = total_samples / 32000 # Soprano is 32kHz
|
||
realtime_factor = audio_duration / elapsed if elapsed > 0 else 0
|
||
|
||
logger.info(f"Soprano-only test complete:")
|
||
logger.info(f" Chunks: {chunk_count}")
|
||
logger.info(f" Elapsed: {elapsed:.2f}s")
|
||
logger.info(f" Audio duration: {audio_duration:.2f}s")
|
||
logger.info(f" Realtime factor: {realtime_factor:.2f}x")
|
||
|
||
# Write to debug file
|
||
with open('/tmp/soprano_only_timing.txt', 'w') as f:
|
||
f.write(f"Soprano-only test results:\n")
|
||
f.write(f"Text: {text}\n")
|
||
f.write(f"Chunks: {chunk_count}\n")
|
||
f.write(f"Elapsed: {elapsed:.2f}s\n")
|
||
f.write(f"Audio duration: {audio_duration:.2f}s\n")
|
||
f.write(f"Realtime factor: {realtime_factor:.2f}x\n")
|
||
|
||
return StreamingResponse(
|
||
generate(),
|
||
media_type="audio/wav",
|
||
headers={
|
||
"Cache-Control": "no-cache",
|
||
"Connection": "keep-alive",
|
||
}
|
||
)
|
||
|
||
@app.get("/api/test_rvc_only")
|
||
async def test_rvc_only(input_file: str = "/tmp/soprano_test_3.wav"):
|
||
"""Test RVC performance in isolation - processes pre-existing Soprano audio"""
|
||
if pipeline is None:
|
||
return {"error": "Pipeline not initialized"}
|
||
|
||
import time
|
||
import wave
|
||
import numpy as np
|
||
|
||
logger.info(f"Testing RVC-only performance with input: {input_file}")
|
||
|
||
# Verify input file exists
|
||
if not os.path.exists(input_file):
|
||
return {"error": f"Input file not found: {input_file}"}
|
||
|
||
async def generate():
|
||
# Send WAV header (RVC outputs at 48kHz)
|
||
yield write_wav_header(
|
||
sample_rate=48000,
|
||
channels=1,
|
||
bits_per_sample=16
|
||
)
|
||
|
||
# Load the input audio file (Soprano output at 32kHz)
|
||
with wave.open(input_file, 'rb') as wav:
|
||
sample_rate = wav.getframerate()
|
||
n_channels = wav.getnchannels()
|
||
audio_data = wav.readframes(wav.getnframes())
|
||
audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0
|
||
|
||
logger.info(f"Loaded audio: {len(audio_np)} samples at {sample_rate}Hz")
|
||
|
||
# Convert to tensor and resample from 32kHz to 48kHz (input buffer) and 16kHz (RVC input)
|
||
audio_tensor = torch.from_numpy(audio_np).to(pipeline.device)
|
||
|
||
# Resample to 48kHz for input buffer (RMS mixing needs this)
|
||
audio_48k = pipeline.resampler_soprano_to_48k(audio_tensor.unsqueeze(0).unsqueeze(0))[0, 0]
|
||
|
||
# Resample to 16kHz for RVC inference
|
||
audio_16k = pipeline.resampler_soprano_to_16k(audio_tensor.unsqueeze(0).unsqueeze(0))[0, 0]
|
||
|
||
# Track timing
|
||
start_time = time.time()
|
||
block_count = 0
|
||
rvc_times = []
|
||
|
||
# Process through pipeline in blocks (simulating real streaming)
|
||
chunk_48k = audio_48k
|
||
chunk_16k = audio_16k
|
||
|
||
while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k):
|
||
block_count += 1
|
||
|
||
# Process through RVC
|
||
rvc_start = time.time()
|
||
rvc_output = pipeline.process_through_rvc()
|
||
rvc_duration = time.time() - rvc_start
|
||
rvc_times.append(rvc_duration)
|
||
|
||
# Convert to PCM and yield
|
||
pcm_data = (rvc_output.cpu().numpy() * 32767).clip(-32768, 32767).astype('int16').tobytes()
|
||
yield pcm_data
|
||
|
||
# Give empty tensors for next iteration
|
||
chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
|
||
chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
|
||
|
||
# Calculate stats
|
||
elapsed = time.time() - start_time
|
||
audio_duration = block_count * pipeline.config.block_time # 0.16s per block
|
||
realtime_factor = audio_duration / elapsed if elapsed > 0 else 0
|
||
avg_rvc = sum(rvc_times) / len(rvc_times) if rvc_times else 0
|
||
|
||
logger.info(f"RVC-only test complete:")
|
||
logger.info(f" Blocks: {block_count}")
|
||
logger.info(f" Elapsed: {elapsed:.2f}s")
|
||
logger.info(f" Audio duration: {audio_duration:.2f}s")
|
||
logger.info(f" Realtime factor: {realtime_factor:.2f}x")
|
||
logger.info(f" Avg RVC time: {avg_rvc*1000:.1f}ms")
|
||
|
||
# Write to debug file
|
||
with open('/tmp/rvc_only_timing.txt', 'w') as f:
|
||
f.write(f"RVC-only test results:\n")
|
||
f.write(f"Input file: {input_file}\n")
|
||
f.write(f"Blocks: {block_count}\n")
|
||
f.write(f"Elapsed: {elapsed:.2f}s\n")
|
||
f.write(f"Audio duration: {audio_duration:.2f}s\n")
|
||
f.write(f"Realtime factor: {realtime_factor:.2f}x\n")
|
||
f.write(f"Avg RVC processing: {avg_rvc*1000:.1f}ms per block\n")
|
||
f.write(f"Total RVC time: {sum(rvc_times):.2f}s ({sum(rvc_times)/elapsed*100:.1f}%)\n")
|
||
|
||
return StreamingResponse(
|
||
generate(),
|
||
media_type="audio/wav",
|
||
headers={
|
||
"Cache-Control": "no-cache",
|
||
"Connection": "keep-alive",
|
||
}
|
||
)
|
||
|
||
@app.get("/api/status")
|
||
async def status():
|
||
"""Get API status"""
|
||
if pipeline is None:
|
||
return {"status": "initializing", "ready": False}
|
||
|
||
with pipeline.client_queues_lock:
|
||
num_clients = len(pipeline.client_queues)
|
||
|
||
return {
|
||
"status": "running",
|
||
"ready": True,
|
||
"queue_size": pipeline.text_queue.qsize(),
|
||
"connected_clients": num_clients,
|
||
"config": {
|
||
"soprano_sr": pipeline.soprano_sr,
|
||
"target_sr": pipeline.target_sr,
|
||
"rvc_model_sr": pipeline.rvc_tgt_sr,
|
||
"block_time": pipeline.config.block_time,
|
||
"f0method": pipeline.config.f0method,
|
||
}
|
||
}
|
||
|
||
@app.get("/health")
|
||
async def health_check():
|
||
"""Health check endpoint for Docker healthcheck"""
|
||
if pipeline is None:
|
||
return {
|
||
"status": "unhealthy",
|
||
"error": "Pipeline not initialized"
|
||
}, 503
|
||
|
||
# Test Soprano server connectivity
|
||
soprano_connected = False
|
||
try:
|
||
# Quick connectivity test - don't actually synthesize
|
||
test_socket = pipeline.zmq_context.socket(zmq.REQ)
|
||
test_socket.setsockopt(zmq.RCVTIMEO, 2000) # 2s timeout
|
||
test_socket.connect(pipeline.soprano_socket.getsockopt(zmq.LAST_ENDPOINT).decode())
|
||
test_socket.close()
|
||
soprano_connected = True
|
||
except Exception as e:
|
||
logger.warning(f"Soprano health check failed: {e}")
|
||
soprano_connected = False
|
||
|
||
return {
|
||
"status": "healthy" if (soprano_connected and pipeline_ready) else "degraded",
|
||
"soprano_connected": soprano_connected,
|
||
"rvc_initialized": pipeline.rvc is not None,
|
||
"pipeline_ready": pipeline is not None,
|
||
"warmed_up": pipeline_ready
|
||
}
|
||
|
||
@app.post("/interrupt")
|
||
async def interrupt_synthesis():
|
||
"""
|
||
Interrupt current synthesis and flush all buffers.
|
||
Used when user speaks over Miku to cancel ongoing TTS playback.
|
||
|
||
Returns:
|
||
{"status": "interrupted", "flushed": true}
|
||
"""
|
||
global pipeline
|
||
|
||
if pipeline is None:
|
||
return {"status": "error", "message": "Pipeline not initialized"}, 503
|
||
|
||
try:
|
||
# Flush Soprano ZMQ socket (drain any pending audio chunks)
|
||
flushed_chunks = 0
|
||
while pipeline.soprano_socket.poll(timeout=0):
|
||
pipeline.soprano_socket.recv()
|
||
flushed_chunks += 1
|
||
|
||
# Clear RVC audio buffer (stop processing queued audio)
|
||
if hasattr(pipeline, 'rvc_audio_buffer'):
|
||
buffer_size = len(pipeline.rvc_audio_buffer)
|
||
pipeline.rvc_audio_buffer.clear()
|
||
logger.info(f"Interrupted: Flushed {flushed_chunks} ZMQ chunks, cleared {buffer_size} RVC buffer samples")
|
||
else:
|
||
logger.info(f"Interrupted: Flushed {flushed_chunks} ZMQ chunks")
|
||
|
||
return {
|
||
"status": "interrupted",
|
||
"flushed": True,
|
||
"zmq_chunks_flushed": flushed_chunks
|
||
}
|
||
|
||
except Exception as e:
|
||
logger.error(f"Interrupt failed: {e}", exc_info=True)
|
||
return {"status": "error", "message": str(e)}, 500
|
||
|
||
@app.websocket("/ws/stream")
|
||
async def websocket_stream(websocket: WebSocket):
|
||
"""
|
||
WebSocket endpoint for token-by-token streaming synthesis.
|
||
|
||
Protocol:
|
||
- Client sends: {"token": "text", "pitch_shift": 0, "flush": false}
|
||
- Server sends: binary audio data (PCM float32, 48kHz mono)
|
||
- Client sends: {"flush": true} to process remaining buffer
|
||
|
||
This enables real-time TTS as tokens arrive from LLM streaming.
|
||
"""
|
||
await websocket.accept()
|
||
logger.info("WebSocket client connected")
|
||
|
||
# Wait for warmup if not ready yet
|
||
if not pipeline_ready:
|
||
logger.warning("[WS] Pipeline not warmed up yet, waiting...")
|
||
await websocket.send_json({
|
||
"status": "warming_up",
|
||
"message": "Pipeline warming up, please wait..."
|
||
})
|
||
|
||
# Poll until ready (max 30 seconds)
|
||
for _ in range(60): # 60 * 0.5s = 30s
|
||
if pipeline_ready:
|
||
logger.info("[WS] Pipeline ready, proceeding with connection")
|
||
await websocket.send_json({
|
||
"status": "ready",
|
||
"message": "Pipeline ready!"
|
||
})
|
||
break
|
||
await asyncio.sleep(0.5)
|
||
else:
|
||
logger.error("[WS] Pipeline warmup timeout!")
|
||
await websocket.send_json({
|
||
"error": "Pipeline warmup timeout",
|
||
"message": "Pipeline failed to initialize in time"
|
||
})
|
||
await websocket.close()
|
||
return
|
||
|
||
# Buffer for accumulating tokens until sentence boundary
|
||
text_buffer = ""
|
||
config = {
|
||
"pitch_shift": 0,
|
||
"sample_rate": 48000
|
||
}
|
||
|
||
try:
|
||
while True:
|
||
# Receive token from client
|
||
logger.debug("[WS] Waiting for message...")
|
||
data = await websocket.receive_json()
|
||
logger.info(f"[WS] Received data: {data}")
|
||
|
||
# Update config if provided
|
||
if "pitch_shift" in data:
|
||
config["pitch_shift"] = data["pitch_shift"]
|
||
|
||
# Handle flush request (process whatever is in buffer)
|
||
if data.get("flush", False):
|
||
if text_buffer.strip():
|
||
logger.info(f"[WS] Flushing buffer: {text_buffer[:50]}...")
|
||
await _synthesize_and_send(websocket, text_buffer, config)
|
||
text_buffer = ""
|
||
continue
|
||
|
||
# Add token to buffer
|
||
token = data.get("token", "")
|
||
if not token:
|
||
continue
|
||
|
||
text_buffer += token
|
||
logger.debug(f"[WS] Token received: '{token}' | Buffer: {len(text_buffer)} chars")
|
||
|
||
# Check for sentence boundaries
|
||
# Synthesize when we hit punctuation or buffer gets too long
|
||
should_synthesize = False
|
||
|
||
# Sentence-ending punctuation
|
||
token_stripped = token.rstrip()
|
||
if token_stripped and token_stripped[-1] in ['.', '!', '?', '。', '!', '?']:
|
||
should_synthesize = True
|
||
logger.info(f"[WS] Sentence boundary detected: {text_buffer[:50]}...")
|
||
|
||
# Comma/pause (optional - creates more natural pauses)
|
||
elif token_stripped and token_stripped[-1] in [',', ';', ',', '、']:
|
||
should_synthesize = True
|
||
logger.info(f"[WS] Pause boundary detected: {text_buffer[:50]}...")
|
||
|
||
# Buffer too long (prevent memory issues)
|
||
elif len(text_buffer) > 200:
|
||
should_synthesize = True
|
||
logger.info(f"[WS] Buffer limit reached: {text_buffer[:50]}...")
|
||
|
||
# Synthesize if needed
|
||
if should_synthesize and text_buffer.strip():
|
||
await _synthesize_and_send(websocket, text_buffer, config)
|
||
text_buffer = ""
|
||
|
||
except WebSocketDisconnect:
|
||
logger.info("WebSocket client disconnected")
|
||
# Don't try to send anything - client is already gone
|
||
|
||
except Exception as e:
|
||
logger.error(f"WebSocket error: {e}", exc_info=True)
|
||
try:
|
||
await websocket.close(code=1011, reason=str(e))
|
||
except:
|
||
pass
|
||
|
||
async def _synthesize_and_send(websocket: WebSocket, text: str, config: dict):
    """
    Synthesize `text` through Soprano→RVC and stream the result over `websocket`.

    Sends raw PCM float32 audio at 48kHz mono as binary WebSocket frames.
    Checks the connection state before every send and aborts quietly if the
    client has gone away.

    Args:
        websocket: accepted FastAPI WebSocket connection.
        text: text to synthesize.
        config: per-connection settings dict; currently not read here
            (pitch_shift is tracked by the caller — NOTE(review): it is never
            applied in this function, confirm whether that is intentional).
    """
    if pipeline is None:
        await websocket.send_json({"error": "Pipeline not initialized"})
        return

    # Bail out early if the socket is no longer connected.
    # NOTE(review): compares the enum's raw value; 1 is assumed to be CONNECTED.
    if websocket.client_state.value != 1:  # 1 = CONNECTED
        logger.warning(f"[WS] Cannot send - WebSocket not connected (state: {websocket.client_state})")
        return

    try:
        # Performance tracking.
        pipeline_start = time.time()
        rvc_times = []
        total_blocks = 0

        # Get the full utterance from the Soprano server via ZMQ.
        soprano_start = time.time()
        soprano_audio = pipeline._call_soprano_server(text)
        soprano_time = time.time() - soprano_start

        soprano_audio_tensor = torch.from_numpy(soprano_audio).to(pipeline.device).float()

        # Split into streaming-sized chunks (0.1s @ 32kHz), ceil division for the tail.
        chunk_size = 3200  # 0.1s @ 32kHz
        num_chunks = (len(soprano_audio_tensor) + chunk_size - 1) // chunk_size

        for i in range(num_chunks):
            chunk_start = i * chunk_size
            chunk_end = min((i + 1) * chunk_size, len(soprano_audio_tensor))
            soprano_chunk = soprano_audio_tensor[chunk_start:chunk_end].cpu().numpy()

            # Dual resampling: 32kHz → 48kHz (input buffer) and 16kHz (RVC input).
            chunk_48k, chunk_16k = pipeline.process_soprano_chunk(soprano_chunk)

            # Drain the accumulation buffer - one Soprano chunk may fill multiple blocks!
            while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k):
                # Re-check the connection before each (potentially slow) block.
                if websocket.client_state.value != 1:
                    logger.warning("[WS] Client disconnected during synthesis, aborting")
                    return

                # Time RVC inference per block.
                rvc_start = time.time()
                rvc_output = pipeline.process_through_rvc()
                rvc_time = time.time() - rvc_start
                rvc_times.append(rvc_time)
                total_blocks += 1

                # Send as raw float32 PCM, 48kHz, mono.
                audio_bytes = rvc_output.cpu().numpy().astype(np.float32).tobytes()
                await websocket.send_bytes(audio_bytes)

                logger.debug(f"[WS] Sent audio chunk: {len(audio_bytes)} bytes ({len(rvc_output)} samples)")

                # Empty tensors so subsequent iterations only drain, not re-add.
                chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
                chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32)

        # CRITICAL: flush any remaining audio in the buffers; re-check connection first.
        if websocket.client_state.value != 1:
            logger.warning("[WS] Client disconnected before flush, skipping")
            return

        flush_chunks = pipeline.flush_buffers()
        for rvc_output in flush_chunks:
            if websocket.client_state.value != 1:
                logger.warning("[WS] Client disconnected during flush, aborting")
                return

            # NOTE(review): this "rvc_time" only measures the numpy conversion —
            # RVC already ran inside flush_buffers() — so flush blocks skew the
            # avg-RVC stat low. Confirm whether that is intended.
            rvc_start = time.time()
            audio_bytes = rvc_output.cpu().numpy().astype(np.float32).tobytes()
            rvc_time = time.time() - rvc_start
            rvc_times.append(rvc_time)
            total_blocks += 1

            await websocket.send_bytes(audio_bytes)
            logger.debug(f"[WS] Sent final flush chunk: {len(audio_bytes)} bytes")

        # Calculate and log performance metrics for the whole utterance.
        pipeline_time = time.time() - pipeline_start
        audio_duration = len(soprano_audio) / pipeline.soprano_sr  # Duration in seconds
        realtime_factor = audio_duration / pipeline_time if pipeline_time > 0 else 0
        avg_rvc_time = sum(rvc_times) / len(rvc_times) if rvc_times else 0

        logger.info(f"""
[WS] Job complete: {total_blocks} blocks in {pipeline_time:.2f}s
  Audio duration: {audio_duration:.2f}s
  Realtime factor: {realtime_factor:.2f}x
  Avg Soprano: {soprano_time*1000:.1f}ms
  Avg RVC: {avg_rvc_time*1000:.1f}ms
  Text: '{text[:50]}...'""")

    except RuntimeError as e:
        # Handle "Unexpected ASGI message" raised when the client disconnects mid-send.
        if "Unexpected ASGI message" in str(e) or "websocket.close" in str(e):
            logger.info(f"[WS] Client disconnected during synthesis: {e}")
        else:
            logger.error(f"RuntimeError during synthesis: {e}", exc_info=True)

    except Exception as e:
        logger.error(f"Synthesis error: {e}", exc_info=True)
        await websocket.send_json({"error": str(e)})
||
@app.get("/stream/wav")
async def stream_wav(text: str, pitch_shift: int = 0):
    """
    HTTP streaming endpoint that serves audio as WAV (playable in VLC/browsers).

    Usage:
        vlc http://localhost:8765/stream/wav?text=Hello%20world!
        curl http://localhost:8765/stream/wav?text=Test > output.wav

    Args:
        text: Text to synthesize.
        pitch_shift: Accepted for API symmetry but currently not applied by
            this endpoint — NOTE(review): wire through to the pipeline or
            drop; confirm against callers before removing.

    Returns: WAV file (PCM s16le, 48kHz, mono)
    """
    if pipeline is None:
        return {"error": "Pipeline not initialized"}

    import io
    import wave

    async def generate_wav():
        """Generate WAV file on the fly as audio chunks are synthesized"""
        # Create in-memory WAV file
        buffer = io.BytesIO()

        # Get audio from Soprano + RVC
        soprano_audio = pipeline._call_soprano_server(text)
        soprano_audio_tensor = torch.from_numpy(soprano_audio).to(pipeline.device).float()

        # Collect all audio chunks
        audio_chunks = []

        # Process through pipeline in fixed slices (0.1s @ 32kHz)
        chunk_size = 3200
        num_chunks = (len(soprano_audio_tensor) + chunk_size - 1) // chunk_size

        for i in range(num_chunks):
            chunk_start = i * chunk_size
            chunk_end = min((i + 1) * chunk_size, len(soprano_audio_tensor))
            soprano_chunk = soprano_audio_tensor[chunk_start:chunk_end].cpu().numpy()

            chunk_48k, chunk_16k = pipeline.process_soprano_chunk(soprano_chunk)

            # Drain every full block the accumulator can produce from this
            # slice; after the first pass we feed empty tensors so only the
            # already-buffered audio is consumed.
            while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k):
                rvc_output = pipeline.process_through_rvc()
                audio_chunks.append(rvc_output.cpu().numpy())

                chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
                chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32)

        # CRITICAL: Flush any remaining audio in buffers
        flush_chunks = pipeline.flush_buffers()
        for rvc_output in flush_chunks:
            audio_chunks.append(rvc_output.cpu().numpy())

        # Nothing synthesized (e.g. empty text): emit an empty body.
        if not audio_chunks:
            yield b''
            return

        # Concatenate all audio
        full_audio = np.concatenate(audio_chunks)

        # Convert float32 [-1, 1] to int16 PCM, clamped to the int16 range
        audio_int16 = (full_audio * 32767).clip(-32768, 32767).astype(np.int16)

        # Write WAV header and data
        with wave.open(buffer, 'wb') as wav:
            wav.setnchannels(1)      # Mono
            wav.setsampwidth(2)      # 16-bit
            wav.setframerate(48000)  # 48kHz
            wav.writeframes(audio_int16.tobytes())

        # Send the complete WAV in one chunk
        yield buffer.getvalue()

    return StreamingResponse(
        generate_wav(),
        media_type="audio/wav",
        headers={
            # Plain string literal: was an f-string with no placeholders (F541).
            "Content-Disposition": 'inline; filename="tts.wav"',
            "Cache-Control": "no-cache"
        }
    )
@app.get("/")
async def root():
    """Root endpoint: service metadata plus a map of the available routes."""
    endpoint_map = {
        "speak": "POST /api/speak - Queue text for synthesis",
        "stream": "GET /api/stream/continuous - Continuous audio stream",
        "stream_wav": "GET /stream/wav?text=... - HTTP streaming (VLC compatible)",
        "websocket": "WS /ws/stream - Token-by-token streaming (recommended for Discord)",
        "status": "GET /api/status - Pipeline status",
        "health": "GET /health - Health check",
    }
    return {
        "service": "Soprano + RVC Streaming API",
        "version": "1.0",
        "endpoints": endpoint_map,
    }
if __name__ == "__main__":
    # Script entry point: serve the FastAPI app directly (no reloader).
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8765)