#!/usr/bin/env python3 """ Soprano + RVC Streaming API Integrated pipeline: Soprano TTS → RVC Voice Conversion → HTTP Stream Closely follows gui_v1.py logic for buffer management and processing """ import os import sys import json import struct import asyncio import threading import time import uuid import logging from queue import Queue, Empty from typing import Optional, Dict from dataclasses import dataclass # Apply torch.load patch for PyTorch 2.5+ compatibility with fairseq models try: import rvc_torch_patch except ImportError: pass # Patch not available (bare metal setup) # Add soprano and RVC to path sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'soprano')) sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'Retrieval-based-Voice-Conversion-WebUI')) import numpy as np import torch import torch.nn.functional as F import torchaudio.transforms as tat import zmq # For Soprano server communication from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi.responses import StreamingResponse from pydantic import BaseModel # Soprano runs in separate process, don't import it here # from soprano import SopranoTTS from infer.lib import rtrvc as rvc_for_realtime from configs.config import Config as RVCConfig from tools.torchgate import TorchGate from multiprocessing import Queue as MPQueue, cpu_count # Setup logging logging.basicConfig( level=logging.DEBUG, # Enable DEBUG to see RVC internal errors format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) app = FastAPI(title="Soprano + RVC Streaming API") # Add CORS middleware to allow WebSocket connections from any origin from fastapi.middleware.cors import CORSMiddleware app.add_middleware( CORSMiddleware, allow_origins=["*"], # Allow all origins allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Global state pipeline: Optional['SopranoRVCPipeline'] = None pipeline_ready = False # Flag to indicate warmup complete 
@dataclass
class PipelineConfig:
    """Configuration matching gui_v1.py parameters

    startup_event() builds an instance with PipelineConfig(**config_dict)
    from soprano_rvc_config.json, so every key in that file must be a field
    declared here; pth_path and index_path have no default and are required.
    """
    pth_path: str
    index_path: str
    pitch: int = 0
    formant: float = 0.0
    index_rate: float = 0.3
    rms_mix_rate: float = 0.25
    block_time: float = 0.20  # 0.20s blocks = 9600 samples, reduces CPU overhead
    crossfade_time: float = 0.05  # Proportional to block_time
    extra_time: float = 1.8  # Reduced from 2.5s (minimum safe value per ChatGPT)
    n_cpu: int = 4
    f0method: str = "rmvpe"  # CPU-bound but stable; GPU methods need kernel compilation
    soprano_sample_rate: int = 32000
    rvc_sample_rate: int = 48000
    use_pv: bool = False
    I_noise_reduce: bool = False
    O_noise_reduce: bool = False


class TextRequest(BaseModel):
    # Request body shared by the POST /api/speak* and /api/debug_pre_rvc endpoints.
    text: str


def write_wav_header(sample_rate=48000, channels=1, bits_per_sample=16):
    """Generate WAV header bytes"""
    byte_rate = sample_rate * channels * bits_per_sample // 8
    block_align = channels * bits_per_sample // 8

    header = b'RIFF'
    # NOTE(review): SOURCE is damaged from the next statement onwards. The text
    # jumps from inside the struct.pack() format string straight to the return
    # annotation of what appears to be SopranoRVCPipeline.process_soprano_chunk.
    # The rest of write_wav_header, the class header and everything up to that
    # method's signature are missing and must be restored from version control.
    # The damaged line is reproduced exactly as found.
    header += struct.pack(' torch.Tensor:
        """
        Process a Soprano output chunk through the buffer pipeline
        Soprano outputs at 32kHz mono; the chunk is resampled to 48kHz and 16kHz

        Returns a (chunk_48k, chunk_16k) tuple, not a single tensor.
        NOTE(review): the visible return annotation says torch.Tensor — the
        annotation should be a tuple type once the signature is restored.
        """
        # Convert to tensor if needed
        if not torch.is_tensor(soprano_chunk):
            soprano_chunk = torch.from_numpy(soprano_chunk)

        # Ensure chunk is on the correct device
        soprano_chunk = soprano_chunk.to(self.device)

        # Dual resampling strategy: resample from 32kHz to both 48kHz and 16kHz in parallel
        # This avoids the 48kHz→16kHz step while keeping both buffers we need:
        # - 48kHz for RMS mixing and final output buffers
        # - 16kHz for RVC inference
        # The two unsqueeze(0) calls add two leading dimensions for the
        # resampler; [0, 0] strips them again.
        chunk_48k = self.resampler_soprano_to_48k(soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]
        chunk_16k = self.resampler_soprano_to_16k(soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]

        # Return both for parallel buffer management
        return chunk_48k, chunk_16k

    def accumulate_and_process_block(self, chunk_48k: torch.Tensor, chunk_16k: torch.Tensor) -> bool:
        """
        Accumulate samples in both 48kHz and 16kHz domains and slide buffers when we have enough
        Returns True if we
accumulated a full block and should process through RVC Args: chunk_48k: Audio chunk at 48kHz (for RMS mixing buffer) chunk_16k: Audio chunk at 16kHz (for RVC inference buffer) """ # Accumulation buffers for both sample rates if not hasattr(self, 'accumulation_buffer_48k'): self.accumulation_buffer_48k = torch.tensor([], device=self.device, dtype=torch.float32) if not hasattr(self, 'accumulation_buffer_16k'): self.accumulation_buffer_16k = torch.tensor([], device=self.device, dtype=torch.float32) self.accumulation_buffer_48k = torch.cat([self.accumulation_buffer_48k, chunk_48k]) self.accumulation_buffer_16k = torch.cat([self.accumulation_buffer_16k, chunk_16k]) # Check if we have accumulated enough samples for a block (at 48kHz rate) # We need block_frame samples (12000 at 48kHz = 0.25s) if self.accumulation_buffer_48k.shape[0] >= self.block_frame: # Take exactly block_frame samples from 48kHz buffer chunk_size = self.block_frame block_data_48k = self.accumulation_buffer_48k[:chunk_size] self.accumulation_buffer_48k = self.accumulation_buffer_48k[chunk_size:] # Take corresponding samples from 16kHz buffer (same time duration) block_data_16k = self.accumulation_buffer_16k[:self.block_frame_16k] self.accumulation_buffer_16k = self.accumulation_buffer_16k[self.block_frame_16k:] # Slide the 48kHz input buffer (for RMS mixing) self.input_wav[:-chunk_size] = self.input_wav[chunk_size:].clone() self.input_wav[-chunk_size:] = block_data_48k # Slide the 16kHz RVC input buffer (for inference) self.input_wav_res[:-self.block_frame_16k] = self.input_wav_res[self.block_frame_16k:].clone() self.input_wav_res[-self.block_frame_16k:] = block_data_16k return True # Ready to process through RVC return False # Need more samples def flush_buffers(self) -> list: """ Flush any remaining audio in accumulation buffers by padding to block size. Returns list of RVC output chunks for any remaining audio. Call this at the end of synthesis to ensure no audio is cut off. 
""" output_chunks = [] if not hasattr(self, 'accumulation_buffer_48k') or not hasattr(self, 'accumulation_buffer_16k'): return output_chunks # Check if there's remaining audio in the buffers remaining_48k = self.accumulation_buffer_48k.shape[0] remaining_16k = self.accumulation_buffer_16k.shape[0] if remaining_48k > 0: logger.debug(f"[Flush] Processing {remaining_48k} remaining samples @ 48kHz") # Pad to block size with silence pad_size_48k = self.block_frame - remaining_48k pad_size_16k = self.block_frame_16k - remaining_16k if pad_size_48k > 0: padding_48k = torch.zeros(pad_size_48k, device=self.device, dtype=torch.float32) padding_16k = torch.zeros(pad_size_16k, device=self.device, dtype=torch.float32) chunk_48k = torch.cat([self.accumulation_buffer_48k, padding_48k]) chunk_16k = torch.cat([self.accumulation_buffer_16k, padding_16k]) else: chunk_48k = self.accumulation_buffer_48k[:self.block_frame] chunk_16k = self.accumulation_buffer_16k[:self.block_frame_16k] # Process the final block if self.accumulate_and_process_block(chunk_48k, chunk_16k): rvc_output = self.process_through_rvc() # Trim the padding from output (only keep the actual audio) # Calculate how many output samples correspond to input samples output_samples = int(remaining_48k) # Same rate (48kHz) if output_samples > 0 and output_samples < len(rvc_output): rvc_output = rvc_output[:output_samples] output_chunks.append(rvc_output) logger.debug(f"[Flush] Generated {len(rvc_output)} output samples") # Clear the buffers self.accumulation_buffer_48k = torch.tensor([], device=self.device, dtype=torch.float32) self.accumulation_buffer_16k = torch.tensor([], device=self.device, dtype=torch.float32) return output_chunks def process_through_rvc(self) -> Optional[torch.Tensor]: """ Process accumulated buffer through RVC Returns RVC output chunk ready for streaming """ # GPU state logging (for diagnostic purposes) if hasattr(self, '_log_gpu_state') and self._log_gpu_state: mem_before = 
torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0 reserved_before = torch.cuda.memory_reserved() / 1024**3 if torch.cuda.is_available() else 0 # Change to RVC directory for inference (needs to access assets/rmvpe/rmvpe.pt etc) current_dir = os.getcwd() os.chdir(self.rvc_base_dir) try: # RVC inference rvc_start = time.time() infer_wav = self.rvc.infer( self.input_wav_res, self.block_frame_16k, self.skip_head, self.return_length, self.config.f0method, ) rvc_duration = time.time() - rvc_start # Log RVC inference time if slow if rvc_duration > 0.200: # >200ms is slow logger.warning(f"[RVC] Slow inference: {rvc_duration*1000:.1f}ms") if hasattr(self, '_log_gpu_state') and self._log_gpu_state: mem_after = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0 reserved_after = torch.cuda.memory_reserved() / 1024**3 if torch.cuda.is_available() else 0 logger.warning(f"[RVC] GPU mem: {mem_before:.2f}GB -> {mem_after:.2f}GB (reserved: {reserved_before:.2f} -> {reserved_after:.2f})") finally: # Restore original directory os.chdir(current_dir) # Convert to tensor if needed if not torch.is_tensor(infer_wav): infer_wav = torch.from_numpy(infer_wav).to(self.device) # Resample RVC output to target SR if needed if self.resampler_rvc_to_target is not None: infer_wav = self.resampler_rvc_to_target(infer_wav) # Output noise reduction (optional) if self.config.O_noise_reduce: self.output_buffer[:-self.block_frame] = self.output_buffer[self.block_frame:].clone() self.output_buffer[-self.block_frame:] = infer_wav[-self.block_frame:] infer_wav = self.tg(infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)).squeeze(0) # Volume envelope mixing (RMS matching) if self.config.rms_mix_rate < 1: if self.config.I_noise_reduce: input_wav = self.input_wav_denoise[self.extra_frame:] else: input_wav = self.input_wav[self.extra_frame:] # Calculate RMS for input and output import librosa rms1 = librosa.feature.rms( 
y=input_wav[:infer_wav.shape[0]].cpu().numpy(), frame_length=4 * self.zc, hop_length=self.zc, ) rms1 = torch.from_numpy(rms1).to(self.device) rms1 = F.interpolate( rms1.unsqueeze(0), size=infer_wav.shape[0] + 1, mode='linear', align_corners=True, )[0, 0, :-1] rms2 = librosa.feature.rms( y=infer_wav[:].cpu().numpy(), frame_length=4 * self.zc, hop_length=self.zc, ) rms2 = torch.from_numpy(rms2).to(self.device) rms2 = F.interpolate( rms2.unsqueeze(0), size=infer_wav.shape[0] + 1, mode='linear', align_corners=True, )[0, 0, :-1] rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3) infer_wav *= torch.pow(rms1 / rms2, torch.tensor(1 - self.config.rms_mix_rate)) # SOLA algorithm for seamless crossfading (from gui_v1.py) conv_input = infer_wav[None, None, :self.sola_buffer_frame + self.sola_search_frame] cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) cor_den = torch.sqrt( F.conv1d( conv_input ** 2, torch.ones(1, 1, self.sola_buffer_frame, device=self.device) ) + 1e-8 ) sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) infer_wav = infer_wav[sola_offset:] # Apply crossfade if not self.config.use_pv: # Standard fade (phase vocoder disabled) infer_wav[:self.sola_buffer_frame] *= self.fade_in_window infer_wav[:self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window # Update SOLA buffer for next iteration self.sola_buffer[:] = infer_wav[self.block_frame:self.block_frame + self.sola_buffer_frame] # Return the block ready for output return infer_wav[:self.block_frame] def soprano_generator_worker(self): """ Thread 1: Continuously generates Soprano chunks and puts them in queue Runs in parallel with RVC processor - NO BLOCKING! 
""" logger.info("Soprano generator worker started") while self.running: try: # Get text job from queue try: job = self.text_queue.get(timeout=0.1) job_id = job['job_id'] text = job['text'] self.current_job_id = job_id logger.info(f"[Soprano] Processing job {job_id[:8]}: {text[:50]}...") start_time = time.time() # Clear the done flag self.soprano_done.clear() # Get audio from Soprano server (GTX 1660) via ZMQ soprano_audio = self._call_soprano_server(text) # Convert to tensor and split into chunks for queue soprano_audio_tensor = torch.from_numpy(soprano_audio).to(self.device).float() # Split into chunks of ~0.5s each for queue processing chunk_size = 16000 # 0.5s @ 32kHz num_chunks = (len(soprano_audio_tensor) + chunk_size - 1) // chunk_size chunk_count = 0 chunk_times = [] queue_wait_times = [] for i in range(num_chunks): chunk_count += 1 chunk_start = i * chunk_size chunk_end = min((i + 1) * chunk_size, len(soprano_audio_tensor)) soprano_chunk = soprano_audio_tensor[chunk_start:chunk_end].cpu().numpy() chunk_times.append(time.time()) # Put chunk in queue for RVC processor # Measure if this blocks (queue full) queue_put_start = time.time() self.soprano_chunk_queue.put({ 'job_id': job_id, 'chunk': soprano_chunk, 'chunk_num': chunk_count, 'timestamp': time.time() }) queue_wait = time.time() - queue_put_start queue_wait_times.append(queue_wait) if queue_wait > 0.01: # Log if queue put took >10ms (blocking) logger.warning(f"[Soprano] Chunk {chunk_count} BLOCKED {queue_wait*1000:.1f}ms on queue.put() (queue full!)") logger.debug(f"[Soprano] Queued chunk {chunk_count} for RVC (queue wait: {queue_wait*1000:.1f}ms)") # Signal that Soprano is done generating for this job self.soprano_done.set() elapsed = time.time() - start_time total_queue_wait = sum(queue_wait_times) max_queue_wait = max(queue_wait_times) if queue_wait_times else 0 logger.info(f"[Soprano] Job {job_id[:8]} complete: {chunk_count} chunks in {elapsed:.2f}s") logger.info(f"[Soprano] Total queue wait: 
{total_queue_wait:.2f}s ({total_queue_wait/elapsed*100:.1f}% of time)") logger.info(f"[Soprano] Max queue wait: {max_queue_wait*1000:.1f}ms") if total_queue_wait > 1.0: logger.warning(f"[Soprano] ⚠️ BOTTLENECK: Spent {total_queue_wait:.2f}s blocked on queue! RVC too slow to drain.") except Empty: continue except Exception as e: logger.error(f"[Soprano] Error: {e}", exc_info=True) self.soprano_done.set() def rvc_processor_worker(self): """ Thread 2: Continuously pulls Soprano chunks from queue and processes through RVC Runs in parallel with Soprano generator - NO BLOCKING! """ logger.info("RVC processor worker started") # Timing diagnostics job_start_time = None current_job_id = None block_times = [] processing_times = [] while self.running: try: # Get Soprano chunk from queue (with timeout to check running flag) try: item = self.soprano_chunk_queue.get(timeout=0.1) job_id = item['job_id'] soprano_chunk = item['chunk'] chunk_num = item['chunk_num'] chunk_timestamp = item['timestamp'] # Track job timing if current_job_id != job_id: # New job started if current_job_id is not None and len(block_times) > 0: # Log previous job stats self._log_job_stats(current_job_id, job_start_time, block_times, processing_times) # GPU state logging between jobs if self._log_gpu_state and torch.cuda.is_available(): torch.cuda.synchronize() mem_alloc = torch.cuda.memory_allocated() / 1024**3 mem_reserved = torch.cuda.memory_reserved() / 1024**3 logger.info(f"[GPU] Between jobs: {mem_alloc:.2f}GB allocated, {mem_reserved:.2f}GB reserved") current_job_id = job_id job_start_time = time.time() block_times = [] processing_times = [] logger.info(f"[RVC] Starting job {job_id[:8]}") # GPU state at job start if self._log_gpu_state and torch.cuda.is_available(): mem_alloc = torch.cuda.memory_allocated() / 1024**3 mem_reserved = torch.cuda.memory_reserved() / 1024**3 logger.info(f"[GPU] Job start: {mem_alloc:.2f}GB allocated, {mem_reserved:.2f}GB reserved") queue_wait = time.time() - 
chunk_timestamp logger.debug(f"[RVC] Processing chunk {chunk_num} (queue wait: {queue_wait*1000:.1f}ms)") # Process and accumulate chunks (dual resampling: 48kHz + 16kHz) chunk_48k, chunk_16k = self.process_soprano_chunk(soprano_chunk) # IMPORTANT: Drain accumulation buffer - one Soprano chunk may fill multiple blocks! blocks_in_this_chunk = 0 while self.accumulate_and_process_block(chunk_48k, chunk_16k): blocks_in_this_chunk += 1 block_start = time.time() # Process through RVC now that we have a full block try: processing_start = time.time() rvc_output = self.process_through_rvc() processing_duration = time.time() - processing_start processing_times.append(processing_duration) # Convert to PCM and broadcast pcm_data = (rvc_output.cpu().numpy() * 32767).clip(-32768, 32767).astype('int16').tobytes() # Broadcast to all clients with self.client_queues_lock: dead_clients = [] for client_id, queue in list(self.client_queues.items()): try: queue.put(pcm_data, timeout=0.5) except Exception as e: logger.debug(f"Client {client_id[:8]} queue put failed: {e}") if queue.full(): dead_clients.append(client_id) for client_id in dead_clients: del self.client_queues[client_id] logger.warning(f"Client {client_id[:8]} removed (queue full)") block_times.append(time.time() - block_start) logger.debug(f"[RVC] Block processed in {processing_duration*1000:.1f}ms") except Exception as e: logger.error(f"[RVC] Processing error: {e}", exc_info=True) # After first block, give empty tensors for next iteration chunk_48k = torch.tensor([], device=self.device, dtype=torch.float32) chunk_16k = torch.tensor([], device=self.device, dtype=torch.float32) logger.debug(f"[RVC] Chunk {chunk_num} produced {blocks_in_this_chunk} blocks") except Empty: # Check if Soprano is done and queue is empty if self.soprano_done.is_set() and self.soprano_chunk_queue.empty(): if current_job_id is not None and len(block_times) > 0: # Log final job stats self._log_job_stats(current_job_id, job_start_time, block_times, 
processing_times)
                            # NOTE(review): nesting inferred — SOURCE indentation is
                            # lost; the reset is assumed to follow the stats log.
                            current_job_id = None
                    continue
            except Exception as e:
                logger.error(f"[RVC] Error: {e}", exc_info=True)

    def _log_job_stats(self, job_id, start_time, block_times, processing_times):
        """Log timing statistics for completed job

        Audio duration is estimated as len(block_times) * config.block_time.
        The same figures are appended to /tmp/soprano_timing_debug.txt.
        """
        elapsed = time.time() - start_time
        total_audio_duration = len(block_times) * self.config.block_time
        realtime_factor = total_audio_duration / elapsed if elapsed > 0 else 0
        avg_processing = sum(processing_times) / len(processing_times) if processing_times else 0

        # NOTE(review): this banner was presumably a multi-line string; its line
        # breaks are not recoverable from SOURCE, so it is kept as found.
        logger.info(f""" ====== ASYNC PIPELINE PERFORMANCE ====== Job: {job_id[:8]} Total elapsed: {elapsed:.2f}s Blocks processed: {len(block_times)} Audio duration: {total_audio_duration:.2f}s Realtime factor: {realtime_factor:.2f}x Avg RVC processing: {avg_processing*1000:.1f}ms per block ======================================== """)

        # Write to file for easy access
        with open('/tmp/soprano_timing_debug.txt', 'a') as f:
            f.write(f"\n=== ASYNC PIPELINE JOB {job_id[:8]} ===\n")
            f.write(f"Blocks: {len(block_times)}, Elapsed: {elapsed:.2f}s\n")
            f.write(f"Audio: {total_audio_duration:.2f}s, Realtime: {realtime_factor:.2f}x\n")
            f.write(f"Avg RVC: {avg_processing*1000:.1f}ms\n")
            f.write(f"==================\n\n")

    def broadcast_worker(self):
        """
        DEPRECATED: Old synchronous implementation
        Now using soprano_generator_worker() + rvc_processor_worker() for parallel processing
        Kept for reference but starts the new async workers instead

        Despite the label this is still the entry point used by start(): it
        sets self.running, launches both worker threads and then idles until
        self.running is cleared.
        """
        logger.info("Starting async pipeline workers...")
        # NOTE(review): the flag is raised inside this thread, so a stop()
        # issued right after start() can be overwritten — confirm acceptable.
        self.running = True

        # Start both workers in parallel
        soprano_thread = threading.Thread(target=self.soprano_generator_worker, daemon=True, name="Soprano-Generator")
        rvc_thread = threading.Thread(target=self.rvc_processor_worker, daemon=True, name="RVC-Processor")

        soprano_thread.start()
        rvc_thread.start()

        logger.info("✓ Async pipeline workers started (Soprano + RVC running in parallel)")

        # Keep this thread alive until stop() clears the flag
        while self.running:
            time.sleep(0.1)

        logger.info("Broadcast worker stopped")

    def start(self):
        """Start the broadcast worker thread"""
        worker = threading.Thread(target=self.broadcast_worker, daemon=True)
        worker.start()
        logger.info("Pipeline started")

    def stop(self):
        """Stop the pipeline"""
        # The worker loops poll this flag; nothing is joined here.
        self.running = False
        logger.info("Pipeline stopped")


@app.on_event("startup")
async def startup_event():
    """Initialize pipeline on startup

    Loads soprano_rvc_config.json from this file's directory, builds and
    starts the pipeline, then runs one warmup synthesis.

    Raises:
        FileNotFoundError: if the configuration file does not exist.
    """
    global pipeline, pipeline_ready

    logger.info("="*60)
    logger.info("Soprano + RVC Streaming API")
    logger.info("="*60)

    # Load configuration
    config_path = os.path.join(os.path.dirname(__file__), "soprano_rvc_config.json")
    if not os.path.exists(config_path):
        logger.error(f"Configuration file not found: {config_path}")
        raise FileNotFoundError(f"Config file missing: {config_path}")

    with open(config_path, 'r') as f:
        config_dict = json.load(f)

    # Unknown keys in the JSON make this raise TypeError
    config = PipelineConfig(**config_dict)

    logger.info("Initializing Soprano + RVC pipeline...")
    logger.info("This may take 15-20 seconds...")

    # Initialize pipeline (blocks until models loaded)
    pipeline = SopranoRVCPipeline(config)
    pipeline.start()

    # This banner is logged before the warmup below has run
    logger.info("="*60)
    logger.info("✓ Pipeline ready! API accepting requests on port 8765")
    logger.info("="*60)

    # Warmup: Generate a test phrase to initialize all components
    logger.info("🔥 Warming up pipeline with test generation...")
    try:
        # Text sized to generate ~1-2 seconds of audio (roughly 16 blocks @ 0.2s each = 3.2s)
        warmup_text = "Hello, I'm Miku. Voice system initialized and ready."
        warmup_audio = pipeline._call_soprano_server(warmup_text)
        warmup_tensor = torch.from_numpy(warmup_audio).to(pipeline.device).float()

        # Process all chunks through the full pipeline to ensure everything is warmed up
        chunk_size = 3200  # 0.1s @ 32kHz
        blocks_generated = 0

        for i in range(0, len(warmup_tensor), chunk_size):
            chunk = warmup_tensor[i:min(i + chunk_size, len(warmup_tensor))].cpu().numpy()
            chunk_48k, chunk_16k = pipeline.process_soprano_chunk(chunk)

            # Keep draining blocks
            while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k):
                _ = pipeline.process_through_rvc()
                blocks_generated += 1
                # After first block, subsequent chunks are empty
                chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
                chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32)

        # Flush any remaining audio
        flush_chunks = pipeline.flush_buffers()
        blocks_generated += len(flush_chunks)

        if blocks_generated > 0:
            logger.info(f"✅ Warmup complete! Generated {blocks_generated} audio blocks. Pipeline is hot and ready.")
        else:
            logger.warning("⚠ Warmup didn't generate blocks, but pipeline initialized.")

        pipeline_ready = True
    except Exception as e:
        logger.error(f"❌ Warmup failed: {e}", exc_info=True)
        logger.warning("Pipeline will still accept requests, but first generation may be slow.")
        pipeline_ready = True  # Allow connections anyway


@app.on_event("shutdown")
async def shutdown_event():
    """Cleanup on shutdown"""
    global pipeline
    if pipeline:
        pipeline.stop()
    logger.info("Pipeline shutdown complete")


@app.post("/api/speak")
async def speak(request: TextRequest):
    """Queue text for synthesis

    Returns immediately with the generated job_id; the audio itself is
    delivered through the client queues fed by the worker threads.
    """
    if pipeline is None:
        return {"error": "Pipeline not initialized"}

    job_id = str(uuid.uuid4())
    pipeline.text_queue.put({'job_id': job_id, 'text': request.text})

    return {
        "status": "queued",
        "message": "Text queued for synthesis",
        "job_id": job_id,
        "queue_size": pipeline.text_queue.qsize()
    }


@app.post("/api/speak_to_file")
async def speak_to_file(request: TextRequest):
    """Synthesize text and save directly to WAV file (for testing)

    NOTE(review): this reads pipeline.soprano, but the module header says
    Soprano runs in a separate process and is not imported here — confirm the
    attribute exists. The endpoint also drives the same pipeline buffers the
    worker threads use.
    """
    if pipeline is None:
        return {"error": "Pipeline not initialized"}

    import wave
    import tempfile
    import os

    job_id = str(uuid.uuid4())
    output_path = f"/tmp/soprano_rvc_{job_id[:8]}.wav"

    try:
        logger.info(f"File synthesis job {job_id[:8]}: {request.text[:50]}...")

        # Collect all audio chunks
        audio_chunks = []
        stream = pipeline.soprano.infer_stream(request.text, chunk_size=10)

        chunk_count = 0
        soprano_chunk_count = 0
        for soprano_chunk in stream:
            soprano_chunk_count += 1
            # Debug logging (can be commented out for production)
            # logger.info(f" Soprano chunk {soprano_chunk_count}: shape={soprano_chunk.shape if hasattr(soprano_chunk, 'shape') else len(soprano_chunk)}")

            # Process and accumulate chunks (dual resampling)
            chunk_48k, chunk_16k = pipeline.process_soprano_chunk(soprano_chunk)
            # logger.info(f" After dual resample: 48k={chunk_48k.shape}, 16k={chunk_16k.shape}")

            # IMPORTANT: Drain the accumulation buffer - one Soprano chunk may fill multiple blocks!
# Keep processing blocks until we don't have enough samples left
            while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k):
                chunk_count += 1
                # logger.info(f" Block {chunk_count} ready - processing through RVC")
                # logger.info(f" input_wav buffer: shape={pipeline.input_wav.shape}, mean={pipeline.input_wav.abs().mean():.6f}")

                # Process through RVC
                rvc_output = pipeline.process_through_rvc()
                # logger.info(f" RVC output: shape={rvc_output.shape}, mean={rvc_output.abs().mean():.6f}, max={rvc_output.abs().max():.6f}")

                audio_chunks.append(rvc_output.cpu().numpy())

                # After first block, both chunks should be empty tensors for subsequent iterations
                chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
                chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32)

        # CRITICAL: Flush any remaining audio in buffers
        flush_chunks = pipeline.flush_buffers()
        for rvc_output in flush_chunks:
            audio_chunks.append(rvc_output.cpu().numpy())

        # Concatenate all chunks
        if audio_chunks:
            import numpy as np
            full_audio = np.concatenate(audio_chunks)

            # Write to WAV file
            with wave.open(output_path, 'wb') as wav_file:
                wav_file.setnchannels(1)  # Mono
                wav_file.setsampwidth(2)  # 16-bit
                wav_file.setframerate(pipeline.config.rvc_sample_rate)

                # Convert to int16 PCM
                pcm_data = (full_audio * 32767).clip(-32768, 32767).astype('int16')
                wav_file.writeframes(pcm_data.tobytes())

            logger.info(f"File synthesis complete: {output_path} ({chunk_count} chunks, {len(full_audio)/pipeline.config.rvc_sample_rate:.2f}s)")

            return {
                "status": "complete",
                "output_file": output_path,
                "chunks": chunk_count,
                "duration_seconds": len(full_audio) / pipeline.config.rvc_sample_rate
            }
        else:
            return {"error": "No audio generated"}

    except Exception as e:
        # Errors are reported in the response body; nothing is re-raised
        logger.error(f"File synthesis error: {e}", exc_info=True)
        return {"error": str(e)}


@app.post("/api/speak_soprano_only")
async def speak_soprano_only(request: TextRequest):
    """Synthesize with Soprano only (no RVC) for comparison

    Writes /tmp/soprano_only_<job>.wav at config.soprano_sample_rate and
    returns its path and duration.
    NOTE(review): relies on pipeline.soprano — confirm that attribute exists.
    """
    if pipeline is None:
        return {"error": "Pipeline not initialized"}

    import wave
    import numpy as np

    job_id = str(uuid.uuid4())
    output_path = f"/tmp/soprano_only_{job_id[:8]}.wav"

    try:
        logger.info(f"Soprano-only synthesis: {request.text[:50]}...")

        # Generate with Soprano
        audio = pipeline.soprano.infer(request.text)

        # Write to WAV file at Soprano's native sample rate
        with wave.open(output_path, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(pipeline.config.soprano_sample_rate)  # 32kHz

            # Convert to int16 PCM
            if torch.is_tensor(audio):
                audio = audio.cpu().numpy()
            pcm_data = (audio * 32767).clip(-32768, 32767).astype('int16')
            wav_file.writeframes(pcm_data.tobytes())

        logger.info(f"Soprano-only complete: {output_path}")

        return {
            "status": "complete",
            "output_file": output_path,
            "sample_rate": pipeline.config.soprano_sample_rate,
            "duration_seconds": len(audio) / pipeline.config.soprano_sample_rate
        }

    except Exception as e:
        logger.error(f"Soprano-only synthesis error: {e}", exc_info=True)
        return {"error": str(e)}


@app.post("/api/debug_pre_rvc")
async def debug_pre_rvc(request: TextRequest):
    """Debug endpoint: Save reconstructed audio BEFORE RVC processing

    Writes two WAV files under /tmp: the 48kHz input buffer contents and the
    16kHz signal handed to RVC, one block per processed block.
    NOTE(review): relies on pipeline.soprano — confirm that attribute exists.
    """
    if pipeline is None:
        return {"error": "Pipeline not initialized"}

    import wave
    import numpy as np

    job_id = str(uuid.uuid4())
    output_48k = f"/tmp/pre_rvc_48k_{job_id[:8]}.wav"
    output_16k = f"/tmp/pre_rvc_16k_{job_id[:8]}.wav"

    try:
        logger.info(f"Pre-RVC debug: {request.text[:50]}...")

        # Collect all input_wav states (what goes INTO RVC)
        input_wav_chunks_48k = []
        input_wav_chunks_16k = []

        stream = pipeline.soprano.infer_stream(request.text, chunk_size=10)

        chunk_count = 0
        for soprano_chunk in stream:
            chunk_48k, chunk_16k = pipeline.process_soprano_chunk(soprano_chunk)

            while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k):
                chunk_count += 1

                # SAVE the input_wav buffer BEFORE RVC processes it
                # This is the reconstructed audio from accumulated Soprano chunks
                # Take the relevant portion (last
# block_frame + extra)
                # NOTE(review): this slice starts at extra_frame, while new blocks
                # are written at the END of input_wav; unless the buffer is exactly
                # extra_frame + block_frame long this is not the newest block —
                # confirm against the buffer sizes set up in __init__.
                pre_rvc_48k = pipeline.input_wav[pipeline.extra_frame:pipeline.extra_frame + pipeline.block_frame].clone()

                # For 16kHz, just take the last block worth of samples from input_wav_res
                pre_rvc_16k = pipeline.input_wav_res[-pipeline.block_frame_16k:].clone()

                input_wav_chunks_48k.append(pre_rvc_48k.cpu().numpy())
                input_wav_chunks_16k.append(pre_rvc_16k.cpu().numpy())

                # Still process through RVC for timing
                # (the result is discarded, but the SOLA state is updated)
                pipeline.process_through_rvc()

                chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
                chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32)

        # Save 48kHz version (reconstructed audio before RVC)
        if input_wav_chunks_48k:
            full_audio_48k = np.concatenate(input_wav_chunks_48k)
            with wave.open(output_48k, 'wb') as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)
                wav_file.setframerate(pipeline.config.rvc_sample_rate)
                pcm_data = (full_audio_48k * 32767).clip(-32768, 32767).astype('int16')
                wav_file.writeframes(pcm_data.tobytes())

        # Save 16kHz version (what RVC actually receives)
        if input_wav_chunks_16k:
            full_audio_16k = np.concatenate(input_wav_chunks_16k)
            with wave.open(output_16k, 'wb') as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)
                wav_file.setframerate(16000)  # RVC input sample rate
                pcm_data = (full_audio_16k * 32767).clip(-32768, 32767).astype('int16')
                wav_file.writeframes(pcm_data.tobytes())

        # The paths are reported even if no block was produced and no file written
        logger.info(f"Pre-RVC debug complete: {output_48k}, {output_16k}")

        return {
            "status": "complete",
            "output_48k": output_48k,
            "output_16k": output_16k,
            "chunks": chunk_count,
            "info": "48k is reconstructed audio, 16k is what RVC receives"
        }

    except Exception as e:
        logger.error(f"Pre-RVC debug error: {e}", exc_info=True)
        return {"error": str(e)}


@app.get("/api/stream/continuous")
async def stream_continuous():
    """Continuous audio stream endpoint

    Registers a per-client queue that the RVC worker fills with PCM blocks
    and streams it as an endless WAV response.
    """
    if pipeline is None:
        return {"error": "Pipeline not initialized"}

    client_id = str(uuid.uuid4())

    # Smaller queue for tighter flow control (20s buffer = 80 blocks
# at 0.25s each)
    # NOTE(review): PipelineConfig.block_time defaults to 0.20s, so 80 blocks
    # is 16s at the default; the figures above look stale.
    with pipeline.client_queues_lock:
        pipeline.client_queues[client_id] = Queue(maxsize=80)

    logger.info(f"Client {client_id[:8]} connected")

    async def generate():
        try:
            # Send WAV header
            yield write_wav_header(
                sample_rate=pipeline.config.rvc_sample_rate,
                channels=1,
                bits_per_sample=16
            )

            # Wait for initial buffer (help VLC establish timing)
            # Wait for at least 1-2 blocks before starting stream
            initial_buffer_blocks = 2
            buffered_chunks = []

            while len(buffered_chunks) < initial_buffer_blocks:
                try:
                    # The blocking Queue.get runs in the default executor so
                    # the event loop stays responsive
                    chunk = await asyncio.get_event_loop().run_in_executor(
                        None,
                        lambda: pipeline.client_queues[client_id].get(timeout=5.0)
                    )
                    buffered_chunks.append(chunk)
                except Exception:
                    # Timeout waiting for initial buffer - start anyway
                    break

            # Send initial buffer
            for chunk in buffered_chunks:
                yield chunk

            # Stream remaining audio chunks
            while True:
                try:
                    chunk = await asyncio.get_event_loop().run_in_executor(
                        None,
                        lambda: pipeline.client_queues[client_id].get(timeout=0.1)
                    )
                    yield chunk
                except Empty:
                    await asyncio.sleep(0.01)
                    continue
                except Exception as e:
                    # Includes the KeyError raised by the lambda once the RVC
                    # worker has removed this client's queue
                    logger.error(f"Client {client_id[:8]} stream error: {e}")
                    break

        finally:
            # Always unregister so the worker stops broadcasting to this client
            with pipeline.client_queues_lock:
                if client_id in pipeline.client_queues:
                    del pipeline.client_queues[client_id]
            logger.info(f"Client {client_id[:8]} disconnected")

    return StreamingResponse(
        generate(),
        media_type="audio/wav",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        }
    )


@app.get("/api/test_soprano_stream")
async def test_soprano_stream(text: str = "This is a test of raw Soprano TTS performance without RVC processing."):
    """Test Soprano TTS performance in isolation - streams raw Soprano output without RVC

    Timing results are logged and written to /tmp/soprano_only_timing.txt
    once the stream has been fully consumed.
    NOTE(review): relies on pipeline.soprano — confirm that attribute exists.
    """
    if pipeline is None:
        return {"error": "Pipeline not initialized"}

    import time
    import numpy as np

    logger.info(f"Testing Soprano-only streaming: {text[:50]}...")

    async def generate():
        # Send WAV header (Soprano outputs at 32kHz)
        yield write_wav_header(
            sample_rate=32000,
            channels=1,
            bits_per_sample=16
        )

        # Track timing
        start_time = time.time()
        chunk_count = 0
        total_samples = 0

        # Stream directly from Soprano
        # (a plain for-loop: synthesis runs on the event loop thread)
        stream = pipeline.soprano.infer_stream(text, chunk_size=10)

        for soprano_chunk in stream:
            chunk_count += 1

            # Convert to int16 PCM
            if not torch.is_tensor(soprano_chunk):
                soprano_chunk = torch.from_numpy(soprano_chunk)

            pcm_data = (soprano_chunk.cpu().numpy() * 32767).clip(-32768, 32767).astype('int16').tobytes()
            total_samples += len(soprano_chunk)

            yield pcm_data

        # Log performance
        elapsed = time.time() - start_time
        audio_duration = total_samples / 32000  # Soprano is 32kHz
        realtime_factor = audio_duration / elapsed if elapsed > 0 else 0

        logger.info(f"Soprano-only test complete:")
        logger.info(f" Chunks: {chunk_count}")
        logger.info(f" Elapsed: {elapsed:.2f}s")
        logger.info(f" Audio duration: {audio_duration:.2f}s")
        logger.info(f" Realtime factor: {realtime_factor:.2f}x")

        # Write to debug file (overwritten on every run)
        with open('/tmp/soprano_only_timing.txt', 'w') as f:
            f.write(f"Soprano-only test results:\n")
            f.write(f"Text: {text}\n")
            f.write(f"Chunks: {chunk_count}\n")
            f.write(f"Elapsed: {elapsed:.2f}s\n")
            f.write(f"Audio duration: {audio_duration:.2f}s\n")
            f.write(f"Realtime factor: {realtime_factor:.2f}x\n")

    return StreamingResponse(
        generate(),
        media_type="audio/wav",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        }
    )


@app.get("/api/test_rvc_only")
async def test_rvc_only(input_file: str = "/tmp/soprano_test_3.wav"):
    """Test RVC performance in isolation - processes pre-existing Soprano audio

    NOTE(review): input_file is a caller-supplied path that is opened as-is;
    restrict it if this endpoint is reachable by untrusted clients.
    """
    if pipeline is None:
        return {"error": "Pipeline not initialized"}

    import time
    import wave
    import numpy as np

    logger.info(f"Testing RVC-only performance with input: {input_file}")

    # Verify input file exists
    if not os.path.exists(input_file):
        return {"error": f"Input file not found: {input_file}"}

    async def generate():
        # Send WAV header (RVC outputs at 48kHz)
        yield write_wav_header(
            sample_rate=48000,
            channels=1,
            bits_per_sample=16
        )

        # Load the input audio file (Soprano output at 32kHz)
        # sample_rate is only logged and n_channels is unused: the samples
        # are always treated as 16-bit mono at the Soprano rate.
        with wave.open(input_file, 'rb') as wav:
            sample_rate = wav.getframerate()
            n_channels = wav.getnchannels()
            audio_data = wav.readframes(wav.getnframes())
            audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32767.0

        logger.info(f"Loaded audio: {len(audio_np)} samples at {sample_rate}Hz")

        # Convert to tensor and resample from 32kHz to 48kHz (input buffer) and 16kHz (RVC input)
        audio_tensor = torch.from_numpy(audio_np).to(pipeline.device)

        # Resample to 48kHz for input buffer (RMS mixing needs this)
        audio_48k = pipeline.resampler_soprano_to_48k(audio_tensor.unsqueeze(0).unsqueeze(0))[0, 0]

        # Resample to 16kHz for RVC inference
        audio_16k = pipeline.resampler_soprano_to_16k(audio_tensor.unsqueeze(0).unsqueeze(0))[0, 0]

        # Track timing
        start_time = time.time()
        block_count = 0
        rvc_times = []

        # Process through pipeline in blocks (simulating real streaming)
        chunk_48k = audio_48k
        chunk_16k = audio_16k

        while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k):
            block_count += 1

            # Process through RVC
            rvc_start = time.time()
            rvc_output = pipeline.process_through_rvc()
            rvc_duration = time.time() - rvc_start
            rvc_times.append(rvc_duration)

            # Convert to PCM and yield
            pcm_data = (rvc_output.cpu().numpy() * 32767).clip(-32768, 32767).astype('int16').tobytes()
            yield pcm_data

            # Give empty tensors for next iteration
            chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32)
            chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32)

        # Calculate stats
        elapsed = time.time() - start_time
        audio_duration = block_count * pipeline.config.block_time  # seconds of audio per block (config.block_time)
        realtime_factor = audio_duration / elapsed if elapsed > 0 else 0
        avg_rvc = sum(rvc_times) / len(rvc_times) if rvc_times else 0

        logger.info(f"RVC-only test complete:")
        logger.info(f" Blocks: {block_count}")
        logger.info(f" Elapsed: {elapsed:.2f}s")
        logger.info(f" Audio duration: {audio_duration:.2f}s")
        logger.info(f" Realtime factor: {realtime_factor:.2f}x")
        # NOTE(review): SOURCE ends in the middle of the next statement; it is
        # reproduced exactly as far as it is visible.
        logger.info(f" Avg RVC time: 
{avg_rvc*1000:.1f}ms") # Write to debug file with open('/tmp/rvc_only_timing.txt', 'w') as f: f.write(f"RVC-only test results:\n") f.write(f"Input file: {input_file}\n") f.write(f"Blocks: {block_count}\n") f.write(f"Elapsed: {elapsed:.2f}s\n") f.write(f"Audio duration: {audio_duration:.2f}s\n") f.write(f"Realtime factor: {realtime_factor:.2f}x\n") f.write(f"Avg RVC processing: {avg_rvc*1000:.1f}ms per block\n") f.write(f"Total RVC time: {sum(rvc_times):.2f}s ({sum(rvc_times)/elapsed*100:.1f}%)\n") return StreamingResponse( generate(), media_type="audio/wav", headers={ "Cache-Control": "no-cache", "Connection": "keep-alive", } ) @app.get("/api/status") async def status(): """Get API status""" if pipeline is None: return {"status": "initializing", "ready": False} with pipeline.client_queues_lock: num_clients = len(pipeline.client_queues) return { "status": "running", "ready": True, "queue_size": pipeline.text_queue.qsize(), "connected_clients": num_clients, "config": { "soprano_sr": pipeline.soprano_sr, "target_sr": pipeline.target_sr, "rvc_model_sr": pipeline.rvc_tgt_sr, "block_time": pipeline.config.block_time, "f0method": pipeline.config.f0method, } } @app.get("/health") async def health_check(): """Health check endpoint for Docker healthcheck""" if pipeline is None: return { "status": "unhealthy", "error": "Pipeline not initialized" }, 503 # Test Soprano server connectivity soprano_connected = False try: # Quick connectivity test - don't actually synthesize test_socket = pipeline.zmq_context.socket(zmq.REQ) test_socket.setsockopt(zmq.RCVTIMEO, 2000) # 2s timeout test_socket.connect(pipeline.soprano_socket.getsockopt(zmq.LAST_ENDPOINT).decode()) test_socket.close() soprano_connected = True except Exception as e: logger.warning(f"Soprano health check failed: {e}") soprano_connected = False return { "status": "healthy" if (soprano_connected and pipeline_ready) else "degraded", "soprano_connected": soprano_connected, "rvc_initialized": pipeline.rvc is not None, 
"pipeline_ready": pipeline is not None, "warmed_up": pipeline_ready } @app.post("/interrupt") async def interrupt_synthesis(): """ Interrupt current synthesis and flush all buffers. Used when user speaks over Miku to cancel ongoing TTS playback. Returns: {"status": "interrupted", "flushed": true} """ global pipeline if pipeline is None: return {"status": "error", "message": "Pipeline not initialized"}, 503 try: # Flush Soprano ZMQ socket (drain any pending audio chunks) flushed_chunks = 0 while pipeline.soprano_socket.poll(timeout=0): pipeline.soprano_socket.recv() flushed_chunks += 1 # Clear RVC audio buffer (stop processing queued audio) if hasattr(pipeline, 'rvc_audio_buffer'): buffer_size = len(pipeline.rvc_audio_buffer) pipeline.rvc_audio_buffer.clear() logger.info(f"Interrupted: Flushed {flushed_chunks} ZMQ chunks, cleared {buffer_size} RVC buffer samples") else: logger.info(f"Interrupted: Flushed {flushed_chunks} ZMQ chunks") return { "status": "interrupted", "flushed": True, "zmq_chunks_flushed": flushed_chunks } except Exception as e: logger.error(f"Interrupt failed: {e}", exc_info=True) return {"status": "error", "message": str(e)}, 500 @app.websocket("/ws/stream") async def websocket_stream(websocket: WebSocket): """ WebSocket endpoint for token-by-token streaming synthesis. Protocol: - Client sends: {"token": "text", "pitch_shift": 0, "flush": false} - Server sends: binary audio data (PCM float32, 48kHz mono) - Client sends: {"flush": true} to process remaining buffer This enables real-time TTS as tokens arrive from LLM streaming. """ await websocket.accept() logger.info("WebSocket client connected") # Wait for warmup if not ready yet if not pipeline_ready: logger.warning("[WS] Pipeline not warmed up yet, waiting...") await websocket.send_json({ "status": "warming_up", "message": "Pipeline warming up, please wait..." 
}) # Poll until ready (max 30 seconds) for _ in range(60): # 60 * 0.5s = 30s if pipeline_ready: logger.info("[WS] Pipeline ready, proceeding with connection") await websocket.send_json({ "status": "ready", "message": "Pipeline ready!" }) break await asyncio.sleep(0.5) else: logger.error("[WS] Pipeline warmup timeout!") await websocket.send_json({ "error": "Pipeline warmup timeout", "message": "Pipeline failed to initialize in time" }) await websocket.close() return # Buffer for accumulating tokens until sentence boundary text_buffer = "" config = { "pitch_shift": 0, "sample_rate": 48000 } try: while True: # Receive token from client logger.debug("[WS] Waiting for message...") data = await websocket.receive_json() logger.info(f"[WS] Received data: {data}") # Update config if provided if "pitch_shift" in data: config["pitch_shift"] = data["pitch_shift"] # Handle flush request (process whatever is in buffer) if data.get("flush", False): if text_buffer.strip(): logger.info(f"[WS] Flushing buffer: {text_buffer[:50]}...") await _synthesize_and_send(websocket, text_buffer, config) text_buffer = "" continue # Add token to buffer token = data.get("token", "") if not token: continue text_buffer += token logger.debug(f"[WS] Token received: '{token}' | Buffer: {len(text_buffer)} chars") # Check for sentence boundaries # Synthesize when we hit punctuation or buffer gets too long should_synthesize = False # Sentence-ending punctuation token_stripped = token.rstrip() if token_stripped and token_stripped[-1] in ['.', '!', '?', '。', '!', '?']: should_synthesize = True logger.info(f"[WS] Sentence boundary detected: {text_buffer[:50]}...") # Comma/pause (optional - creates more natural pauses) elif token_stripped and token_stripped[-1] in [',', ';', ',', '、']: should_synthesize = True logger.info(f"[WS] Pause boundary detected: {text_buffer[:50]}...") # Buffer too long (prevent memory issues) elif len(text_buffer) > 200: should_synthesize = True logger.info(f"[WS] Buffer limit 
reached: {text_buffer[:50]}...") # Synthesize if needed if should_synthesize and text_buffer.strip(): await _synthesize_and_send(websocket, text_buffer, config) text_buffer = "" except WebSocketDisconnect: logger.info("WebSocket client disconnected") # Don't try to send anything - client is already gone except Exception as e: logger.error(f"WebSocket error: {e}", exc_info=True) try: await websocket.close(code=1011, reason=str(e)) except: pass async def _synthesize_and_send(websocket: WebSocket, text: str, config: dict): """ Helper function to synthesize text and stream audio chunks via WebSocket. Sends raw PCM float32 audio data at 48kHz mono. """ if pipeline is None: await websocket.send_json({"error": "Pipeline not initialized"}) return # Check if WebSocket is still connected if websocket.client_state.value != 1: # 1 = CONNECTED logger.warning(f"[WS] Cannot send - WebSocket not connected (state: {websocket.client_state})") return try: # Performance tracking pipeline_start = time.time() rvc_times = [] total_blocks = 0 # Get audio from Soprano server via ZMQ soprano_start = time.time() soprano_audio = pipeline._call_soprano_server(text) soprano_time = time.time() - soprano_start # Convert to tensor soprano_audio_tensor = torch.from_numpy(soprano_audio).to(pipeline.device).float() # Split into chunks for streaming (0.1s chunks @ 32kHz) chunk_size = 3200 # 0.1s @ 32kHz num_chunks = (len(soprano_audio_tensor) + chunk_size - 1) // chunk_size for i in range(num_chunks): chunk_start = i * chunk_size chunk_end = min((i + 1) * chunk_size, len(soprano_audio_tensor)) soprano_chunk = soprano_audio_tensor[chunk_start:chunk_end].cpu().numpy() # Process and accumulate chunks (dual resampling) chunk_48k, chunk_16k = pipeline.process_soprano_chunk(soprano_chunk) # Drain the accumulation buffer - one Soprano chunk may fill multiple blocks! 
while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k): # Check connection state before sending if websocket.client_state.value != 1: logger.warning("[WS] Client disconnected during synthesis, aborting") return # Process through RVC rvc_start = time.time() rvc_output = pipeline.process_through_rvc() rvc_time = time.time() - rvc_start rvc_times.append(rvc_time) total_blocks += 1 # Convert to bytes and send # Format: float32 PCM, 48kHz, mono audio_bytes = rvc_output.cpu().numpy().astype(np.float32).tobytes() await websocket.send_bytes(audio_bytes) logger.debug(f"[WS] Sent audio chunk: {len(audio_bytes)} bytes ({len(rvc_output)} samples)") # After first block, both chunks should be empty tensors chunk_48k = torch.tensor([], device=pipeline.device, dtype=torch.float32) chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32) # CRITICAL: Flush any remaining audio in buffers # Check connection one more time before flushing if websocket.client_state.value != 1: logger.warning("[WS] Client disconnected before flush, skipping") return flush_chunks = pipeline.flush_buffers() for rvc_output in flush_chunks: # Check connection state before each flush chunk if websocket.client_state.value != 1: logger.warning("[WS] Client disconnected during flush, aborting") return # Time the flush processing rvc_start = time.time() audio_bytes = rvc_output.cpu().numpy().astype(np.float32).tobytes() rvc_time = time.time() - rvc_start rvc_times.append(rvc_time) total_blocks += 1 await websocket.send_bytes(audio_bytes) logger.debug(f"[WS] Sent final flush chunk: {len(audio_bytes)} bytes") # Calculate performance metrics pipeline_time = time.time() - pipeline_start audio_duration = len(soprano_audio) / pipeline.soprano_sr # Duration in seconds realtime_factor = audio_duration / pipeline_time if pipeline_time > 0 else 0 avg_rvc_time = sum(rvc_times) / len(rvc_times) if rvc_times else 0 # Log performance summary logger.info(f""" [WS] Job complete: {total_blocks} blocks 
in {pipeline_time:.2f}s Audio duration: {audio_duration:.2f}s Realtime factor: {realtime_factor:.2f}x Avg Soprano: {soprano_time*1000:.1f}ms Avg RVC: {avg_rvc_time*1000:.1f}ms Text: '{text[:50]}...'""") except RuntimeError as e: # Handle "Unexpected ASGI message" when client disconnects during send if "Unexpected ASGI message" in str(e) or "websocket.close" in str(e): logger.info(f"[WS] Client disconnected during synthesis: {e}") else: logger.error(f"RuntimeError during synthesis: {e}", exc_info=True) except Exception as e: logger.error(f"Synthesis error: {e}", exc_info=True) await websocket.send_json({"error": str(e)}) @app.get("/stream/wav") async def stream_wav(text: str, pitch_shift: int = 0): """ HTTP streaming endpoint that serves audio as WAV (playable in VLC/browsers). Usage: vlc http://localhost:8765/stream/wav?text=Hello%20world! curl http://localhost:8765/stream/wav?text=Test > output.wav Returns: WAV file (PCM s16le, 48kHz, mono) """ if pipeline is None: return {"error": "Pipeline not initialized"} import io import wave async def generate_wav(): """Generate WAV file on the fly as audio chunks are synthesized""" # Create in-memory WAV file buffer = io.BytesIO() # Get audio from Soprano + RVC soprano_audio = pipeline._call_soprano_server(text) soprano_audio_tensor = torch.from_numpy(soprano_audio).to(pipeline.device).float() # Collect all audio chunks audio_chunks = [] # Process through pipeline chunk_size = 3200 # 0.1s @ 32kHz num_chunks = (len(soprano_audio_tensor) + chunk_size - 1) // chunk_size for i in range(num_chunks): chunk_start = i * chunk_size chunk_end = min((i + 1) * chunk_size, len(soprano_audio_tensor)) soprano_chunk = soprano_audio_tensor[chunk_start:chunk_end].cpu().numpy() chunk_48k, chunk_16k = pipeline.process_soprano_chunk(soprano_chunk) while pipeline.accumulate_and_process_block(chunk_48k, chunk_16k): rvc_output = pipeline.process_through_rvc() audio_chunks.append(rvc_output.cpu().numpy()) chunk_48k = torch.tensor([], 
device=pipeline.device, dtype=torch.float32) chunk_16k = torch.tensor([], device=pipeline.device, dtype=torch.float32) # CRITICAL: Flush any remaining audio in buffers flush_chunks = pipeline.flush_buffers() for rvc_output in flush_chunks: audio_chunks.append(rvc_output.cpu().numpy()) # Concatenate all audio if not audio_chunks: yield b'' return full_audio = np.concatenate(audio_chunks) # Convert float32 to int16 PCM audio_int16 = (full_audio * 32767).clip(-32768, 32767).astype(np.int16) # Write WAV header and data with wave.open(buffer, 'wb') as wav: wav.setnchannels(1) # Mono wav.setsampwidth(2) # 16-bit wav.setframerate(48000) # 48kHz wav.writeframes(audio_int16.tobytes()) # Send WAV data yield buffer.getvalue() return StreamingResponse( generate_wav(), media_type="audio/wav", headers={ "Content-Disposition": f'inline; filename="tts.wav"', "Cache-Control": "no-cache" } ) @app.get("/") async def root(): """Root endpoint""" return { "service": "Soprano + RVC Streaming API", "version": "1.0", "endpoints": { "speak": "POST /api/speak - Queue text for synthesis", "stream": "GET /api/stream/continuous - Continuous audio stream", "stream_wav": "GET /stream/wav?text=... - HTTP streaming (VLC compatible)", "websocket": "WS /ws/stream - Token-by-token streaming (recommended for Discord)", "status": "GET /api/status - Pipeline status", "health": "GET /health - Health check" } } if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=8765)