#!/usr/bin/env python3
"""
Complete component benchmark replicating the actual soprano_rvc_api.py
pipeline logic. This ensures accurate RVC-only performance measurement.

Pipeline under test: text -> Soprano TTS (32 kHz audio, over ZMQ) ->
RVC voice conversion (blockwise streaming at 48 kHz / 16 kHz).
"""
import sys
import time
import os
import numpy as np
import zmq
import torch
import json
from collections import defaultdict
import statistics

sys.path.insert(0, '/app/Retrieval-based-Voice-Conversion-WebUI')

# Test sentences, ordered by length so results show how latency scales.
TEST_CASES = [
    ("tiny", "Hello!"),
    ("short", "Hello, this is a test."),
    ("medium", "The quick brown fox jumps over the lazy dog."),
    ("long", "Artificial intelligence is revolutionizing the way we interact with technology."),
    ("very_long", "Artificial intelligence and machine learning are fundamentally transforming our understanding of human-computer interaction, enabling unprecedented levels of automation and personalization across diverse industries."),
]

RUNS_PER_TEST = 5


def setup_soprano_connection():
    """Connect to the Soprano TTS server over ZMQ REQ/REP.

    Returns a connected REQ socket with a 30 s receive timeout.
    NOTE: the zmq.Context is not returned; it stays alive for the
    process lifetime (benchmark script, cleaned up at exit).
    """
    zmq_context = zmq.Context()
    soprano_socket = zmq_context.socket(zmq.REQ)
    soprano_socket.connect("tcp://soprano:5555")
    soprano_socket.setsockopt(zmq.RCVTIMEO, 30000)
    return soprano_socket


def call_soprano(socket, text):
    """Synthesize `text` via Soprano and time the round trip.

    Args:
        socket: connected zmq REQ socket (see setup_soprano_connection).
        text: sentence to synthesize.

    Returns:
        (audio, elapsed): float32 numpy array of samples (32 kHz,
        per the rest of this script) and wall-clock seconds for the
        request/response round trip.

    Raises:
        Exception: if the server response contains an 'error' key.
    """
    import uuid
    job_id = str(uuid.uuid4())
    start = time.time()
    socket.send_json({"job_id": job_id, "text": text})
    response = socket.recv_json()
    elapsed = time.time() - start
    if 'error' in response:
        raise Exception(f"Soprano error: {response['error']}")
    audio = np.array(response['audio'], dtype=np.float32)
    return audio, elapsed


def setup_rvc():
    """Initialize RVC exactly as soprano_rvc_api.py does.

    Loads the RVC model, builds 32k->48k and 32k->16k resamplers, and
    allocates the sliding-window input buffers used for blockwise
    streaming inference.

    Returns:
        dict with the model, config, device, resamplers, buffer sizes,
        preallocated input buffers, and inference window parameters.
    """
    from infer.lib import rtrvc as rvc_for_realtime
    from configs.config import Config
    from multiprocessing import Queue as MPQueue
    import torchaudio

    # RVC resolves its asset paths relative to CWD, so chdir for setup.
    rvc_dir = '/app/Retrieval-based-Voice-Conversion-WebUI'
    original_dir = os.getcwd()
    os.chdir(rvc_dir)

    try:
        config = Config()
        device = config.device

        # Create queues (required by the realtime RVC constructor).
        inp_q = MPQueue()
        opt_q = MPQueue()

        # Initialize RVC
        rvc = rvc_for_realtime.RVC(
            key=0,
            formant=0,
            pth_path="assets/weights/MikuAI_e210_s6300.pth",
            index_path="assets/indices/added_IVF1811_Flat_nprobe_1_MikuAI_v2.index",
            index_rate=0.5,
            n_cpu=4,
            inp_q=inp_q,
            opt_q=opt_q,
            config=config,
            last_rvc=None
        )

        # Inference-only optimizations.
        torch.set_grad_enabled(False)
        torch.backends.cudnn.benchmark = True
        if config.is_half:
            rvc.model = rvc.model.half()
            rvc.net_g = rvc.net_g.half()

        # Create resamplers (32kHz Soprano -> 48kHz and 16kHz)
        resampler_32_to_48 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=48000
        ).to(device)
        resampler_32_to_16 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=16000
        ).to(device)

        # Prepare RVC sliding-window buffers.
        block_frame = 9600        # 0.2s @ 48kHz
        block_frame_16k = 3200    # 0.2s @ 16kHz
        crossfade_frame = int(block_frame * 0.25)  # 0.05s
        extra_frame = int(block_frame * 9)         # 1.8s context

        input_wav_res = torch.zeros(
            extra_frame + crossfade_frame + block_frame_16k + crossfade_frame,
            device=device, dtype=torch.float32
        )
        input_wav = torch.zeros(
            extra_frame + crossfade_frame + block_frame + crossfade_frame,
            device=device, dtype=torch.float32
        )

        # RVC processing parameters (zc = target_sr // 100 = 48000 // 100 = 480)
        zc = 480  # Window size for RVC processing
        skip_head = extra_frame // zc
        return_length = (block_frame_16k + crossfade_frame) // zc
    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(original_dir)

    return {
        'rvc': rvc,
        'config': config,
        'device': device,
        'resampler_32_to_48': resampler_32_to_48,
        'resampler_32_to_16': resampler_32_to_16,
        'block_frame': block_frame,
        'block_frame_16k': block_frame_16k,
        'crossfade_frame': crossfade_frame,
        'extra_frame': extra_frame,
        'input_wav_res': input_wav_res,
        'input_wav': input_wav,
        'skip_head': skip_head,
        'return_length': return_length,
        'rvc_dir': rvc_dir,
    }


def process_audio_through_rvc(soprano_audio, rvc_ctx):
    """Process Soprano audio through RVC exactly as soprano_rvc_api.py does.

    The 32 kHz input is chunked (0.1 s), resampled to 48 kHz and 16 kHz,
    accumulated, and fed to RVC one block (0.2 s) at a time through the
    sliding input windows. A final zero-padded block flushes the
    remainder and its padding is trimmed from the output.

    Args:
        soprano_audio: float32 numpy array at 32 kHz.
        rvc_ctx: context dict from setup_rvc().

    Returns:
        (full_audio, total_time, block_times): converted audio as a
        numpy array, total RVC inference seconds, and the per-block
        timing list.
    """
    device = rvc_ctx['device']
    rvc = rvc_ctx['rvc']

    # Convert to tensor
    soprano_tensor = torch.from_numpy(soprano_audio).to(device).float()

    # Split into chunks (0.1s @ 32kHz)
    chunk_size = 3200
    num_chunks = (len(soprano_tensor) + chunk_size - 1) // chunk_size

    # Accumulation buffers
    acc_buffer_48k = torch.tensor([], device=device, dtype=torch.float32)
    acc_buffer_16k = torch.tensor([], device=device, dtype=torch.float32)

    output_chunks = []
    block_times = []

    original_dir = os.getcwd()
    os.chdir(rvc_ctx['rvc_dir'])
    try:
        for i in range(num_chunks):
            chunk_start = i * chunk_size
            chunk_end = min((i + 1) * chunk_size, len(soprano_tensor))
            soprano_chunk = soprano_tensor[chunk_start:chunk_end]

            # Resample to 48kHz and 16kHz
            chunk_48k = rvc_ctx['resampler_32_to_48'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]
            chunk_16k = rvc_ctx['resampler_32_to_16'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]

            # Accumulate
            acc_buffer_48k = torch.cat([acc_buffer_48k, chunk_48k])
            acc_buffer_16k = torch.cat([acc_buffer_16k, chunk_16k])

            # Process blocks when we have enough samples
            while acc_buffer_48k.shape[0] >= rvc_ctx['block_frame']:
                block_start = time.time()

                # Take a block
                block_48k = acc_buffer_48k[:rvc_ctx['block_frame']]
                block_16k = acc_buffer_16k[:rvc_ctx['block_frame_16k']]
                acc_buffer_48k = acc_buffer_48k[rvc_ctx['block_frame']:]
                acc_buffer_16k = acc_buffer_16k[rvc_ctx['block_frame_16k']:]

                # Update input buffers (slide window)
                rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
                rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k
                rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
                rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k

                # RVC inference
                infer_wav = rvc.infer(
                    rvc_ctx['input_wav_res'],
                    rvc_ctx['block_frame_16k'],
                    rvc_ctx['skip_head'],
                    rvc_ctx['return_length'],
                    "rmvpe",  # f0method
                )

                # Some RVC builds return numpy; normalize to tensor.
                if not torch.is_tensor(infer_wav):
                    infer_wav = torch.from_numpy(infer_wav).to(device)

                output_chunks.append(infer_wav)
                block_times.append(time.time() - block_start)

        # Flush remaining audio
        if acc_buffer_48k.shape[0] > 0:
            block_start = time.time()

            # Pad to block size
            pad_size_48k = rvc_ctx['block_frame'] - acc_buffer_48k.shape[0]
            pad_size_16k = rvc_ctx['block_frame_16k'] - acc_buffer_16k.shape[0]
            padding_48k = torch.zeros(pad_size_48k, device=device, dtype=torch.float32)
            padding_16k = torch.zeros(pad_size_16k, device=device, dtype=torch.float32)
            block_48k = torch.cat([acc_buffer_48k, padding_48k])
            block_16k = torch.cat([acc_buffer_16k, padding_16k])

            # Update buffers
            rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
            rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k
            rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
            rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k

            # Inference
            infer_wav = rvc.infer(
                rvc_ctx['input_wav_res'],
                rvc_ctx['block_frame_16k'],
                rvc_ctx['skip_head'],
                rvc_ctx['return_length'],
                "rmvpe",
            )
            if not torch.is_tensor(infer_wav):
                infer_wav = torch.from_numpy(infer_wav).to(device)

            # Trim the zero-padding from the flushed block's output.
            output_samples = acc_buffer_48k.shape[0]
            if output_samples < len(infer_wav):
                infer_wav = infer_wav[:output_samples]

            output_chunks.append(infer_wav)
            block_times.append(time.time() - block_start)
    finally:
        os.chdir(original_dir)

    # Concatenate all output chunks
    if output_chunks:
        full_audio = torch.cat(output_chunks).cpu().numpy()
    else:
        full_audio = np.array([])

    return full_audio, sum(block_times), block_times


def run_benchmark():
    """Run comprehensive benchmark"""
    print("\n" + "="*80)
    print(" "*20 + "COMPREHENSIVE SOPRANO vs RVC BENCHMARK")
    print("="*80 + "\n")

    soprano_socket = setup_soprano_connection()
    print("āœ… Connected to Soprano server")

    print("\nšŸ“¦ Loading RVC model...")
    rvc_start = time.time()
    rvc_ctx = setup_rvc()
    print(f"āœ… RVC loaded in {time.time() - rvc_start:.2f}s\n")

    results = {}

    for label, text in TEST_CASES:
        print(f"\n{'='*80}")
        print(f"Testing: {label.upper()}")
        print(f"Text: \"{text}\"")
        print('='*80)

        results[label] = {
            'soprano_times': [],
            'rvc_times': [],
            'rvc_block_times': [],
            'pipeline_times': [],
            'audio_duration': None,
            'num_blocks': [],
        }

        for run in range(RUNS_PER_TEST):
            print(f"\n  Run {run+1}/{RUNS_PER_TEST}:")

            # Test 1: Soprano only
            try:
                soprano_audio, soprano_time = call_soprano(soprano_socket, text)
                audio_duration = len(soprano_audio) / 32000
                results[label]['soprano_times'].append(soprano_time)
                results[label]['audio_duration'] = audio_duration
                soprano_rtf = audio_duration / soprano_time if soprano_time > 0 else 0
                print(f"    šŸŽ¤ Soprano: {soprano_time*1000:.1f}ms → {audio_duration:.2f}s audio (RTF: {soprano_rtf:.2f}x)")

                # Test 2: RVC only (feeding this run's Soprano output)
                rvc_output, rvc_time, block_times = process_audio_through_rvc(soprano_audio, rvc_ctx)
                results[label]['rvc_times'].append(rvc_time)
                results[label]['rvc_block_times'].extend(block_times)
                results[label]['num_blocks'].append(len(block_times))
                rvc_rtf = audio_duration / rvc_time if rvc_time > 0 else 0
                avg_block_time = statistics.mean(block_times) if block_times else 0
                print(f"    šŸŽ™ļø  RVC: {rvc_time*1000:.1f}ms ({len(block_times)} blocks, avg {avg_block_time*1000:.1f}ms/block, RTF: {rvc_rtf:.2f}x)")

                # Test 3: Full pipeline time
                pipeline_time = soprano_time + rvc_time
                results[label]['pipeline_times'].append(pipeline_time)
                pipeline_rtf = audio_duration / pipeline_time if pipeline_time > 0 else 0
                print(f"    ⚔ Pipeline: {pipeline_time*1000:.1f}ms (RTF: {pipeline_rtf:.2f}x)")

            except Exception as e:
                print(f"    ERROR: {e}")

    # Print comprehensive analysis
    print("\n\n" + "="*80)
    print(" "*25 + "PERFORMANCE ANALYSIS")
    print("="*80 + "\n")

    for label, text in TEST_CASES:
        data = results[label]
        # Skip tests where either stage never produced a timing sample;
        # statistics.mean([]) would raise StatisticsError otherwise.
        if not data['soprano_times'] or not data['rvc_times'] or not data['pipeline_times']:
            continue

        print(f"\n{'─'*80}")
        print(f"šŸ“Š {label.upper()}: \"{text[:60]}{'...' if len(text) > 60 else ''}\"")
        print('─'*80)

        audio_dur = data['audio_duration']
        print(f"\n  šŸŽµ Audio Duration: {audio_dur:.2f}s")

        # Soprano stats
        s_mean = statistics.mean(data['soprano_times'])
        s_median = statistics.median(data['soprano_times'])
        s_std = statistics.stdev(data['soprano_times']) if len(data['soprano_times']) > 1 else 0
        s_rtf = audio_dur / s_mean if s_mean > 0 else 0
        print(f"\n  šŸŽ¤ SOPRANO:")
        print(f"     ā”œā”€ Mean: {s_mean*1000:.1f}ms")
        print(f"     ā”œā”€ Median: {s_median*1000:.1f}ms")
        print(f"     ā”œā”€ Std Dev: {s_std*1000:.1f}ms")
        print(f"     ā”œā”€ Range: {min(data['soprano_times'])*1000:.1f} - {max(data['soprano_times'])*1000:.1f}ms")
        print(f"     └─ RTF: {s_rtf:.2f}x")

        # RVC stats
        r_mean = statistics.mean(data['rvc_times'])
        r_median = statistics.median(data['rvc_times'])
        r_std = statistics.stdev(data['rvc_times']) if len(data['rvc_times']) > 1 else 0
        r_rtf = audio_dur / r_mean if r_mean > 0 else 0
        avg_blocks = statistics.mean(data['num_blocks'])
        avg_block_time = statistics.mean(data['rvc_block_times']) if data['rvc_block_times'] else 0
        print(f"\n  šŸŽ™ļø  RVC:")
        print(f"     ā”œā”€ Mean: {r_mean*1000:.1f}ms")
        print(f"     ā”œā”€ Median: {r_median*1000:.1f}ms")
        print(f"     ā”œā”€ Std Dev: {r_std*1000:.1f}ms")
        print(f"     ā”œā”€ Range: {min(data['rvc_times'])*1000:.1f} - {max(data['rvc_times'])*1000:.1f}ms")
        print(f"     ā”œā”€ Avg blocks: {avg_blocks:.1f}")
        print(f"     ā”œā”€ Avg per block: {avg_block_time*1000:.1f}ms")
        print(f"     └─ RTF: {r_rtf:.2f}x")

        # Pipeline stats
        p_mean = statistics.mean(data['pipeline_times'])
        p_rtf = audio_dur / p_mean if p_mean > 0 else 0
        print(f"\n  ⚔ FULL PIPELINE:")
        print(f"     ā”œā”€ Mean: {p_mean*1000:.1f}ms")
        print(f"     └─ RTF: {p_rtf:.2f}x")

        # Breakdown
        print(f"\n  šŸ“ˆ BREAKDOWN:")
        print(f"     ā”œā”€ Soprano: {s_mean*1000:.1f}ms ({s_mean/p_mean*100:.1f}%)")
        print(f"     └─ RVC: {r_mean*1000:.1f}ms ({r_mean/p_mean*100:.1f}%)")

    # Summary
    print("\n\n" + "="*80)
    print(" "*30 + "SUMMARY")
    print("="*80 + "\n")

    all_soprano_rtf = []
    all_rvc_rtf = []
    all_pipeline_rtf = []

    for label in results:
        data = results[label]
        if data['audio_duration'] and data['soprano_times'] and data['rvc_times']:
            s_rtf = data['audio_duration'] / statistics.mean(data['soprano_times'])
            r_rtf = data['audio_duration'] / statistics.mean(data['rvc_times'])
            p_rtf = data['audio_duration'] / statistics.mean(data['pipeline_times'])
            all_soprano_rtf.append(s_rtf)
            all_rvc_rtf.append(r_rtf)
            all_pipeline_rtf.append(p_rtf)

    if all_soprano_rtf:
        print(f"  šŸŽ¤ Soprano Average RTF: {statistics.mean(all_soprano_rtf):.2f}x")
    if all_rvc_rtf:
        print(f"  šŸŽ™ļø  RVC Average RTF: {statistics.mean(all_rvc_rtf):.2f}x")
    if all_pipeline_rtf:
        print(f"  ⚔ Pipeline Average RTF: {statistics.mean(all_pipeline_rtf):.2f}x")

    print(f"\n  šŸ’” BOTTLENECK ANALYSIS:")
    if all_soprano_rtf and all_rvc_rtf:
        soprano_avg = statistics.mean(all_soprano_rtf)
        rvc_avg = statistics.mean(all_rvc_rtf)
        if soprano_avg < rvc_avg:
            bottleneck = "Soprano"
            ratio = rvc_avg / soprano_avg
        else:
            bottleneck = "RVC"
            ratio = soprano_avg / rvc_avg
        print(f"     └─ {bottleneck} is {ratio:.2f}x slower than the other")

        if bottleneck == "RVC":
            print(f"\n  āœ… CONCLUSION: RVC (voice conversion) is the bottleneck, NOT Soprano")
            print(f"     - Soprano: {soprano_avg:.2f}x realtime (FAST)")
            print(f"     - RVC: {rvc_avg:.2f}x realtime (BOTTLENECK)")

    # Release the ZMQ socket; linger=0 so pending messages don't block exit.
    soprano_socket.close(linger=0)

    print("\n" + "="*80)
    print(" "*28 + "BENCHMARK COMPLETE")
    print("="*80 + "\n")


if __name__ == "__main__":
    run_benchmark()