Files
miku-discord/soprano_to_rvc/benchmark_complete.py
koko210Serve 8ca716029e add: absorb soprano_to_rvc as regular subdirectory
Voice conversion pipeline (Soprano TTS → RVC) with Docker support.
Previously tracked as bare gitlink; removed .git/ directories and
absorbed into main repo for unified tracking.

Includes: Soprano TTS, RVC WebUI integration, Docker configs,
WebSocket API, and benchmark scripts.
Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index).
287 files (3.1GB of ML weights properly excluded via gitignore).
2026-03-04 00:24:53 +02:00

455 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Complete component benchmark replicating the actual soprano_rvc_api.py pipeline logic.
This ensures accurate RVC-only performance measurement.
"""
import sys
import time
import os
import numpy as np
import zmq
import torch
import json
from collections import defaultdict
import statistics
sys.path.insert(0, '/app/Retrieval-based-Voice-Conversion-WebUI')
# Test sentences
# (label, text) pairs of increasing length, used to observe how TTS and
# voice-conversion latency scale with utterance size.
TEST_CASES = [
    ("tiny", "Hello!"),
    ("short", "Hello, this is a test."),
    ("medium", "The quick brown fox jumps over the lazy dog."),
    ("long", "Artificial intelligence is revolutionizing the way we interact with technology."),
    ("very_long", "Artificial intelligence and machine learning are fundamentally transforming our understanding of human-computer interaction, enabling unprecedented levels of automation and personalization across diverse industries."),
]
# Repetitions per test case, so mean/median/stdev statistics can be reported.
RUNS_PER_TEST = 5
def setup_soprano_connection():
    """Create and return a ZeroMQ REQ socket connected to the Soprano TTS server."""
    context = zmq.Context()
    sock = context.socket(zmq.REQ)
    # 30 s receive timeout so a dead server fails the benchmark instead of
    # blocking it forever.
    sock.setsockopt(zmq.RCVTIMEO, 30000)
    sock.connect("tcp://soprano:5555")
    return sock
def call_soprano(socket, text):
    """Synthesize *text* through the Soprano server and time the round trip.

    Args:
        socket: a connected REQ-style socket exposing send_json/recv_json.
        text: the sentence to synthesize.

    Returns:
        (audio, elapsed): the samples as a 1-D float32 numpy array and the
        request round-trip time in seconds.

    Raises:
        RuntimeError: if the server reply contains an 'error' field.
    """
    import uuid
    job_id = str(uuid.uuid4())
    # perf_counter() is monotonic; time.time() can jump with wall-clock
    # adjustments and would skew the latency measurement.
    start = time.perf_counter()
    socket.send_json({"job_id": job_id, "text": text})
    response = socket.recv_json()
    elapsed = time.perf_counter() - start
    if 'error' in response:
        # RuntimeError subclasses Exception, so callers catching Exception
        # (as run_benchmark does) are unaffected.
        raise RuntimeError(f"Soprano error: {response['error']}")
    audio = np.array(response['audio'], dtype=np.float32)
    return audio, elapsed
def setup_rvc():
    """Initialize RVC exactly as soprano_rvc_api.py does.

    Loads the RVC realtime model and index, creates the 32k→48k/16k
    resamplers, pre-allocates the sliding-window input buffers, and returns
    everything bundled in a context dict consumed by
    process_audio_through_rvc().
    """
    # Project-local imports; they resolve because the RVC checkout was added
    # to sys.path at the top of this script.
    from infer.lib import rtrvc as rvc_for_realtime
    from configs.config import Config
    from multiprocessing import Queue as MPQueue
    import torchaudio
    # Change to RVC directory — presumably because Config()/RVC() resolve
    # the relative asset paths below against the CWD; restored in `finally`.
    rvc_dir = '/app/Retrieval-based-Voice-Conversion-WebUI'
    original_dir = os.getcwd()
    os.chdir(rvc_dir)
    try:
        config = Config()
        device = config.device
        # Create queues — worker queues required by the realtime RVC constructor.
        inp_q = MPQueue()
        opt_q = MPQueue()
        # Initialize RVC
        rvc = rvc_for_realtime.RVC(
            key=0,
            formant=0,
            pth_path="assets/weights/MikuAI_e210_s6300.pth",
            index_path="assets/indices/added_IVF1811_Flat_nprobe_1_MikuAI_v2.index",
            index_rate=0.5,
            n_cpu=4,
            inp_q=inp_q,
            opt_q=opt_q,
            config=config,
            last_rvc=None
        )
        # Optimizations
        torch.set_grad_enabled(False)  # inference only — no autograd graphs
        torch.backends.cudnn.benchmark = True  # autotune kernels for fixed shapes
        if config.is_half:
            # Half-precision inference when the RVC config enables it.
            rvc.model = rvc.model.half()
            rvc.net_g = rvc.net_g.half()
        # Create resamplers (32kHz Soprano → 48kHz and 16kHz)
        resampler_32_to_48 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=48000
        ).to(device)
        resampler_32_to_16 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=16000
        ).to(device)
        # Prepare RVC buffers
        block_frame = 9600  # 0.2s @ 48kHz
        block_frame_16k = 3200  # 0.2s @ 16kHz
        crossfade_frame = int(block_frame * 0.25)  # 0.05s
        extra_frame = int(block_frame * 9)  # 1.8s context
        # NOTE(review): input_wav_res is the 16 kHz buffer, yet its size mixes
        # 48 kHz counts (extra_frame, crossfade_frame) with the 16 kHz block
        # length — assumed to mirror soprano_rvc_api.py; confirm against it.
        input_wav_res = torch.zeros(
            extra_frame + crossfade_frame + block_frame_16k + crossfade_frame,
            device=device, dtype=torch.float32
        )
        input_wav = torch.zeros(
            extra_frame + crossfade_frame + block_frame + crossfade_frame,
            device=device, dtype=torch.float32
        )
        # RVC processing parameters (zc = target_sr // 100 = 48000 // 100 = 480)
        zc = 480  # Window size for RVC processing
        skip_head = extra_frame // zc  # presumably zc-windows of context to drop
        return_length = (block_frame_16k + crossfade_frame) // zc
    finally:
        os.chdir(original_dir)
    # Bundle everything process_audio_through_rvc() needs into one dict.
    return {
        'rvc': rvc,
        'config': config,
        'device': device,
        'resampler_32_to_48': resampler_32_to_48,
        'resampler_32_to_16': resampler_32_to_16,
        'block_frame': block_frame,
        'block_frame_16k': block_frame_16k,
        'crossfade_frame': crossfade_frame,
        'extra_frame': extra_frame,
        'input_wav_res': input_wav_res,
        'input_wav': input_wav,
        'skip_head': skip_head,
        'return_length': return_length,
        'rvc_dir': rvc_dir,
    }
def process_audio_through_rvc(soprano_audio, rvc_ctx):
    """
    Process Soprano audio through RVC pipeline exactly as soprano_rvc_api.py does.

    Args:
        soprano_audio: 1-D float32 numpy array of 32 kHz samples from Soprano.
        rvc_ctx: context dict returned by setup_rvc().

    Returns:
        (converted_audio, total_time, block_times): converted audio as a 1-D
        numpy array, total seconds spent inside RVC block processing, and the
        list of per-block timings.
    """
    device = rvc_ctx['device']
    rvc = rvc_ctx['rvc']
    # Convert to tensor
    soprano_tensor = torch.from_numpy(soprano_audio).to(device).float()
    # Split into chunks (0.1s @ 32kHz)
    chunk_size = 3200
    num_chunks = (len(soprano_tensor) + chunk_size - 1) // chunk_size  # ceil division
    # Accumulation buffers — resampled audio collects here until a full RVC
    # block (0.2 s) is available.
    acc_buffer_48k = torch.tensor([], device=device, dtype=torch.float32)
    acc_buffer_16k = torch.tensor([], device=device, dtype=torch.float32)
    output_chunks = []
    block_times = []
    # chdir into the RVC checkout for the duration of inference (presumably
    # rvc.infer resolves assets relative to the CWD); restored in `finally`.
    original_dir = os.getcwd()
    os.chdir(rvc_ctx['rvc_dir'])
    try:
        for i in range(num_chunks):
            chunk_start = i * chunk_size
            chunk_end = min((i + 1) * chunk_size, len(soprano_tensor))
            soprano_chunk = soprano_tensor[chunk_start:chunk_end]
            # Resample to 48kHz and 16kHz
            # (Resample expects (batch, channel, time); index back to 1-D.)
            chunk_48k = rvc_ctx['resampler_32_to_48'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]
            chunk_16k = rvc_ctx['resampler_32_to_16'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]
            # Accumulate
            acc_buffer_48k = torch.cat([acc_buffer_48k, chunk_48k])
            acc_buffer_16k = torch.cat([acc_buffer_16k, chunk_16k])
            # Process blocks when we have enough samples
            while acc_buffer_48k.shape[0] >= rvc_ctx['block_frame']:
                block_start = time.time()
                # Take a block off the front of both accumulation buffers.
                block_48k = acc_buffer_48k[:rvc_ctx['block_frame']]
                block_16k = acc_buffer_16k[:rvc_ctx['block_frame_16k']]
                acc_buffer_48k = acc_buffer_48k[rvc_ctx['block_frame']:]
                acc_buffer_16k = acc_buffer_16k[rvc_ctx['block_frame_16k']:]
                # Update input buffers (slide window): shift left by one block,
                # write the new block at the tail. .clone() avoids overlapping
                # source/destination in the in-place copy.
                rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
                rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k
                rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
                rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k
                # RVC inference
                infer_wav = rvc.infer(
                    rvc_ctx['input_wav_res'],
                    rvc_ctx['block_frame_16k'],
                    rvc_ctx['skip_head'],
                    rvc_ctx['return_length'],
                    "rmvpe",  # f0method
                )
                # Convert to tensor (some rvc.infer code paths return numpy).
                if not torch.is_tensor(infer_wav):
                    infer_wav = torch.from_numpy(infer_wav).to(device)
                output_chunks.append(infer_wav)
                block_times.append(time.time() - block_start)
        # Flush remaining audio — whatever is left after the last full block.
        if acc_buffer_48k.shape[0] > 0:
            block_start = time.time()
            # Pad to block size with silence so the window update fits.
            pad_size_48k = rvc_ctx['block_frame'] - acc_buffer_48k.shape[0]
            pad_size_16k = rvc_ctx['block_frame_16k'] - acc_buffer_16k.shape[0]
            padding_48k = torch.zeros(pad_size_48k, device=device, dtype=torch.float32)
            padding_16k = torch.zeros(pad_size_16k, device=device, dtype=torch.float32)
            block_48k = torch.cat([acc_buffer_48k, padding_48k])
            block_16k = torch.cat([acc_buffer_16k, padding_16k])
            # Update buffers (same sliding-window scheme as above).
            rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
            rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k
            rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
            rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k
            # Inference
            infer_wav = rvc.infer(
                rvc_ctx['input_wav_res'],
                rvc_ctx['block_frame_16k'],
                rvc_ctx['skip_head'],
                rvc_ctx['return_length'],
                "rmvpe",
            )
            if not torch.is_tensor(infer_wav):
                infer_wav = torch.from_numpy(infer_wav).to(device)
            # Trim padding: keep only as many output samples as real input.
            # NOTE(review): assumes infer_wav is at the same rate/alignment as
            # the 48 kHz input block — confirm against soprano_rvc_api.py.
            output_samples = acc_buffer_48k.shape[0]
            if output_samples < len(infer_wav):
                infer_wav = infer_wav[:output_samples]
            output_chunks.append(infer_wav)
            block_times.append(time.time() - block_start)
    finally:
        os.chdir(original_dir)
    # Concatenate all output chunks
    if output_chunks:
        full_audio = torch.cat(output_chunks).cpu().numpy()
    else:
        full_audio = np.array([])
    return full_audio, sum(block_times), block_times
def run_benchmark():
    """Run the complete Soprano-vs-RVC benchmark and print a report.

    For every sentence in TEST_CASES, RUNS_PER_TEST iterations measure
    Soprano TTS latency, RVC conversion latency on that run's Soprano
    output, and the combined pipeline latency.  Per-case statistics and an
    overall bottleneck summary are printed at the end.
    """
    print("\n" + "="*80)
    print(" "*20 + "COMPREHENSIVE SOPRANO vs RVC BENCHMARK")
    print("="*80 + "\n")
    soprano_socket = setup_soprano_connection()
    print("✅ Connected to Soprano server")
    print("\n📦 Loading RVC model...")
    # perf_counter is monotonic; time.time() could be skewed by clock jumps.
    rvc_start = time.perf_counter()
    rvc_ctx = setup_rvc()
    print(f"✅ RVC loaded in {time.perf_counter() - rvc_start:.2f}s\n")
    results = {}
    for label, text in TEST_CASES:
        print(f"\n{'='*80}")
        print(f"Testing: {label.upper()}")
        print(f"Text: \"{text}\"")
        print('='*80)
        results[label] = {
            'soprano_times': [],
            'rvc_times': [],
            'rvc_block_times': [],
            'pipeline_times': [],
            'audio_duration': None,
            'num_blocks': [],
        }
        for run in range(RUNS_PER_TEST):
            print(f"\n Run {run+1}/{RUNS_PER_TEST}:")
            try:
                # Test 1: Soprano only
                soprano_audio, soprano_time = call_soprano(soprano_socket, text)
                audio_duration = len(soprano_audio) / 32000  # Soprano emits 32 kHz
                results[label]['soprano_times'].append(soprano_time)
                results[label]['audio_duration'] = audio_duration
                soprano_rtf = audio_duration / soprano_time if soprano_time > 0 else 0
                print(f" 🎤 Soprano: {soprano_time*1000:.1f}ms → {audio_duration:.2f}s audio (RTF: {soprano_rtf:.2f}x)")
                # Test 2: RVC only, on this run's Soprano output
                rvc_output, rvc_time, block_times = process_audio_through_rvc(soprano_audio, rvc_ctx)
                results[label]['rvc_times'].append(rvc_time)
                results[label]['rvc_block_times'].extend(block_times)
                results[label]['num_blocks'].append(len(block_times))
                rvc_rtf = audio_duration / rvc_time if rvc_time > 0 else 0
                avg_block_time = statistics.mean(block_times) if block_times else 0
                print(f" 🎙️ RVC: {rvc_time*1000:.1f}ms ({len(block_times)} blocks, avg {avg_block_time*1000:.1f}ms/block, RTF: {rvc_rtf:.2f}x)")
                # Test 3: Full pipeline time (sum of both stages)
                pipeline_time = soprano_time + rvc_time
                results[label]['pipeline_times'].append(pipeline_time)
                pipeline_rtf = audio_duration / pipeline_time if pipeline_time > 0 else 0
                print(f" ⚡ Pipeline: {pipeline_time*1000:.1f}ms (RTF: {pipeline_rtf:.2f}x)")
            except Exception as e:
                # A failed run is logged and skipped; the statistics below
                # are computed from the successful runs only.
                print(f" ERROR: {e}")
    _print_analysis(results)
    _print_summary(results)
    print("\n" + "="*80)
    print(" "*28 + "BENCHMARK COMPLETE")
    print("="*80 + "\n")


def _print_analysis(results):
    """Print per-test-case latency statistics (helper for run_benchmark)."""
    print("\n\n" + "="*80)
    print(" "*25 + "PERFORMANCE ANALYSIS")
    print("="*80 + "\n")
    for label, text in TEST_CASES:
        data = results[label]
        if not data['soprano_times']:
            continue
        # Bug fix: separators were ''*80, which printed empty lines.
        print(f"\n{'-'*80}")
        print(f"📊 {label.upper()}: \"{text[:60]}{'...' if len(text) > 60 else ''}\"")
        print('-'*80)
        audio_dur = data['audio_duration']
        print(f"\n 🎵 Audio Duration: {audio_dur:.2f}s")
        # Soprano stats
        s_mean = statistics.mean(data['soprano_times'])
        s_median = statistics.median(data['soprano_times'])
        s_std = statistics.stdev(data['soprano_times']) if len(data['soprano_times']) > 1 else 0
        s_rtf = audio_dur / s_mean if s_mean > 0 else 0
        print(f"\n 🎤 SOPRANO:")
        print(f" ├─ Mean: {s_mean*1000:.1f}ms")
        print(f" ├─ Median: {s_median*1000:.1f}ms")
        print(f" ├─ Std Dev: {s_std*1000:.1f}ms")
        print(f" ├─ Range: {min(data['soprano_times'])*1000:.1f} - {max(data['soprano_times'])*1000:.1f}ms")
        print(f" └─ RTF: {s_rtf:.2f}x")
        # RVC/pipeline samples only exist for runs where conversion succeeded;
        # guard so a failing RVC doesn't crash the report (statistics.mean
        # raises StatisticsError on an empty list).
        if not data['rvc_times']:
            continue
        # RVC stats
        r_mean = statistics.mean(data['rvc_times'])
        r_median = statistics.median(data['rvc_times'])
        r_std = statistics.stdev(data['rvc_times']) if len(data['rvc_times']) > 1 else 0
        r_rtf = audio_dur / r_mean if r_mean > 0 else 0
        avg_blocks = statistics.mean(data['num_blocks']) if data['num_blocks'] else 0
        avg_block_time = statistics.mean(data['rvc_block_times']) if data['rvc_block_times'] else 0
        print(f"\n 🎙️ RVC:")
        print(f" ├─ Mean: {r_mean*1000:.1f}ms")
        print(f" ├─ Median: {r_median*1000:.1f}ms")
        print(f" ├─ Std Dev: {r_std*1000:.1f}ms")
        print(f" ├─ Range: {min(data['rvc_times'])*1000:.1f} - {max(data['rvc_times'])*1000:.1f}ms")
        print(f" ├─ Avg blocks: {avg_blocks:.1f}")
        print(f" ├─ Avg per block: {avg_block_time*1000:.1f}ms")
        print(f" └─ RTF: {r_rtf:.2f}x")
        # Pipeline stats
        p_mean = statistics.mean(data['pipeline_times'])
        p_rtf = audio_dur / p_mean if p_mean > 0 else 0
        print(f"\n ⚡ FULL PIPELINE:")
        print(f" ├─ Mean: {p_mean*1000:.1f}ms")
        print(f" └─ RTF: {p_rtf:.2f}x")
        # Breakdown: each stage's share of the total pipeline time
        print(f"\n 📈 BREAKDOWN:")
        print(f" ├─ Soprano: {s_mean*1000:.1f}ms ({s_mean/p_mean*100:.1f}%)")
        print(f" └─ RVC: {r_mean*1000:.1f}ms ({r_mean/p_mean*100:.1f}%)")


def _print_summary(results):
    """Print overall average RTFs and the bottleneck verdict (helper)."""
    print("\n\n" + "="*80)
    print(" "*30 + "SUMMARY")
    print("="*80 + "\n")
    all_soprano_rtf = []
    all_rvc_rtf = []
    all_pipeline_rtf = []
    for label in results:
        data = results[label]
        if data['audio_duration'] and data['soprano_times'] and data['rvc_times']:
            s_rtf = data['audio_duration'] / statistics.mean(data['soprano_times'])
            r_rtf = data['audio_duration'] / statistics.mean(data['rvc_times'])
            p_rtf = data['audio_duration'] / statistics.mean(data['pipeline_times'])
            all_soprano_rtf.append(s_rtf)
            all_rvc_rtf.append(r_rtf)
            all_pipeline_rtf.append(p_rtf)
    if all_soprano_rtf:
        print(f" 🎤 Soprano Average RTF: {statistics.mean(all_soprano_rtf):.2f}x")
    if all_rvc_rtf:
        print(f" 🎙️ RVC Average RTF: {statistics.mean(all_rvc_rtf):.2f}x")
    if all_pipeline_rtf:
        print(f" ⚡ Pipeline Average RTF: {statistics.mean(all_pipeline_rtf):.2f}x")
    print(f"\n 💡 BOTTLENECK ANALYSIS:")
    if all_soprano_rtf and all_rvc_rtf:
        soprano_avg = statistics.mean(all_soprano_rtf)
        rvc_avg = statistics.mean(all_rvc_rtf)
        # Lower RTF == slower stage.  Guard against division by zero when a
        # stage reported a 0x RTF.
        if soprano_avg < rvc_avg:
            bottleneck = "Soprano"
            ratio = rvc_avg / soprano_avg if soprano_avg > 0 else float('inf')
        else:
            bottleneck = "RVC"
            ratio = soprano_avg / rvc_avg if rvc_avg > 0 else float('inf')
        print(f" └─ {bottleneck} is {ratio:.2f}x slower than the other")
        if bottleneck == "RVC":
            print(f"\n ✅ CONCLUSION: RVC (voice conversion) is the bottleneck, NOT Soprano")
            print(f" - Soprano: {soprano_avg:.2f}x realtime (FAST)")
            print(f" - RVC: {rvc_avg:.2f}x realtime (BOTTLENECK)")
# Script entry point: run the full benchmark when executed directly.
if __name__ == "__main__":
    run_benchmark()