Voice conversion pipeline (Soprano TTS → RVC) with Docker support. Previously tracked as bare gitlink; removed .git/ directories and absorbed into main repo for unified tracking. Includes: Soprano TTS, RVC WebUI integration, Docker configs, WebSocket API, and benchmark scripts. Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index). 287 files (3.1GB of ML weights properly excluded via gitignore).
455 lines
17 KiB
Python
455 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Complete component benchmark replicating the actual soprano_rvc_api.py pipeline logic.
|
|
This ensures accurate RVC-only performance measurement.
|
|
"""
|
|
|
|
import sys
|
|
import time
|
|
import os
|
|
import numpy as np
|
|
import zmq
|
|
import torch
|
|
import json
|
|
from collections import defaultdict
|
|
import statistics
|
|
|
|
sys.path.insert(0, '/app/Retrieval-based-Voice-Conversion-WebUI')
|
|
|
|
# Test sentences of increasing length, used to observe how Soprano and RVC
# latency scale with utterance size (tiny ≈ 1 word up to very_long ≈ 30 words).
TEST_CASES = [
    ("tiny", "Hello!"),
    ("short", "Hello, this is a test."),
    ("medium", "The quick brown fox jumps over the lazy dog."),
    ("long", "Artificial intelligence is revolutionizing the way we interact with technology."),
    ("very_long", "Artificial intelligence and machine learning are fundamentally transforming our understanding of human-computer interaction, enabling unprecedented levels of automation and personalization across diverse industries."),
]

# Number of repeated runs per test case; results are averaged in the analysis.
RUNS_PER_TEST = 5
|
|
|
|
|
|
def setup_soprano_connection():
    """Create a ZeroMQ REQ socket connected to the Soprano TTS server.

    Returns:
        zmq.Socket: REQ socket bound to tcp://soprano:5555 with 30 s
        send/receive timeouts.
    """
    zmq_context = zmq.Context()
    soprano_socket = zmq_context.socket(zmq.REQ)
    soprano_socket.connect("tcp://soprano:5555")
    # Fail fast instead of blocking forever when the server stalls.
    soprano_socket.setsockopt(zmq.RCVTIMEO, 30000)
    soprano_socket.setsockopt(zmq.SNDTIMEO, 30000)
    # pyzmq's default LINGER is infinite: without this, process shutdown can
    # hang on an unsent request if the server is unreachable.
    soprano_socket.setsockopt(zmq.LINGER, 0)
    return soprano_socket
|
|
|
|
|
|
def call_soprano(socket, text):
    """Synthesize *text* through the Soprano server and time the round trip.

    Args:
        socket: connected REQ-style socket exposing send_json / recv_json.
        text: utterance to synthesize.

    Returns:
        (audio, elapsed): float32 numpy array of samples (32 kHz per the
        callers in this file) and the request round-trip time in seconds.

    Raises:
        RuntimeError: if the server response contains an 'error' field.
    """
    import uuid
    # Unique job id so the server can correlate/track this request.
    job_id = str(uuid.uuid4())

    # perf_counter is monotonic: immune to wall-clock adjustments that would
    # corrupt a time.time()-based interval measurement.
    start = time.perf_counter()
    socket.send_json({"job_id": job_id, "text": text})
    response = socket.recv_json()
    elapsed = time.perf_counter() - start

    if 'error' in response:
        raise RuntimeError(f"Soprano error: {response['error']}")

    audio = np.array(response['audio'], dtype=np.float32)
    return audio, elapsed
|
|
|
|
|
|
def setup_rvc():
    """Initialize RVC exactly as soprano_rvc_api.py does.

    Loads the RVC model + retrieval index, builds the 32k→48k and 32k→16k
    resamplers, and allocates the sliding-window input buffers used by the
    streaming conversion path.

    Returns:
        dict: context consumed by process_audio_through_rvc() — model handle,
        device, resamplers, block/crossfade/extra frame sizes, pre-allocated
        input buffers, and skip_head/return_length inference parameters.
    """
    # Project-local imports: only resolvable inside the RVC container image
    # (sys.path is extended to the RVC repo at module import time).
    from infer.lib import rtrvc as rvc_for_realtime
    from configs.config import Config
    from multiprocessing import Queue as MPQueue
    import torchaudio

    # Change to RVC directory — Config and RVC resolve the relative
    # weight/index paths below against the repo root.
    rvc_dir = '/app/Retrieval-based-Voice-Conversion-WebUI'
    original_dir = os.getcwd()
    os.chdir(rvc_dir)

    try:
        config = Config()
        device = config.device  # device string chosen by Config (GPU or CPU)

        # Create queues.
        # NOTE(review): presumably consumed by RVC's internal worker processes
        # (n_cpu=4 below) for parallel feature work — confirm against rtrvc.
        inp_q = MPQueue()
        opt_q = MPQueue()

        # Initialize RVC
        rvc = rvc_for_realtime.RVC(
            key=0,          # pitch shift in semitones (0 = no transpose)
            formant=0,      # formant shift disabled
            pth_path="assets/weights/MikuAI_e210_s6300.pth",
            index_path="assets/indices/added_IVF1811_Flat_nprobe_1_MikuAI_v2.index",
            index_rate=0.5, # blend between index retrieval and raw features
            n_cpu=4,
            inp_q=inp_q,
            opt_q=opt_q,
            config=config,
            last_rvc=None   # no previous instance to reuse state from
        )

        # Optimizations: inference-only, so disable autograd globally and let
        # cuDNN autotune kernels for the fixed block sizes used below.
        torch.set_grad_enabled(False)
        torch.backends.cudnn.benchmark = True

        if config.is_half:
            # Cast to fp16 when Config decided the device supports it.
            rvc.model = rvc.model.half()
            rvc.net_g = rvc.net_g.half()

        # Create resamplers (32kHz Soprano → 48kHz and 16kHz)
        resampler_32_to_48 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=48000
        ).to(device)

        resampler_32_to_16 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=16000
        ).to(device)

        # Prepare RVC buffers
        block_frame = 9600  # 0.2s @ 48kHz
        block_frame_16k = 3200  # 0.2s @ 16kHz
        crossfade_frame = int(block_frame * 0.25)  # 0.05s
        extra_frame = int(block_frame * 9)  # 1.8s context

        # NOTE(review): this size mixes 48 kHz-domain frame counts
        # (extra/crossfade) with the 16 kHz block size — it mirrors the
        # upstream realtime-GUI buffer layout; confirm against
        # soprano_rvc_api.py before changing.
        input_wav_res = torch.zeros(
            extra_frame + crossfade_frame + block_frame_16k + crossfade_frame,
            device=device, dtype=torch.float32
        )

        # 48 kHz sliding context window (1.8s context + crossfades + block).
        input_wav = torch.zeros(
            extra_frame + crossfade_frame + block_frame + crossfade_frame,
            device=device, dtype=torch.float32
        )

        # RVC processing parameters (zc = target_sr // 100 = 48000 // 100 = 480)
        zc = 480  # Window size for RVC processing
        skip_head = extra_frame // zc          # context windows to discard from output
        return_length = (block_frame_16k + crossfade_frame) // zc  # windows to return

    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(original_dir)

    # Reached only on success (an exception above propagates past the finally).
    return {
        'rvc': rvc,
        'config': config,
        'device': device,
        'resampler_32_to_48': resampler_32_to_48,
        'resampler_32_to_16': resampler_32_to_16,
        'block_frame': block_frame,
        'block_frame_16k': block_frame_16k,
        'crossfade_frame': crossfade_frame,
        'extra_frame': extra_frame,
        'input_wav_res': input_wav_res,
        'input_wav': input_wav,
        'skip_head': skip_head,
        'return_length': return_length,
        'rvc_dir': rvc_dir,
    }
|
|
|
|
|
|
def process_audio_through_rvc(soprano_audio, rvc_ctx):
    """
    Process Soprano audio through RVC pipeline exactly as soprano_rvc_api.py does.

    Streams the 32 kHz input through RVC in fixed 0.2 s blocks, timing each
    block's conversion, then flushes the zero-padded remainder.

    Args:
        soprano_audio: 1-D float32 numpy array of 32 kHz samples.
        rvc_ctx: context dict produced by setup_rvc().

    Returns: (converted_audio, total_time, block_times) — converted audio as a
    numpy array, the summed per-block conversion time in seconds, and the list
    of individual block times.

    NOTE(review): the sliding-window buffers in rvc_ctx are mutated in place,
    so conversion context carries over between successive calls — confirm this
    matches the intended streaming semantics.
    """
    device = rvc_ctx['device']
    rvc = rvc_ctx['rvc']

    # Convert to tensor
    soprano_tensor = torch.from_numpy(soprano_audio).to(device).float()

    # Split into chunks (0.1s @ 32kHz)
    chunk_size = 3200
    num_chunks = (len(soprano_tensor) + chunk_size - 1) // chunk_size  # ceil-div

    # Accumulation buffers: hold resampled audio until a full block is ready.
    acc_buffer_48k = torch.tensor([], device=device, dtype=torch.float32)
    acc_buffer_16k = torch.tensor([], device=device, dtype=torch.float32)

    output_chunks = []
    block_times = []

    original_dir = os.getcwd()
    # rvc.infer resolves its assets relative to the RVC repo root.
    os.chdir(rvc_ctx['rvc_dir'])

    try:
        for i in range(num_chunks):
            chunk_start = i * chunk_size
            chunk_end = min((i + 1) * chunk_size, len(soprano_tensor))
            soprano_chunk = soprano_tensor[chunk_start:chunk_end]

            # Resample to 48kHz and 16kHz
            # (unsqueeze to (1, 1, T) for the Resample module, then back to 1-D)
            chunk_48k = rvc_ctx['resampler_32_to_48'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]
            chunk_16k = rvc_ctx['resampler_32_to_16'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]

            # Accumulate
            acc_buffer_48k = torch.cat([acc_buffer_48k, chunk_48k])
            acc_buffer_16k = torch.cat([acc_buffer_16k, chunk_16k])

            # Process blocks when we have enough samples. The two buffers grow
            # in a fixed 3:1 ratio (48k vs 16k from the same 32k source), so
            # checking the 48k length alone suffices.
            while acc_buffer_48k.shape[0] >= rvc_ctx['block_frame']:
                block_start = time.time()

                # Take a block
                block_48k = acc_buffer_48k[:rvc_ctx['block_frame']]
                block_16k = acc_buffer_16k[:rvc_ctx['block_frame_16k']]

                acc_buffer_48k = acc_buffer_48k[rvc_ctx['block_frame']:]
                acc_buffer_16k = acc_buffer_16k[rvc_ctx['block_frame_16k']:]

                # Update input buffers (slide window). The .clone() guards the
                # in-place shift against overlapping-copy aliasing.
                rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
                rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k

                rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
                rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k

                # RVC inference
                infer_wav = rvc.infer(
                    rvc_ctx['input_wav_res'],
                    rvc_ctx['block_frame_16k'],
                    rvc_ctx['skip_head'],
                    rvc_ctx['return_length'],
                    "rmvpe",  # f0method
                )

                # Convert to tensor — rvc.infer may hand back a numpy array.
                if not torch.is_tensor(infer_wav):
                    infer_wav = torch.from_numpy(infer_wav).to(device)

                output_chunks.append(infer_wav)
                block_times.append(time.time() - block_start)

        # Flush remaining audio (partial final block, zero-padded to size).
        if acc_buffer_48k.shape[0] > 0:
            block_start = time.time()

            # Pad to block size
            pad_size_48k = rvc_ctx['block_frame'] - acc_buffer_48k.shape[0]
            pad_size_16k = rvc_ctx['block_frame_16k'] - acc_buffer_16k.shape[0]

            padding_48k = torch.zeros(pad_size_48k, device=device, dtype=torch.float32)
            padding_16k = torch.zeros(pad_size_16k, device=device, dtype=torch.float32)

            block_48k = torch.cat([acc_buffer_48k, padding_48k])
            block_16k = torch.cat([acc_buffer_16k, padding_16k])

            # Update buffers (same sliding-window mechanics as above).
            rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
            rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k

            rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
            rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k

            # Inference
            infer_wav = rvc.infer(
                rvc_ctx['input_wav_res'],
                rvc_ctx['block_frame_16k'],
                rvc_ctx['skip_head'],
                rvc_ctx['return_length'],
                "rmvpe",
            )

            if not torch.is_tensor(infer_wav):
                infer_wav = torch.from_numpy(infer_wav).to(device)

            # Trim padding: keep only as many output samples as real input
            # remained. NOTE(review): assumes infer output is in the 48 kHz
            # domain — confirm against rtrvc.
            output_samples = acc_buffer_48k.shape[0]
            if output_samples < len(infer_wav):
                infer_wav = infer_wav[:output_samples]

            output_chunks.append(infer_wav)
            block_times.append(time.time() - block_start)

    finally:
        # Restore the caller's working directory even if inference failed.
        os.chdir(original_dir)

    # Concatenate all output chunks back into one host-side array.
    if output_chunks:
        full_audio = torch.cat(output_chunks).cpu().numpy()
    else:
        full_audio = np.array([])

    return full_audio, sum(block_times), block_times
|
|
|
|
|
|
def run_benchmark():
    """Run the full Soprano-vs-RVC benchmark.

    For every entry in TEST_CASES, performs RUNS_PER_TEST runs that measure
    (1) Soprano TTS latency, (2) RVC conversion latency on that run's audio,
    and (3) the summed pipeline latency, then prints per-case statistics and
    an overall bottleneck summary. Failed runs are reported and skipped; they
    simply contribute fewer samples to the statistics.
    """
    print("\n" + "="*80)
    print(" "*20 + "COMPREHENSIVE SOPRANO vs RVC BENCHMARK")
    print("="*80 + "\n")

    soprano_socket = setup_soprano_connection()
    print("✅ Connected to Soprano server")

    print("\n📦 Loading RVC model...")
    rvc_start = time.time()
    rvc_ctx = setup_rvc()
    print(f"✅ RVC loaded in {time.time() - rvc_start:.2f}s\n")

    results = {}

    for label, text in TEST_CASES:
        print(f"\n{'='*80}")
        print(f"Testing: {label.upper()}")
        print(f"Text: \"{text}\"")
        print('='*80)

        results[label] = {
            'soprano_times': [],
            'rvc_times': [],
            'rvc_block_times': [],
            'pipeline_times': [],
            'audio_duration': None,
            'num_blocks': [],
        }

        for run in range(RUNS_PER_TEST):
            print(f"\n Run {run+1}/{RUNS_PER_TEST}:")

            try:
                # Test 1: Soprano only
                soprano_audio, soprano_time = call_soprano(soprano_socket, text)
                audio_duration = len(soprano_audio) / 32000  # Soprano outputs 32 kHz
                results[label]['soprano_times'].append(soprano_time)
                results[label]['audio_duration'] = audio_duration

                soprano_rtf = audio_duration / soprano_time if soprano_time > 0 else 0
                print(f" 🎤 Soprano: {soprano_time*1000:.1f}ms → {audio_duration:.2f}s audio (RTF: {soprano_rtf:.2f}x)")

                # Test 2: RVC only (converting this run's fresh Soprano output)
                rvc_output, rvc_time, block_times = process_audio_through_rvc(soprano_audio, rvc_ctx)
                results[label]['rvc_times'].append(rvc_time)
                results[label]['rvc_block_times'].extend(block_times)
                results[label]['num_blocks'].append(len(block_times))

                rvc_rtf = audio_duration / rvc_time if rvc_time > 0 else 0
                avg_block_time = statistics.mean(block_times) if block_times else 0
                print(f" 🎙️ RVC: {rvc_time*1000:.1f}ms ({len(block_times)} blocks, avg {avg_block_time*1000:.1f}ms/block, RTF: {rvc_rtf:.2f}x)")

                # Test 3: Full pipeline time (stages run sequentially, so sum)
                pipeline_time = soprano_time + rvc_time
                results[label]['pipeline_times'].append(pipeline_time)

                pipeline_rtf = audio_duration / pipeline_time if pipeline_time > 0 else 0
                print(f" ⚡ Pipeline: {pipeline_time*1000:.1f}ms (RTF: {pipeline_rtf:.2f}x)")

            except Exception as e:
                # Keep benchmarking: a failed run leaves fewer samples for this case.
                print(f" ERROR: {e}")

    _print_analysis(results)
    _print_summary(results)

    print("\n" + "="*80)
    print(" "*28 + "BENCHMARK COMPLETE")
    print("="*80 + "\n")


def _print_analysis(results):
    """Print per-test-case latency statistics (one section per case)."""
    print("\n\n" + "="*80)
    print(" "*25 + "PERFORMANCE ANALYSIS")
    print("="*80 + "\n")

    for label, text in TEST_CASES:
        data = results[label]

        # Skip cases without usable samples. Checking rvc_times too avoids a
        # StatisticsError below when Soprano succeeded but every RVC pass
        # failed (rvc_times/pipeline_times would be empty).
        if not data['soprano_times'] or not data['rvc_times']:
            continue

        print(f"\n{'─'*80}")
        print(f"📊 {label.upper()}: \"{text[:60]}{'...' if len(text) > 60 else ''}\"")
        print('─'*80)

        audio_dur = data['audio_duration']
        print(f"\n 🎵 Audio Duration: {audio_dur:.2f}s")

        # Soprano stats (stdev needs >= 2 samples)
        s_mean = statistics.mean(data['soprano_times'])
        s_median = statistics.median(data['soprano_times'])
        s_std = statistics.stdev(data['soprano_times']) if len(data['soprano_times']) > 1 else 0
        s_rtf = audio_dur / s_mean if s_mean > 0 else 0

        print(f"\n 🎤 SOPRANO:")
        print(f" ├─ Mean: {s_mean*1000:.1f}ms")
        print(f" ├─ Median: {s_median*1000:.1f}ms")
        print(f" ├─ Std Dev: {s_std*1000:.1f}ms")
        print(f" ├─ Range: {min(data['soprano_times'])*1000:.1f} - {max(data['soprano_times'])*1000:.1f}ms")
        print(f" └─ RTF: {s_rtf:.2f}x")

        # RVC stats
        r_mean = statistics.mean(data['rvc_times'])
        r_median = statistics.median(data['rvc_times'])
        r_std = statistics.stdev(data['rvc_times']) if len(data['rvc_times']) > 1 else 0
        r_rtf = audio_dur / r_mean if r_mean > 0 else 0

        avg_blocks = statistics.mean(data['num_blocks'])
        avg_block_time = statistics.mean(data['rvc_block_times']) if data['rvc_block_times'] else 0

        print(f"\n 🎙️ RVC:")
        print(f" ├─ Mean: {r_mean*1000:.1f}ms")
        print(f" ├─ Median: {r_median*1000:.1f}ms")
        print(f" ├─ Std Dev: {r_std*1000:.1f}ms")
        print(f" ├─ Range: {min(data['rvc_times'])*1000:.1f} - {max(data['rvc_times'])*1000:.1f}ms")
        print(f" ├─ Avg blocks: {avg_blocks:.1f}")
        print(f" ├─ Avg per block: {avg_block_time*1000:.1f}ms")
        print(f" └─ RTF: {r_rtf:.2f}x")

        # Pipeline stats (non-empty whenever rvc_times is — appended together)
        p_mean = statistics.mean(data['pipeline_times'])
        p_rtf = audio_dur / p_mean if p_mean > 0 else 0

        print(f"\n ⚡ FULL PIPELINE:")
        print(f" ├─ Mean: {p_mean*1000:.1f}ms")
        print(f" └─ RTF: {p_rtf:.2f}x")

        # Breakdown: share of total pipeline latency per stage
        print(f"\n 📈 BREAKDOWN:")
        print(f" ├─ Soprano: {s_mean*1000:.1f}ms ({s_mean/p_mean*100:.1f}%)")
        print(f" └─ RVC: {r_mean*1000:.1f}ms ({r_mean/p_mean*100:.1f}%)")


def _print_summary(results):
    """Print RTFs averaged across all cases and the bottleneck verdict."""
    print("\n\n" + "="*80)
    print(" "*30 + "SUMMARY")
    print("="*80 + "\n")

    all_soprano_rtf = []
    all_rvc_rtf = []
    all_pipeline_rtf = []

    for label in results:
        data = results[label]
        if data['audio_duration'] and data['soprano_times'] and data['rvc_times']:
            s_rtf = data['audio_duration'] / statistics.mean(data['soprano_times'])
            r_rtf = data['audio_duration'] / statistics.mean(data['rvc_times'])
            p_rtf = data['audio_duration'] / statistics.mean(data['pipeline_times'])

            all_soprano_rtf.append(s_rtf)
            all_rvc_rtf.append(r_rtf)
            all_pipeline_rtf.append(p_rtf)

    if all_soprano_rtf:
        print(f" 🎤 Soprano Average RTF: {statistics.mean(all_soprano_rtf):.2f}x")
    if all_rvc_rtf:
        print(f" 🎙️ RVC Average RTF: {statistics.mean(all_rvc_rtf):.2f}x")
    if all_pipeline_rtf:
        print(f" ⚡ Pipeline Average RTF: {statistics.mean(all_pipeline_rtf):.2f}x")

    print(f"\n 💡 BOTTLENECK ANALYSIS:")
    if all_soprano_rtf and all_rvc_rtf:
        soprano_avg = statistics.mean(all_soprano_rtf)
        rvc_avg = statistics.mean(all_rvc_rtf)

        # Lower RTF = slower stage (less audio produced per second of compute).
        if soprano_avg < rvc_avg:
            bottleneck = "Soprano"
            ratio = rvc_avg / soprano_avg
        else:
            bottleneck = "RVC"
            ratio = soprano_avg / rvc_avg

        print(f" └─ {bottleneck} is {ratio:.2f}x slower than the other")

        if bottleneck == "RVC":
            print(f"\n ✅ CONCLUSION: RVC (voice conversion) is the bottleneck, NOT Soprano")
            print(f" - Soprano: {soprano_avg:.2f}x realtime (FAST)")
            print(f" - RVC: {rvc_avg:.2f}x realtime (BOTTLENECK)")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the full benchmark suite (requires a reachable
    # Soprano server and the RVC repo/weights inside the container).
    run_benchmark()
|