add: absorb soprano_to_rvc as regular subdirectory
Voice conversion pipeline (Soprano TTS → RVC) with Docker support. Previously tracked as a bare gitlink; removed .git/ directories and absorbed into the main repo for unified tracking. Includes: Soprano TTS, RVC WebUI integration, Docker configs, WebSocket API, and benchmark scripts. Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index). 287 files added; 3.1GB of ML model weights excluded via .gitignore.
This commit is contained in:
454
soprano_to_rvc/benchmark_complete.py
Normal file
454
soprano_to_rvc/benchmark_complete.py
Normal file
@@ -0,0 +1,454 @@
|
||||
#!/usr/bin/env python3
"""
Complete component benchmark replicating the actual soprano_rvc_api.py pipeline logic.
This ensures accurate RVC-only performance measurement.
"""

import sys
import time
import os
import numpy as np
import zmq
import torch
import json
from collections import defaultdict
import statistics

# NOTE(review): json and defaultdict appear unused in this file — confirm before removing.

# Make the RVC WebUI package importable (container path; see setup_rvc below).
sys.path.insert(0, '/app/Retrieval-based-Voice-Conversion-WebUI')

# Test sentences: (label, text) pairs spanning a range of utterance lengths,
# from a single word up to a long multi-clause sentence.
TEST_CASES = [
    ("tiny", "Hello!"),
    ("short", "Hello, this is a test."),
    ("medium", "The quick brown fox jumps over the lazy dog."),
    ("long", "Artificial intelligence is revolutionizing the way we interact with technology."),
    ("very_long", "Artificial intelligence and machine learning are fundamentally transforming our understanding of human-computer interaction, enabling unprecedented levels of automation and personalization across diverse industries."),
]

# Number of repeated runs per test case (feeds the mean/median/stdev stats).
RUNS_PER_TEST = 5
def setup_soprano_connection():
    """Open a ZeroMQ REQ socket to the Soprano TTS server.

    Returns a connected socket with a 30-second receive timeout; the
    caller owns the socket for the lifetime of the benchmark.
    """
    ctx = zmq.Context()
    sock = ctx.socket(zmq.REQ)
    sock.connect("tcp://soprano:5555")
    # 30s receive timeout so a hung server fails the run instead of blocking forever.
    sock.setsockopt(zmq.RCVTIMEO, 30000)
    return sock
def call_soprano(socket, text):
    """Request TTS audio for *text* from the Soprano server.

    Args:
        socket: a connected REQ-style object exposing send_json()/recv_json().
        text: the sentence to synthesize.

    Returns:
        (audio, elapsed): float32 numpy array of samples (assumed 32kHz —
        see the /32000 duration math in run_benchmark) and the round-trip
        wall-clock time in seconds.

    Raises:
        RuntimeError: if the server response contains an 'error' field.
    """
    import uuid
    job_id = str(uuid.uuid4())

    start = time.time()
    socket.send_json({"job_id": job_id, "text": text})
    response = socket.recv_json()
    elapsed = time.time() - start

    if 'error' in response:
        # Raise RuntimeError instead of bare Exception (idiomatic, and still
        # backward compatible: callers catching Exception still catch this).
        raise RuntimeError(f"Soprano error: {response['error']}")

    audio = np.array(response['audio'], dtype=np.float32)
    return audio, elapsed
def setup_rvc() -> dict:
    """Initialize RVC exactly as soprano_rvc_api.py does.

    Loads the realtime RVC voice-conversion model, creates 32kHz→48kHz and
    32kHz→16kHz resamplers, and pre-allocates the sliding-window input
    buffers consumed by process_audio_through_rvc().

    Returns:
        dict: a context bundle with the model ('rvc'), 'config', 'device',
        both resamplers, the buffer tensors, the block-size constants, and
        'rvc_dir' (needed to re-chdir during inference).

    NOTE(review): os.chdir() mutates process-global state; this is not safe
    if anything else runs concurrently in this process — confirm single-use.
    """
    from infer.lib import rtrvc as rvc_for_realtime
    from configs.config import Config
    from multiprocessing import Queue as MPQueue
    import torchaudio

    # Change to RVC directory (the relative .pth/.index paths below are
    # resolved against the cwd by RVC).
    rvc_dir = '/app/Retrieval-based-Voice-Conversion-WebUI'
    original_dir = os.getcwd()
    os.chdir(rvc_dir)

    try:
        config = Config()
        device = config.device

        # Create queues (handed to RVC for its worker communication)
        inp_q = MPQueue()
        opt_q = MPQueue()

        # Initialize RVC
        rvc = rvc_for_realtime.RVC(
            key=0,  # pitch transpose in semitones (0 = none)
            formant=0,
            pth_path="assets/weights/MikuAI_e210_s6300.pth",
            index_path="assets/indices/added_IVF1811_Flat_nprobe_1_MikuAI_v2.index",
            index_rate=0.5,  # blend weight for the feature index
            n_cpu=4,
            inp_q=inp_q,
            opt_q=opt_q,
            config=config,
            last_rvc=None
        )

        # Optimizations: inference-only, so disable autograd and let cuDNN
        # autotune convolution kernels for the fixed block sizes.
        torch.set_grad_enabled(False)
        torch.backends.cudnn.benchmark = True

        if config.is_half:
            # Run both submodels in fp16 when the config requests half precision.
            rvc.model = rvc.model.half()
            rvc.net_g = rvc.net_g.half()

        # Create resamplers (32kHz Soprano → 48kHz and 16kHz)
        resampler_32_to_48 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=48000
        ).to(device)

        resampler_32_to_16 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=16000
        ).to(device)

        # Prepare RVC buffers
        block_frame = 9600  # 0.2s @ 48kHz
        block_frame_16k = 3200  # 0.2s @ 16kHz
        crossfade_frame = int(block_frame * 0.25)  # 0.05s
        # NOTE(review): 9600 * 0.25 = 2400 samples = 0.05s @ 48kHz — matches comment.
        extra_frame = int(block_frame * 9)  # 1.8s context

        # NOTE(review): this 16kHz buffer is sized using 48kHz-unit frames
        # (extra_frame/crossfade_frame) plus a 16kHz block size — confirm this
        # mirrors soprano_rvc_api.py intentionally.
        input_wav_res = torch.zeros(
            extra_frame + crossfade_frame + block_frame_16k + crossfade_frame,
            device=device, dtype=torch.float32
        )

        input_wav = torch.zeros(
            extra_frame + crossfade_frame + block_frame + crossfade_frame,
            device=device, dtype=torch.float32
        )

        # RVC processing parameters (zc = target_sr // 100 = 48000 // 100 = 480)
        zc = 480  # Window size for RVC processing
        skip_head = extra_frame // zc
        return_length = (block_frame_16k + crossfade_frame) // zc

    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(original_dir)

    # Reached only when the try body succeeded, so every name below is bound.
    return {
        'rvc': rvc,
        'config': config,
        'device': device,
        'resampler_32_to_48': resampler_32_to_48,
        'resampler_32_to_16': resampler_32_to_16,
        'block_frame': block_frame,
        'block_frame_16k': block_frame_16k,
        'crossfade_frame': crossfade_frame,
        'extra_frame': extra_frame,
        'input_wav_res': input_wav_res,
        'input_wav': input_wav,
        'skip_head': skip_head,
        'return_length': return_length,
        'rvc_dir': rvc_dir,
    }
def process_audio_through_rvc(soprano_audio, rvc_ctx):
    """
    Process Soprano audio through RVC pipeline exactly as soprano_rvc_api.py does.
    Returns: (converted_audio, time_taken)

    Args:
        soprano_audio: 1-D numpy array of samples (assumed 32kHz float —
            TODO confirm against the Soprano server contract).
        rvc_ctx: context dict from setup_rvc(). Its 'input_wav' and
            'input_wav_res' tensors are mutated IN PLACE, so window state
            carries over between calls on the same context.

    Returns:
        (full_audio, total_time, block_times): the converted audio as a
        numpy array, the summed per-block inference time in seconds, and
        the list of individual block times.
    """
    device = rvc_ctx['device']
    rvc = rvc_ctx['rvc']

    # Convert to tensor
    soprano_tensor = torch.from_numpy(soprano_audio).to(device).float()

    # Split into chunks (0.1s @ 32kHz)
    chunk_size = 3200
    num_chunks = (len(soprano_tensor) + chunk_size - 1) // chunk_size  # ceil division

    # Accumulation buffers
    acc_buffer_48k = torch.tensor([], device=device, dtype=torch.float32)
    acc_buffer_16k = torch.tensor([], device=device, dtype=torch.float32)

    output_chunks = []
    block_times = []

    # RVC resolves relative paths against the cwd, so switch there for inference.
    original_dir = os.getcwd()
    os.chdir(rvc_ctx['rvc_dir'])

    try:
        for i in range(num_chunks):
            chunk_start = i * chunk_size
            chunk_end = min((i + 1) * chunk_size, len(soprano_tensor))
            soprano_chunk = soprano_tensor[chunk_start:chunk_end]

            # Resample to 48kHz and 16kHz (unsqueeze to (batch, channel, time))
            chunk_48k = rvc_ctx['resampler_32_to_48'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]
            chunk_16k = rvc_ctx['resampler_32_to_16'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]

            # Accumulate
            acc_buffer_48k = torch.cat([acc_buffer_48k, chunk_48k])
            acc_buffer_16k = torch.cat([acc_buffer_16k, chunk_16k])

            # Process blocks when we have enough samples
            # NOTE(review): the loop condition checks only the 48k buffer;
            # the 16k buffer is assumed to stay in lockstep (3:1 ratio) — confirm.
            while acc_buffer_48k.shape[0] >= rvc_ctx['block_frame']:
                block_start = time.time()

                # Take a block
                block_48k = acc_buffer_48k[:rvc_ctx['block_frame']]
                block_16k = acc_buffer_16k[:rvc_ctx['block_frame_16k']]

                acc_buffer_48k = acc_buffer_48k[rvc_ctx['block_frame']:]
                acc_buffer_16k = acc_buffer_16k[rvc_ctx['block_frame_16k']:]

                # Update input buffers (slide window): shift left by one block,
                # append the new block at the tail. .clone() avoids overlapping
                # in-place copy hazards.
                rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
                rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k

                rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
                rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k

                # RVC inference
                infer_wav = rvc.infer(
                    rvc_ctx['input_wav_res'],
                    rvc_ctx['block_frame_16k'],
                    rvc_ctx['skip_head'],
                    rvc_ctx['return_length'],
                    "rmvpe",  # f0method
                )

                # Convert to tensor (rvc.infer may return a numpy array)
                if not torch.is_tensor(infer_wav):
                    infer_wav = torch.from_numpy(infer_wav).to(device)

                output_chunks.append(infer_wav)
                block_times.append(time.time() - block_start)

        # Flush remaining audio
        if acc_buffer_48k.shape[0] > 0:
            block_start = time.time()

            # Pad to block size
            pad_size_48k = rvc_ctx['block_frame'] - acc_buffer_48k.shape[0]
            pad_size_16k = rvc_ctx['block_frame_16k'] - acc_buffer_16k.shape[0]

            padding_48k = torch.zeros(pad_size_48k, device=device, dtype=torch.float32)
            padding_16k = torch.zeros(pad_size_16k, device=device, dtype=torch.float32)

            block_48k = torch.cat([acc_buffer_48k, padding_48k])
            block_16k = torch.cat([acc_buffer_16k, padding_16k])

            # Update buffers (same sliding-window scheme as the main loop)
            rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
            rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k

            rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
            rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k

            # Inference
            infer_wav = rvc.infer(
                rvc_ctx['input_wav_res'],
                rvc_ctx['block_frame_16k'],
                rvc_ctx['skip_head'],
                rvc_ctx['return_length'],
                "rmvpe",
            )

            if not torch.is_tensor(infer_wav):
                infer_wav = torch.from_numpy(infer_wav).to(device)

            # Trim padding
            # NOTE(review): trims by the 48kHz remainder count — assumes
            # infer_wav is at 48kHz so sample counts align; confirm.
            output_samples = acc_buffer_48k.shape[0]
            if output_samples < len(infer_wav):
                infer_wav = infer_wav[:output_samples]

            output_chunks.append(infer_wav)
            block_times.append(time.time() - block_start)

    finally:
        # Restore the caller's working directory even if inference failed.
        os.chdir(original_dir)

    # Concatenate all output chunks
    if output_chunks:
        full_audio = torch.cat(output_chunks).cpu().numpy()
    else:
        full_audio = np.array([])

    return full_audio, sum(block_times), block_times
def run_benchmark():
    """Run the comprehensive Soprano vs RVC benchmark.

    Connects to the Soprano TTS server, loads the RVC model, then for each
    test sentence times (1) Soprano synthesis, (2) RVC conversion of that
    audio, and (3) the combined pipeline, over RUNS_PER_TEST runs. Prints
    per-case statistics, a Soprano/RVC time breakdown, and an overall
    bottleneck analysis.

    Side effects: prints to stdout; opens a ZeroMQ connection; loads the
    RVC model onto the configured device.
    """
    print("\n" + "="*80)
    print(" "*20 + "COMPREHENSIVE SOPRANO vs RVC BENCHMARK")
    print("="*80 + "\n")

    soprano_socket = setup_soprano_connection()
    print("✅ Connected to Soprano server")

    print("\n📦 Loading RVC model...")
    rvc_start = time.time()
    rvc_ctx = setup_rvc()
    print(f"✅ RVC loaded in {time.time() - rvc_start:.2f}s\n")

    results = {}

    for label, text in TEST_CASES:
        print(f"\n{'='*80}")
        print(f"Testing: {label.upper()}")
        print(f"Text: \"{text}\"")
        print('='*80)

        results[label] = {
            'soprano_times': [],
            'rvc_times': [],
            'rvc_block_times': [],
            'pipeline_times': [],
            'audio_duration': None,
            'num_blocks': [],
        }

        for run in range(RUNS_PER_TEST):
            print(f"\n Run {run+1}/{RUNS_PER_TEST}:")

            # Test 1: Soprano only
            try:
                soprano_audio, soprano_time = call_soprano(soprano_socket, text)
                audio_duration = len(soprano_audio) / 32000  # Soprano emits 32kHz samples
                results[label]['soprano_times'].append(soprano_time)
                results[label]['audio_duration'] = audio_duration

                soprano_rtf = audio_duration / soprano_time if soprano_time > 0 else 0
                print(f" 🎤 Soprano: {soprano_time*1000:.1f}ms → {audio_duration:.2f}s audio (RTF: {soprano_rtf:.2f}x)")

                # Test 2: RVC only (using cached Soprano output from run 1)
                rvc_output, rvc_time, block_times = process_audio_through_rvc(soprano_audio, rvc_ctx)
                results[label]['rvc_times'].append(rvc_time)
                results[label]['rvc_block_times'].extend(block_times)
                results[label]['num_blocks'].append(len(block_times))

                rvc_rtf = audio_duration / rvc_time if rvc_time > 0 else 0
                avg_block_time = statistics.mean(block_times) if block_times else 0
                print(f" 🎙️ RVC: {rvc_time*1000:.1f}ms ({len(block_times)} blocks, avg {avg_block_time*1000:.1f}ms/block, RTF: {rvc_rtf:.2f}x)")

                # Test 3: Full pipeline time
                pipeline_time = soprano_time + rvc_time
                results[label]['pipeline_times'].append(pipeline_time)

                pipeline_rtf = audio_duration / pipeline_time if pipeline_time > 0 else 0
                print(f" ⚡ Pipeline: {pipeline_time*1000:.1f}ms (RTF: {pipeline_rtf:.2f}x)")

            except Exception as e:
                print(f" ERROR: {e}")

    # Print comprehensive analysis
    print("\n\n" + "="*80)
    print(" "*25 + "PERFORMANCE ANALYSIS")
    print("="*80 + "\n")

    for label, text in TEST_CASES:
        data = results[label]

        if not data['soprano_times']:
            continue

        print(f"\n{'─'*80}")
        print(f"📊 {label.upper()}: \"{text[:60]}{'...' if len(text) > 60 else ''}\"")
        print('─'*80)

        audio_dur = data['audio_duration']
        print(f"\n 🎵 Audio Duration: {audio_dur:.2f}s")

        # Soprano stats (soprano_times is non-empty past the guard above)
        s_mean = statistics.mean(data['soprano_times'])
        s_median = statistics.median(data['soprano_times'])
        s_std = statistics.stdev(data['soprano_times']) if len(data['soprano_times']) > 1 else 0
        s_rtf = audio_dur / s_mean if s_mean > 0 else 0

        print(f"\n 🎤 SOPRANO:")
        print(f" ├─ Mean: {s_mean*1000:.1f}ms")
        print(f" ├─ Median: {s_median*1000:.1f}ms")
        print(f" ├─ Std Dev: {s_std*1000:.1f}ms")
        print(f" ├─ Range: {min(data['soprano_times'])*1000:.1f} - {max(data['soprano_times'])*1000:.1f}ms")
        print(f" └─ RTF: {s_rtf:.2f}x")

        # BUG FIX: rvc_times (and pipeline_times) can be empty when RVC failed
        # on every run after Soprano succeeded; statistics.mean([]) raises
        # StatisticsError. Report and skip instead of crashing the analysis.
        if not data['rvc_times'] or not data['pipeline_times']:
            print(f"\n 🎙️ RVC: no successful runs")
            continue

        # RVC stats
        r_mean = statistics.mean(data['rvc_times'])
        r_median = statistics.median(data['rvc_times'])
        r_std = statistics.stdev(data['rvc_times']) if len(data['rvc_times']) > 1 else 0
        r_rtf = audio_dur / r_mean if r_mean > 0 else 0

        avg_blocks = statistics.mean(data['num_blocks'])
        avg_block_time = statistics.mean(data['rvc_block_times']) if data['rvc_block_times'] else 0

        print(f"\n 🎙️ RVC:")
        print(f" ├─ Mean: {r_mean*1000:.1f}ms")
        print(f" ├─ Median: {r_median*1000:.1f}ms")
        print(f" ├─ Std Dev: {r_std*1000:.1f}ms")
        print(f" ├─ Range: {min(data['rvc_times'])*1000:.1f} - {max(data['rvc_times'])*1000:.1f}ms")
        print(f" ├─ Avg blocks: {avg_blocks:.1f}")
        print(f" ├─ Avg per block: {avg_block_time*1000:.1f}ms")
        print(f" └─ RTF: {r_rtf:.2f}x")

        # Pipeline stats
        p_mean = statistics.mean(data['pipeline_times'])
        p_rtf = audio_dur / p_mean if p_mean > 0 else 0

        print(f"\n ⚡ FULL PIPELINE:")
        print(f" ├─ Mean: {p_mean*1000:.1f}ms")
        print(f" └─ RTF: {p_rtf:.2f}x")

        # Breakdown (share of pipeline time per stage)
        print(f"\n 📈 BREAKDOWN:")
        print(f" ├─ Soprano: {s_mean*1000:.1f}ms ({s_mean/p_mean*100:.1f}%)")
        print(f" └─ RVC: {r_mean*1000:.1f}ms ({r_mean/p_mean*100:.1f}%)")

    # Summary
    print("\n\n" + "="*80)
    print(" "*30 + "SUMMARY")
    print("="*80 + "\n")

    all_soprano_rtf = []
    all_rvc_rtf = []
    all_pipeline_rtf = []

    for label in results:
        data = results[label]
        if data['audio_duration'] and data['soprano_times'] and data['rvc_times']:
            s_rtf = data['audio_duration'] / statistics.mean(data['soprano_times'])
            r_rtf = data['audio_duration'] / statistics.mean(data['rvc_times'])
            p_rtf = data['audio_duration'] / statistics.mean(data['pipeline_times'])

            all_soprano_rtf.append(s_rtf)
            all_rvc_rtf.append(r_rtf)
            all_pipeline_rtf.append(p_rtf)

    if all_soprano_rtf:
        print(f" 🎤 Soprano Average RTF: {statistics.mean(all_soprano_rtf):.2f}x")
    if all_rvc_rtf:
        print(f" 🎙️ RVC Average RTF: {statistics.mean(all_rvc_rtf):.2f}x")
    if all_pipeline_rtf:
        print(f" ⚡ Pipeline Average RTF: {statistics.mean(all_pipeline_rtf):.2f}x")

    print(f"\n 💡 BOTTLENECK ANALYSIS:")
    if all_soprano_rtf and all_rvc_rtf:
        soprano_avg = statistics.mean(all_soprano_rtf)
        rvc_avg = statistics.mean(all_rvc_rtf)

        # Lower RTF = slower relative to realtime, so the smaller average
        # identifies the bottleneck stage.
        if soprano_avg < rvc_avg:
            bottleneck = "Soprano"
            ratio = rvc_avg / soprano_avg
        else:
            bottleneck = "RVC"
            ratio = soprano_avg / rvc_avg

        print(f" └─ {bottleneck} is {ratio:.2f}x slower than the other")

        if bottleneck == "RVC":
            print(f"\n ✅ CONCLUSION: RVC (voice conversion) is the bottleneck, NOT Soprano")
            print(f" - Soprano: {soprano_avg:.2f}x realtime (FAST)")
            print(f" - RVC: {rvc_avg:.2f}x realtime (BOTTLENECK)")

    print("\n" + "="*80)
    print(" "*28 + "BENCHMARK COMPLETE")
    print("="*80 + "\n")
# Script entry point: run the full Soprano vs RVC benchmark suite.
if __name__ == "__main__":
    run_benchmark()
Reference in New Issue
Block a user