add: absorb soprano_to_rvc as regular subdirectory
Voice conversion pipeline (Soprano TTS → RVC) with Docker support. Previously tracked as a bare gitlink; removed .git/ directories and absorbed into the main repo for unified tracking. Includes: Soprano TTS, RVC WebUI integration, Docker configs, WebSocket API, and benchmark scripts. Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index). 287 files added; 3.1GB of ML model weights excluded via .gitignore.
This commit is contained in:
454
soprano_to_rvc/benchmark_complete.py
Normal file
454
soprano_to_rvc/benchmark_complete.py
Normal file
@@ -0,0 +1,454 @@
|
||||
#!/usr/bin/env python3
"""
Complete component benchmark replicating the actual soprano_rvc_api.py pipeline logic.
This ensures accurate RVC-only performance measurement.
"""

import sys
import time
import os
import numpy as np
import zmq
import torch
import json
from collections import defaultdict
import statistics

# NOTE(review): json and defaultdict appear unused in this file — confirm before removing.

# Make the RVC WebUI package importable (container path; see setup_rvc below).
sys.path.insert(0, '/app/Retrieval-based-Voice-Conversion-WebUI')

# Test sentences: (label, text) pairs spanning a range of utterance lengths,
# from a single word up to a long multi-clause sentence.
TEST_CASES = [
    ("tiny", "Hello!"),
    ("short", "Hello, this is a test."),
    ("medium", "The quick brown fox jumps over the lazy dog."),
    ("long", "Artificial intelligence is revolutionizing the way we interact with technology."),
    ("very_long", "Artificial intelligence and machine learning are fundamentally transforming our understanding of human-computer interaction, enabling unprecedented levels of automation and personalization across diverse industries."),
]

# Number of repeated runs per test case (feeds the mean/median/stdev stats).
RUNS_PER_TEST = 5
def setup_soprano_connection():
    """Open a ZeroMQ REQ socket to the Soprano TTS server.

    Returns a connected socket with a 30-second receive timeout; the
    caller owns the socket for the lifetime of the benchmark.
    """
    ctx = zmq.Context()
    sock = ctx.socket(zmq.REQ)
    sock.connect("tcp://soprano:5555")
    # 30s receive timeout so a hung server fails the run instead of blocking forever.
    sock.setsockopt(zmq.RCVTIMEO, 30000)
    return sock
def call_soprano(socket, text):
    """Request TTS audio for *text* from the Soprano server.

    Args:
        socket: a connected REQ-style object exposing send_json()/recv_json().
        text: the sentence to synthesize.

    Returns:
        (audio, elapsed): float32 numpy array of samples (assumed 32kHz —
        see the /32000 duration math in run_benchmark) and the round-trip
        wall-clock time in seconds.

    Raises:
        RuntimeError: if the server response contains an 'error' field.
    """
    import uuid
    job_id = str(uuid.uuid4())

    start = time.time()
    socket.send_json({"job_id": job_id, "text": text})
    response = socket.recv_json()
    elapsed = time.time() - start

    if 'error' in response:
        # Raise RuntimeError instead of bare Exception (idiomatic, and still
        # backward compatible: callers catching Exception still catch this).
        raise RuntimeError(f"Soprano error: {response['error']}")

    audio = np.array(response['audio'], dtype=np.float32)
    return audio, elapsed
def setup_rvc() -> dict:
    """Initialize RVC exactly as soprano_rvc_api.py does.

    Loads the realtime RVC voice-conversion model, creates 32kHz→48kHz and
    32kHz→16kHz resamplers, and pre-allocates the sliding-window input
    buffers consumed by process_audio_through_rvc().

    Returns:
        dict: a context bundle with the model ('rvc'), 'config', 'device',
        both resamplers, the buffer tensors, the block-size constants, and
        'rvc_dir' (needed to re-chdir during inference).

    NOTE(review): os.chdir() mutates process-global state; this is not safe
    if anything else runs concurrently in this process — confirm single-use.
    """
    from infer.lib import rtrvc as rvc_for_realtime
    from configs.config import Config
    from multiprocessing import Queue as MPQueue
    import torchaudio

    # Change to RVC directory (the relative .pth/.index paths below are
    # resolved against the cwd by RVC).
    rvc_dir = '/app/Retrieval-based-Voice-Conversion-WebUI'
    original_dir = os.getcwd()
    os.chdir(rvc_dir)

    try:
        config = Config()
        device = config.device

        # Create queues (handed to RVC for its worker communication)
        inp_q = MPQueue()
        opt_q = MPQueue()

        # Initialize RVC
        rvc = rvc_for_realtime.RVC(
            key=0,  # pitch transpose in semitones (0 = none)
            formant=0,
            pth_path="assets/weights/MikuAI_e210_s6300.pth",
            index_path="assets/indices/added_IVF1811_Flat_nprobe_1_MikuAI_v2.index",
            index_rate=0.5,  # blend weight for the feature index
            n_cpu=4,
            inp_q=inp_q,
            opt_q=opt_q,
            config=config,
            last_rvc=None
        )

        # Optimizations: inference-only, so disable autograd and let cuDNN
        # autotune convolution kernels for the fixed block sizes.
        torch.set_grad_enabled(False)
        torch.backends.cudnn.benchmark = True

        if config.is_half:
            # Run both submodels in fp16 when the config requests half precision.
            rvc.model = rvc.model.half()
            rvc.net_g = rvc.net_g.half()

        # Create resamplers (32kHz Soprano → 48kHz and 16kHz)
        resampler_32_to_48 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=48000
        ).to(device)

        resampler_32_to_16 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=16000
        ).to(device)

        # Prepare RVC buffers
        block_frame = 9600  # 0.2s @ 48kHz
        block_frame_16k = 3200  # 0.2s @ 16kHz
        crossfade_frame = int(block_frame * 0.25)  # 0.05s
        # NOTE(review): 9600 * 0.25 = 2400 samples = 0.05s @ 48kHz — matches comment.
        extra_frame = int(block_frame * 9)  # 1.8s context

        # NOTE(review): this 16kHz buffer is sized using 48kHz-unit frames
        # (extra_frame/crossfade_frame) plus a 16kHz block size — confirm this
        # mirrors soprano_rvc_api.py intentionally.
        input_wav_res = torch.zeros(
            extra_frame + crossfade_frame + block_frame_16k + crossfade_frame,
            device=device, dtype=torch.float32
        )

        input_wav = torch.zeros(
            extra_frame + crossfade_frame + block_frame + crossfade_frame,
            device=device, dtype=torch.float32
        )

        # RVC processing parameters (zc = target_sr // 100 = 48000 // 100 = 480)
        zc = 480  # Window size for RVC processing
        skip_head = extra_frame // zc
        return_length = (block_frame_16k + crossfade_frame) // zc

    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(original_dir)

    # Reached only when the try body succeeded, so every name below is bound.
    return {
        'rvc': rvc,
        'config': config,
        'device': device,
        'resampler_32_to_48': resampler_32_to_48,
        'resampler_32_to_16': resampler_32_to_16,
        'block_frame': block_frame,
        'block_frame_16k': block_frame_16k,
        'crossfade_frame': crossfade_frame,
        'extra_frame': extra_frame,
        'input_wav_res': input_wav_res,
        'input_wav': input_wav,
        'skip_head': skip_head,
        'return_length': return_length,
        'rvc_dir': rvc_dir,
    }
def process_audio_through_rvc(soprano_audio, rvc_ctx):
    """
    Process Soprano audio through RVC pipeline exactly as soprano_rvc_api.py does.
    Returns: (converted_audio, time_taken)

    Args:
        soprano_audio: 1-D numpy array of samples (assumed 32kHz float —
            TODO confirm against the Soprano server contract).
        rvc_ctx: context dict from setup_rvc(). Its 'input_wav' and
            'input_wav_res' tensors are mutated IN PLACE, so window state
            carries over between calls on the same context.

    Returns:
        (full_audio, total_time, block_times): the converted audio as a
        numpy array, the summed per-block inference time in seconds, and
        the list of individual block times.
    """
    device = rvc_ctx['device']
    rvc = rvc_ctx['rvc']

    # Convert to tensor
    soprano_tensor = torch.from_numpy(soprano_audio).to(device).float()

    # Split into chunks (0.1s @ 32kHz)
    chunk_size = 3200
    num_chunks = (len(soprano_tensor) + chunk_size - 1) // chunk_size  # ceil division

    # Accumulation buffers
    acc_buffer_48k = torch.tensor([], device=device, dtype=torch.float32)
    acc_buffer_16k = torch.tensor([], device=device, dtype=torch.float32)

    output_chunks = []
    block_times = []

    # RVC resolves relative paths against the cwd, so switch there for inference.
    original_dir = os.getcwd()
    os.chdir(rvc_ctx['rvc_dir'])

    try:
        for i in range(num_chunks):
            chunk_start = i * chunk_size
            chunk_end = min((i + 1) * chunk_size, len(soprano_tensor))
            soprano_chunk = soprano_tensor[chunk_start:chunk_end]

            # Resample to 48kHz and 16kHz (unsqueeze to (batch, channel, time))
            chunk_48k = rvc_ctx['resampler_32_to_48'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]
            chunk_16k = rvc_ctx['resampler_32_to_16'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]

            # Accumulate
            acc_buffer_48k = torch.cat([acc_buffer_48k, chunk_48k])
            acc_buffer_16k = torch.cat([acc_buffer_16k, chunk_16k])

            # Process blocks when we have enough samples
            # NOTE(review): the loop condition checks only the 48k buffer;
            # the 16k buffer is assumed to stay in lockstep (3:1 ratio) — confirm.
            while acc_buffer_48k.shape[0] >= rvc_ctx['block_frame']:
                block_start = time.time()

                # Take a block
                block_48k = acc_buffer_48k[:rvc_ctx['block_frame']]
                block_16k = acc_buffer_16k[:rvc_ctx['block_frame_16k']]

                acc_buffer_48k = acc_buffer_48k[rvc_ctx['block_frame']:]
                acc_buffer_16k = acc_buffer_16k[rvc_ctx['block_frame_16k']:]

                # Update input buffers (slide window): shift left by one block,
                # append the new block at the tail. .clone() avoids overlapping
                # in-place copy hazards.
                rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
                rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k

                rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
                rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k

                # RVC inference
                infer_wav = rvc.infer(
                    rvc_ctx['input_wav_res'],
                    rvc_ctx['block_frame_16k'],
                    rvc_ctx['skip_head'],
                    rvc_ctx['return_length'],
                    "rmvpe",  # f0method
                )

                # Convert to tensor (rvc.infer may return a numpy array)
                if not torch.is_tensor(infer_wav):
                    infer_wav = torch.from_numpy(infer_wav).to(device)

                output_chunks.append(infer_wav)
                block_times.append(time.time() - block_start)

        # Flush remaining audio
        if acc_buffer_48k.shape[0] > 0:
            block_start = time.time()

            # Pad to block size
            pad_size_48k = rvc_ctx['block_frame'] - acc_buffer_48k.shape[0]
            pad_size_16k = rvc_ctx['block_frame_16k'] - acc_buffer_16k.shape[0]

            padding_48k = torch.zeros(pad_size_48k, device=device, dtype=torch.float32)
            padding_16k = torch.zeros(pad_size_16k, device=device, dtype=torch.float32)

            block_48k = torch.cat([acc_buffer_48k, padding_48k])
            block_16k = torch.cat([acc_buffer_16k, padding_16k])

            # Update buffers (same sliding-window scheme as the main loop)
            rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
            rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k

            rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
            rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k

            # Inference
            infer_wav = rvc.infer(
                rvc_ctx['input_wav_res'],
                rvc_ctx['block_frame_16k'],
                rvc_ctx['skip_head'],
                rvc_ctx['return_length'],
                "rmvpe",
            )

            if not torch.is_tensor(infer_wav):
                infer_wav = torch.from_numpy(infer_wav).to(device)

            # Trim padding
            # NOTE(review): trims by the 48kHz remainder count — assumes
            # infer_wav is at 48kHz so sample counts align; confirm.
            output_samples = acc_buffer_48k.shape[0]
            if output_samples < len(infer_wav):
                infer_wav = infer_wav[:output_samples]

            output_chunks.append(infer_wav)
            block_times.append(time.time() - block_start)

    finally:
        # Restore the caller's working directory even if inference failed.
        os.chdir(original_dir)

    # Concatenate all output chunks
    if output_chunks:
        full_audio = torch.cat(output_chunks).cpu().numpy()
    else:
        full_audio = np.array([])

    return full_audio, sum(block_times), block_times
def run_benchmark():
    """Run the comprehensive Soprano vs RVC benchmark.

    Connects to the Soprano TTS server, loads the RVC model, then for each
    test sentence times (1) Soprano synthesis, (2) RVC conversion of that
    audio, and (3) the combined pipeline, over RUNS_PER_TEST runs. Prints
    per-case statistics, a Soprano/RVC time breakdown, and an overall
    bottleneck analysis.

    Side effects: prints to stdout; opens a ZeroMQ connection; loads the
    RVC model onto the configured device.
    """
    print("\n" + "="*80)
    print(" "*20 + "COMPREHENSIVE SOPRANO vs RVC BENCHMARK")
    print("="*80 + "\n")

    soprano_socket = setup_soprano_connection()
    print("✅ Connected to Soprano server")

    print("\n📦 Loading RVC model...")
    rvc_start = time.time()
    rvc_ctx = setup_rvc()
    print(f"✅ RVC loaded in {time.time() - rvc_start:.2f}s\n")

    results = {}

    for label, text in TEST_CASES:
        print(f"\n{'='*80}")
        print(f"Testing: {label.upper()}")
        print(f"Text: \"{text}\"")
        print('='*80)

        results[label] = {
            'soprano_times': [],
            'rvc_times': [],
            'rvc_block_times': [],
            'pipeline_times': [],
            'audio_duration': None,
            'num_blocks': [],
        }

        for run in range(RUNS_PER_TEST):
            print(f"\n Run {run+1}/{RUNS_PER_TEST}:")

            # Test 1: Soprano only
            try:
                soprano_audio, soprano_time = call_soprano(soprano_socket, text)
                audio_duration = len(soprano_audio) / 32000  # Soprano emits 32kHz samples
                results[label]['soprano_times'].append(soprano_time)
                results[label]['audio_duration'] = audio_duration

                soprano_rtf = audio_duration / soprano_time if soprano_time > 0 else 0
                print(f" 🎤 Soprano: {soprano_time*1000:.1f}ms → {audio_duration:.2f}s audio (RTF: {soprano_rtf:.2f}x)")

                # Test 2: RVC only (using cached Soprano output from run 1)
                rvc_output, rvc_time, block_times = process_audio_through_rvc(soprano_audio, rvc_ctx)
                results[label]['rvc_times'].append(rvc_time)
                results[label]['rvc_block_times'].extend(block_times)
                results[label]['num_blocks'].append(len(block_times))

                rvc_rtf = audio_duration / rvc_time if rvc_time > 0 else 0
                avg_block_time = statistics.mean(block_times) if block_times else 0
                print(f" 🎙️ RVC: {rvc_time*1000:.1f}ms ({len(block_times)} blocks, avg {avg_block_time*1000:.1f}ms/block, RTF: {rvc_rtf:.2f}x)")

                # Test 3: Full pipeline time
                pipeline_time = soprano_time + rvc_time
                results[label]['pipeline_times'].append(pipeline_time)

                pipeline_rtf = audio_duration / pipeline_time if pipeline_time > 0 else 0
                print(f" ⚡ Pipeline: {pipeline_time*1000:.1f}ms (RTF: {pipeline_rtf:.2f}x)")

            except Exception as e:
                print(f" ERROR: {e}")

    # Print comprehensive analysis
    print("\n\n" + "="*80)
    print(" "*25 + "PERFORMANCE ANALYSIS")
    print("="*80 + "\n")

    for label, text in TEST_CASES:
        data = results[label]

        if not data['soprano_times']:
            continue

        print(f"\n{'─'*80}")
        print(f"📊 {label.upper()}: \"{text[:60]}{'...' if len(text) > 60 else ''}\"")
        print('─'*80)

        audio_dur = data['audio_duration']
        print(f"\n 🎵 Audio Duration: {audio_dur:.2f}s")

        # Soprano stats (soprano_times is non-empty past the guard above)
        s_mean = statistics.mean(data['soprano_times'])
        s_median = statistics.median(data['soprano_times'])
        s_std = statistics.stdev(data['soprano_times']) if len(data['soprano_times']) > 1 else 0
        s_rtf = audio_dur / s_mean if s_mean > 0 else 0

        print(f"\n 🎤 SOPRANO:")
        print(f" ├─ Mean: {s_mean*1000:.1f}ms")
        print(f" ├─ Median: {s_median*1000:.1f}ms")
        print(f" ├─ Std Dev: {s_std*1000:.1f}ms")
        print(f" ├─ Range: {min(data['soprano_times'])*1000:.1f} - {max(data['soprano_times'])*1000:.1f}ms")
        print(f" └─ RTF: {s_rtf:.2f}x")

        # BUG FIX: rvc_times (and pipeline_times) can be empty when RVC failed
        # on every run after Soprano succeeded; statistics.mean([]) raises
        # StatisticsError. Report and skip instead of crashing the analysis.
        if not data['rvc_times'] or not data['pipeline_times']:
            print(f"\n 🎙️ RVC: no successful runs")
            continue

        # RVC stats
        r_mean = statistics.mean(data['rvc_times'])
        r_median = statistics.median(data['rvc_times'])
        r_std = statistics.stdev(data['rvc_times']) if len(data['rvc_times']) > 1 else 0
        r_rtf = audio_dur / r_mean if r_mean > 0 else 0

        avg_blocks = statistics.mean(data['num_blocks'])
        avg_block_time = statistics.mean(data['rvc_block_times']) if data['rvc_block_times'] else 0

        print(f"\n 🎙️ RVC:")
        print(f" ├─ Mean: {r_mean*1000:.1f}ms")
        print(f" ├─ Median: {r_median*1000:.1f}ms")
        print(f" ├─ Std Dev: {r_std*1000:.1f}ms")
        print(f" ├─ Range: {min(data['rvc_times'])*1000:.1f} - {max(data['rvc_times'])*1000:.1f}ms")
        print(f" ├─ Avg blocks: {avg_blocks:.1f}")
        print(f" ├─ Avg per block: {avg_block_time*1000:.1f}ms")
        print(f" └─ RTF: {r_rtf:.2f}x")

        # Pipeline stats
        p_mean = statistics.mean(data['pipeline_times'])
        p_rtf = audio_dur / p_mean if p_mean > 0 else 0

        print(f"\n ⚡ FULL PIPELINE:")
        print(f" ├─ Mean: {p_mean*1000:.1f}ms")
        print(f" └─ RTF: {p_rtf:.2f}x")

        # Breakdown (share of pipeline time per stage)
        print(f"\n 📈 BREAKDOWN:")
        print(f" ├─ Soprano: {s_mean*1000:.1f}ms ({s_mean/p_mean*100:.1f}%)")
        print(f" └─ RVC: {r_mean*1000:.1f}ms ({r_mean/p_mean*100:.1f}%)")

    # Summary
    print("\n\n" + "="*80)
    print(" "*30 + "SUMMARY")
    print("="*80 + "\n")

    all_soprano_rtf = []
    all_rvc_rtf = []
    all_pipeline_rtf = []

    for label in results:
        data = results[label]
        if data['audio_duration'] and data['soprano_times'] and data['rvc_times']:
            s_rtf = data['audio_duration'] / statistics.mean(data['soprano_times'])
            r_rtf = data['audio_duration'] / statistics.mean(data['rvc_times'])
            p_rtf = data['audio_duration'] / statistics.mean(data['pipeline_times'])

            all_soprano_rtf.append(s_rtf)
            all_rvc_rtf.append(r_rtf)
            all_pipeline_rtf.append(p_rtf)

    if all_soprano_rtf:
        print(f" 🎤 Soprano Average RTF: {statistics.mean(all_soprano_rtf):.2f}x")
    if all_rvc_rtf:
        print(f" 🎙️ RVC Average RTF: {statistics.mean(all_rvc_rtf):.2f}x")
    if all_pipeline_rtf:
        print(f" ⚡ Pipeline Average RTF: {statistics.mean(all_pipeline_rtf):.2f}x")

    print(f"\n 💡 BOTTLENECK ANALYSIS:")
    if all_soprano_rtf and all_rvc_rtf:
        soprano_avg = statistics.mean(all_soprano_rtf)
        rvc_avg = statistics.mean(all_rvc_rtf)

        # Lower RTF = slower relative to realtime, so the smaller average
        # identifies the bottleneck stage.
        if soprano_avg < rvc_avg:
            bottleneck = "Soprano"
            ratio = rvc_avg / soprano_avg
        else:
            bottleneck = "RVC"
            ratio = soprano_avg / rvc_avg

        print(f" └─ {bottleneck} is {ratio:.2f}x slower than the other")

        if bottleneck == "RVC":
            print(f"\n ✅ CONCLUSION: RVC (voice conversion) is the bottleneck, NOT Soprano")
            print(f" - Soprano: {soprano_avg:.2f}x realtime (FAST)")
            print(f" - RVC: {rvc_avg:.2f}x realtime (BOTTLENECK)")

    print("\n" + "="*80)
    print(" "*28 + "BENCHMARK COMPLETE")
    print("="*80 + "\n")
# Script entry point: run the full Soprano vs RVC benchmark suite.
if __name__ == "__main__":
    run_benchmark()
Reference in New Issue
Block a user