Voice conversion pipeline (Soprano TTS → RVC) with Docker support. Previously tracked as bare gitlink; removed .git/ directories and absorbed into main repo for unified tracking. Includes: Soprano TTS, RVC WebUI integration, Docker configs, WebSocket API, and benchmark scripts. Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index). 287 files (3.1GB of ML weights properly excluded via gitignore).
455 lines
17 KiB
Python
455 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Complete component benchmark replicating the actual soprano_rvc_api.py pipeline logic.
|
|
This ensures accurate RVC-only performance measurement.
|
|
"""
|
|
|
|
import sys
|
|
import time
|
|
import os
|
|
import numpy as np
|
|
import zmq
|
|
import torch
|
|
import json
|
|
from collections import defaultdict
|
|
import statistics
|
|
|
|
sys.path.insert(0, '/app/Retrieval-based-Voice-Conversion-WebUI')
|
|
|
|
# Test sentences of increasing length, used to observe how Soprano and RVC
# latency scale with utterance size (tiny ≈ 1 word up to very_long ≈ 30 words).
TEST_CASES = [
    ("tiny", "Hello!"),
    ("short", "Hello, this is a test."),
    ("medium", "The quick brown fox jumps over the lazy dog."),
    ("long", "Artificial intelligence is revolutionizing the way we interact with technology."),
    ("very_long", "Artificial intelligence and machine learning are fundamentally transforming our understanding of human-computer interaction, enabling unprecedented levels of automation and personalization across diverse industries."),
]

# Number of repeated runs per test case; results are averaged in the analysis.
RUNS_PER_TEST = 5
|
|
|
|
|
|
def setup_soprano_connection():
    """Create a ZeroMQ REQ socket connected to the Soprano TTS server.

    Returns:
        zmq.Socket: REQ socket bound to tcp://soprano:5555 with 30 s
        send/receive timeouts.
    """
    zmq_context = zmq.Context()
    soprano_socket = zmq_context.socket(zmq.REQ)
    soprano_socket.connect("tcp://soprano:5555")
    # Fail fast instead of blocking forever when the server stalls.
    soprano_socket.setsockopt(zmq.RCVTIMEO, 30000)
    soprano_socket.setsockopt(zmq.SNDTIMEO, 30000)
    # pyzmq's default LINGER is infinite: without this, process shutdown can
    # hang on an unsent request if the server is unreachable.
    soprano_socket.setsockopt(zmq.LINGER, 0)
    return soprano_socket
|
|
|
|
|
|
def call_soprano(socket, text):
    """Synthesize *text* through the Soprano server and time the round trip.

    Args:
        socket: connected REQ-style socket exposing send_json / recv_json.
        text: utterance to synthesize.

    Returns:
        (audio, elapsed): float32 numpy array of samples (32 kHz per the
        callers in this file) and the request round-trip time in seconds.

    Raises:
        RuntimeError: if the server response contains an 'error' field.
    """
    import uuid
    # Unique job id so the server can correlate/track this request.
    job_id = str(uuid.uuid4())

    # perf_counter is monotonic: immune to wall-clock adjustments that would
    # corrupt a time.time()-based interval measurement.
    start = time.perf_counter()
    socket.send_json({"job_id": job_id, "text": text})
    response = socket.recv_json()
    elapsed = time.perf_counter() - start

    if 'error' in response:
        raise RuntimeError(f"Soprano error: {response['error']}")

    audio = np.array(response['audio'], dtype=np.float32)
    return audio, elapsed
|
|
|
|
|
|
def setup_rvc():
    """Initialize RVC exactly as soprano_rvc_api.py does.

    Loads the RVC model + retrieval index, builds the 32k→48k and 32k→16k
    resamplers, and allocates the sliding-window input buffers used by the
    streaming conversion path.

    Returns:
        dict: context consumed by process_audio_through_rvc() — model handle,
        device, resamplers, block/crossfade/extra frame sizes, pre-allocated
        input buffers, and skip_head/return_length inference parameters.
    """
    # Project-local imports: only resolvable inside the RVC container image
    # (sys.path is extended to the RVC repo at module import time).
    from infer.lib import rtrvc as rvc_for_realtime
    from configs.config import Config
    from multiprocessing import Queue as MPQueue
    import torchaudio

    # Change to RVC directory — Config and RVC resolve the relative
    # weight/index paths below against the repo root.
    rvc_dir = '/app/Retrieval-based-Voice-Conversion-WebUI'
    original_dir = os.getcwd()
    os.chdir(rvc_dir)

    try:
        config = Config()
        device = config.device  # device string chosen by Config (GPU or CPU)

        # Create queues.
        # NOTE(review): presumably consumed by RVC's internal worker processes
        # (n_cpu=4 below) for parallel feature work — confirm against rtrvc.
        inp_q = MPQueue()
        opt_q = MPQueue()

        # Initialize RVC
        rvc = rvc_for_realtime.RVC(
            key=0,          # pitch shift in semitones (0 = no transpose)
            formant=0,      # formant shift disabled
            pth_path="assets/weights/MikuAI_e210_s6300.pth",
            index_path="assets/indices/added_IVF1811_Flat_nprobe_1_MikuAI_v2.index",
            index_rate=0.5, # blend between index retrieval and raw features
            n_cpu=4,
            inp_q=inp_q,
            opt_q=opt_q,
            config=config,
            last_rvc=None   # no previous instance to reuse state from
        )

        # Optimizations: inference-only, so disable autograd globally and let
        # cuDNN autotune kernels for the fixed block sizes used below.
        torch.set_grad_enabled(False)
        torch.backends.cudnn.benchmark = True

        if config.is_half:
            # Cast to fp16 when Config decided the device supports it.
            rvc.model = rvc.model.half()
            rvc.net_g = rvc.net_g.half()

        # Create resamplers (32kHz Soprano → 48kHz and 16kHz)
        resampler_32_to_48 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=48000
        ).to(device)

        resampler_32_to_16 = torchaudio.transforms.Resample(
            orig_freq=32000, new_freq=16000
        ).to(device)

        # Prepare RVC buffers
        block_frame = 9600  # 0.2s @ 48kHz
        block_frame_16k = 3200  # 0.2s @ 16kHz
        crossfade_frame = int(block_frame * 0.25)  # 0.05s
        extra_frame = int(block_frame * 9)  # 1.8s context

        # NOTE(review): this size mixes 48 kHz-domain frame counts
        # (extra/crossfade) with the 16 kHz block size — it mirrors the
        # upstream realtime-GUI buffer layout; confirm against
        # soprano_rvc_api.py before changing.
        input_wav_res = torch.zeros(
            extra_frame + crossfade_frame + block_frame_16k + crossfade_frame,
            device=device, dtype=torch.float32
        )

        # 48 kHz sliding context window (1.8s context + crossfades + block).
        input_wav = torch.zeros(
            extra_frame + crossfade_frame + block_frame + crossfade_frame,
            device=device, dtype=torch.float32
        )

        # RVC processing parameters (zc = target_sr // 100 = 48000 // 100 = 480)
        zc = 480  # Window size for RVC processing
        skip_head = extra_frame // zc          # context windows to discard from output
        return_length = (block_frame_16k + crossfade_frame) // zc  # windows to return

    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(original_dir)

    # Reached only on success (an exception above propagates past the finally).
    return {
        'rvc': rvc,
        'config': config,
        'device': device,
        'resampler_32_to_48': resampler_32_to_48,
        'resampler_32_to_16': resampler_32_to_16,
        'block_frame': block_frame,
        'block_frame_16k': block_frame_16k,
        'crossfade_frame': crossfade_frame,
        'extra_frame': extra_frame,
        'input_wav_res': input_wav_res,
        'input_wav': input_wav,
        'skip_head': skip_head,
        'return_length': return_length,
        'rvc_dir': rvc_dir,
    }
|
|
|
|
|
|
def process_audio_through_rvc(soprano_audio, rvc_ctx):
    """
    Process Soprano audio through RVC pipeline exactly as soprano_rvc_api.py does.

    Streams the 32 kHz input through RVC in fixed 0.2 s blocks, timing each
    block's conversion, then flushes the zero-padded remainder.

    Args:
        soprano_audio: 1-D float32 numpy array of 32 kHz samples.
        rvc_ctx: context dict produced by setup_rvc().

    Returns: (converted_audio, total_time, block_times) — converted audio as a
    numpy array, the summed per-block conversion time in seconds, and the list
    of individual block times.

    NOTE(review): the sliding-window buffers in rvc_ctx are mutated in place,
    so conversion context carries over between successive calls — confirm this
    matches the intended streaming semantics.
    """
    device = rvc_ctx['device']
    rvc = rvc_ctx['rvc']

    # Convert to tensor
    soprano_tensor = torch.from_numpy(soprano_audio).to(device).float()

    # Split into chunks (0.1s @ 32kHz)
    chunk_size = 3200
    num_chunks = (len(soprano_tensor) + chunk_size - 1) // chunk_size  # ceil-div

    # Accumulation buffers: hold resampled audio until a full block is ready.
    acc_buffer_48k = torch.tensor([], device=device, dtype=torch.float32)
    acc_buffer_16k = torch.tensor([], device=device, dtype=torch.float32)

    output_chunks = []
    block_times = []

    original_dir = os.getcwd()
    # rvc.infer resolves its assets relative to the RVC repo root.
    os.chdir(rvc_ctx['rvc_dir'])

    try:
        for i in range(num_chunks):
            chunk_start = i * chunk_size
            chunk_end = min((i + 1) * chunk_size, len(soprano_tensor))
            soprano_chunk = soprano_tensor[chunk_start:chunk_end]

            # Resample to 48kHz and 16kHz
            # (unsqueeze to (1, 1, T) for the Resample module, then back to 1-D)
            chunk_48k = rvc_ctx['resampler_32_to_48'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]
            chunk_16k = rvc_ctx['resampler_32_to_16'](soprano_chunk.unsqueeze(0).unsqueeze(0))[0, 0]

            # Accumulate
            acc_buffer_48k = torch.cat([acc_buffer_48k, chunk_48k])
            acc_buffer_16k = torch.cat([acc_buffer_16k, chunk_16k])

            # Process blocks when we have enough samples. The two buffers grow
            # in a fixed 3:1 ratio (48k vs 16k from the same 32k source), so
            # checking the 48k length alone suffices.
            while acc_buffer_48k.shape[0] >= rvc_ctx['block_frame']:
                block_start = time.time()

                # Take a block
                block_48k = acc_buffer_48k[:rvc_ctx['block_frame']]
                block_16k = acc_buffer_16k[:rvc_ctx['block_frame_16k']]

                acc_buffer_48k = acc_buffer_48k[rvc_ctx['block_frame']:]
                acc_buffer_16k = acc_buffer_16k[rvc_ctx['block_frame_16k']:]

                # Update input buffers (slide window). The .clone() guards the
                # in-place shift against overlapping-copy aliasing.
                rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
                rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k

                rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
                rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k

                # RVC inference
                infer_wav = rvc.infer(
                    rvc_ctx['input_wav_res'],
                    rvc_ctx['block_frame_16k'],
                    rvc_ctx['skip_head'],
                    rvc_ctx['return_length'],
                    "rmvpe",  # f0method
                )

                # Convert to tensor — rvc.infer may hand back a numpy array.
                if not torch.is_tensor(infer_wav):
                    infer_wav = torch.from_numpy(infer_wav).to(device)

                output_chunks.append(infer_wav)
                block_times.append(time.time() - block_start)

        # Flush remaining audio (partial final block, zero-padded to size).
        if acc_buffer_48k.shape[0] > 0:
            block_start = time.time()

            # Pad to block size
            pad_size_48k = rvc_ctx['block_frame'] - acc_buffer_48k.shape[0]
            pad_size_16k = rvc_ctx['block_frame_16k'] - acc_buffer_16k.shape[0]

            padding_48k = torch.zeros(pad_size_48k, device=device, dtype=torch.float32)
            padding_16k = torch.zeros(pad_size_16k, device=device, dtype=torch.float32)

            block_48k = torch.cat([acc_buffer_48k, padding_48k])
            block_16k = torch.cat([acc_buffer_16k, padding_16k])

            # Update buffers (same sliding-window mechanics as above).
            rvc_ctx['input_wav'][:-rvc_ctx['block_frame']] = rvc_ctx['input_wav'][rvc_ctx['block_frame']:].clone()
            rvc_ctx['input_wav'][-rvc_ctx['block_frame']:] = block_48k

            rvc_ctx['input_wav_res'][:-rvc_ctx['block_frame_16k']] = rvc_ctx['input_wav_res'][rvc_ctx['block_frame_16k']:].clone()
            rvc_ctx['input_wav_res'][-rvc_ctx['block_frame_16k']:] = block_16k

            # Inference
            infer_wav = rvc.infer(
                rvc_ctx['input_wav_res'],
                rvc_ctx['block_frame_16k'],
                rvc_ctx['skip_head'],
                rvc_ctx['return_length'],
                "rmvpe",
            )

            if not torch.is_tensor(infer_wav):
                infer_wav = torch.from_numpy(infer_wav).to(device)

            # Trim padding: keep only as many output samples as real input
            # remained. NOTE(review): assumes infer output is in the 48 kHz
            # domain — confirm against rtrvc.
            output_samples = acc_buffer_48k.shape[0]
            if output_samples < len(infer_wav):
                infer_wav = infer_wav[:output_samples]

            output_chunks.append(infer_wav)
            block_times.append(time.time() - block_start)

    finally:
        # Restore the caller's working directory even if inference failed.
        os.chdir(original_dir)

    # Concatenate all output chunks back into one host-side array.
    if output_chunks:
        full_audio = torch.cat(output_chunks).cpu().numpy()
    else:
        full_audio = np.array([])

    return full_audio, sum(block_times), block_times
|
|
|
|
|
|
def run_benchmark():
    """Run the full Soprano-vs-RVC benchmark.

    For every entry in TEST_CASES, performs RUNS_PER_TEST runs that measure
    (1) Soprano TTS latency, (2) RVC conversion latency on that run's audio,
    and (3) the summed pipeline latency, then prints per-case statistics and
    an overall bottleneck summary. Failed runs are reported and skipped; they
    simply contribute fewer samples to the statistics.
    """
    print("\n" + "="*80)
    print(" "*20 + "COMPREHENSIVE SOPRANO vs RVC BENCHMARK")
    print("="*80 + "\n")

    soprano_socket = setup_soprano_connection()
    print("✅ Connected to Soprano server")

    print("\n📦 Loading RVC model...")
    rvc_start = time.time()
    rvc_ctx = setup_rvc()
    print(f"✅ RVC loaded in {time.time() - rvc_start:.2f}s\n")

    results = {}

    for label, text in TEST_CASES:
        print(f"\n{'='*80}")
        print(f"Testing: {label.upper()}")
        print(f"Text: \"{text}\"")
        print('='*80)

        results[label] = {
            'soprano_times': [],
            'rvc_times': [],
            'rvc_block_times': [],
            'pipeline_times': [],
            'audio_duration': None,
            'num_blocks': [],
        }

        for run in range(RUNS_PER_TEST):
            print(f"\n Run {run+1}/{RUNS_PER_TEST}:")

            try:
                # Test 1: Soprano only
                soprano_audio, soprano_time = call_soprano(soprano_socket, text)
                audio_duration = len(soprano_audio) / 32000  # Soprano outputs 32 kHz
                results[label]['soprano_times'].append(soprano_time)
                results[label]['audio_duration'] = audio_duration

                soprano_rtf = audio_duration / soprano_time if soprano_time > 0 else 0
                print(f" 🎤 Soprano: {soprano_time*1000:.1f}ms → {audio_duration:.2f}s audio (RTF: {soprano_rtf:.2f}x)")

                # Test 2: RVC only (converting this run's fresh Soprano output)
                rvc_output, rvc_time, block_times = process_audio_through_rvc(soprano_audio, rvc_ctx)
                results[label]['rvc_times'].append(rvc_time)
                results[label]['rvc_block_times'].extend(block_times)
                results[label]['num_blocks'].append(len(block_times))

                rvc_rtf = audio_duration / rvc_time if rvc_time > 0 else 0
                avg_block_time = statistics.mean(block_times) if block_times else 0
                print(f" 🎙️ RVC: {rvc_time*1000:.1f}ms ({len(block_times)} blocks, avg {avg_block_time*1000:.1f}ms/block, RTF: {rvc_rtf:.2f}x)")

                # Test 3: Full pipeline time (stages run sequentially, so sum)
                pipeline_time = soprano_time + rvc_time
                results[label]['pipeline_times'].append(pipeline_time)

                pipeline_rtf = audio_duration / pipeline_time if pipeline_time > 0 else 0
                print(f" ⚡ Pipeline: {pipeline_time*1000:.1f}ms (RTF: {pipeline_rtf:.2f}x)")

            except Exception as e:
                # Keep benchmarking: a failed run leaves fewer samples for this case.
                print(f" ERROR: {e}")

    _print_analysis(results)
    _print_summary(results)

    print("\n" + "="*80)
    print(" "*28 + "BENCHMARK COMPLETE")
    print("="*80 + "\n")


def _print_analysis(results):
    """Print per-test-case latency statistics (one section per case)."""
    print("\n\n" + "="*80)
    print(" "*25 + "PERFORMANCE ANALYSIS")
    print("="*80 + "\n")

    for label, text in TEST_CASES:
        data = results[label]

        # Skip cases without usable samples. Checking rvc_times too avoids a
        # StatisticsError below when Soprano succeeded but every RVC pass
        # failed (rvc_times/pipeline_times would be empty).
        if not data['soprano_times'] or not data['rvc_times']:
            continue

        print(f"\n{'─'*80}")
        print(f"📊 {label.upper()}: \"{text[:60]}{'...' if len(text) > 60 else ''}\"")
        print('─'*80)

        audio_dur = data['audio_duration']
        print(f"\n 🎵 Audio Duration: {audio_dur:.2f}s")

        # Soprano stats (stdev needs >= 2 samples)
        s_mean = statistics.mean(data['soprano_times'])
        s_median = statistics.median(data['soprano_times'])
        s_std = statistics.stdev(data['soprano_times']) if len(data['soprano_times']) > 1 else 0
        s_rtf = audio_dur / s_mean if s_mean > 0 else 0

        print(f"\n 🎤 SOPRANO:")
        print(f" ├─ Mean: {s_mean*1000:.1f}ms")
        print(f" ├─ Median: {s_median*1000:.1f}ms")
        print(f" ├─ Std Dev: {s_std*1000:.1f}ms")
        print(f" ├─ Range: {min(data['soprano_times'])*1000:.1f} - {max(data['soprano_times'])*1000:.1f}ms")
        print(f" └─ RTF: {s_rtf:.2f}x")

        # RVC stats
        r_mean = statistics.mean(data['rvc_times'])
        r_median = statistics.median(data['rvc_times'])
        r_std = statistics.stdev(data['rvc_times']) if len(data['rvc_times']) > 1 else 0
        r_rtf = audio_dur / r_mean if r_mean > 0 else 0

        avg_blocks = statistics.mean(data['num_blocks'])
        avg_block_time = statistics.mean(data['rvc_block_times']) if data['rvc_block_times'] else 0

        print(f"\n 🎙️ RVC:")
        print(f" ├─ Mean: {r_mean*1000:.1f}ms")
        print(f" ├─ Median: {r_median*1000:.1f}ms")
        print(f" ├─ Std Dev: {r_std*1000:.1f}ms")
        print(f" ├─ Range: {min(data['rvc_times'])*1000:.1f} - {max(data['rvc_times'])*1000:.1f}ms")
        print(f" ├─ Avg blocks: {avg_blocks:.1f}")
        print(f" ├─ Avg per block: {avg_block_time*1000:.1f}ms")
        print(f" └─ RTF: {r_rtf:.2f}x")

        # Pipeline stats (non-empty whenever rvc_times is — appended together)
        p_mean = statistics.mean(data['pipeline_times'])
        p_rtf = audio_dur / p_mean if p_mean > 0 else 0

        print(f"\n ⚡ FULL PIPELINE:")
        print(f" ├─ Mean: {p_mean*1000:.1f}ms")
        print(f" └─ RTF: {p_rtf:.2f}x")

        # Breakdown: share of total pipeline latency per stage
        print(f"\n 📈 BREAKDOWN:")
        print(f" ├─ Soprano: {s_mean*1000:.1f}ms ({s_mean/p_mean*100:.1f}%)")
        print(f" └─ RVC: {r_mean*1000:.1f}ms ({r_mean/p_mean*100:.1f}%)")


def _print_summary(results):
    """Print RTFs averaged across all cases and the bottleneck verdict."""
    print("\n\n" + "="*80)
    print(" "*30 + "SUMMARY")
    print("="*80 + "\n")

    all_soprano_rtf = []
    all_rvc_rtf = []
    all_pipeline_rtf = []

    for label in results:
        data = results[label]
        if data['audio_duration'] and data['soprano_times'] and data['rvc_times']:
            s_rtf = data['audio_duration'] / statistics.mean(data['soprano_times'])
            r_rtf = data['audio_duration'] / statistics.mean(data['rvc_times'])
            p_rtf = data['audio_duration'] / statistics.mean(data['pipeline_times'])

            all_soprano_rtf.append(s_rtf)
            all_rvc_rtf.append(r_rtf)
            all_pipeline_rtf.append(p_rtf)

    if all_soprano_rtf:
        print(f" 🎤 Soprano Average RTF: {statistics.mean(all_soprano_rtf):.2f}x")
    if all_rvc_rtf:
        print(f" 🎙️ RVC Average RTF: {statistics.mean(all_rvc_rtf):.2f}x")
    if all_pipeline_rtf:
        print(f" ⚡ Pipeline Average RTF: {statistics.mean(all_pipeline_rtf):.2f}x")

    print(f"\n 💡 BOTTLENECK ANALYSIS:")
    if all_soprano_rtf and all_rvc_rtf:
        soprano_avg = statistics.mean(all_soprano_rtf)
        rvc_avg = statistics.mean(all_rvc_rtf)

        # Lower RTF = slower stage (less audio produced per second of compute).
        if soprano_avg < rvc_avg:
            bottleneck = "Soprano"
            ratio = rvc_avg / soprano_avg
        else:
            bottleneck = "RVC"
            ratio = soprano_avg / rvc_avg

        print(f" └─ {bottleneck} is {ratio:.2f}x slower than the other")

        if bottleneck == "RVC":
            print(f"\n ✅ CONCLUSION: RVC (voice conversion) is the bottleneck, NOT Soprano")
            print(f" - Soprano: {soprano_avg:.2f}x realtime (FAST)")
            print(f" - RVC: {rvc_avg:.2f}x realtime (BOTTLENECK)")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the full benchmark suite (requires a reachable
    # Soprano server and the RVC repo/weights inside the container).
    run_benchmark()
|