add: absorb soprano_to_rvc as regular subdirectory

Voice conversion pipeline (Soprano TTS → RVC) with Docker support.
Previously tracked as bare gitlink; removed .git/ directories and
absorbed into main repo for unified tracking.

Includes: Soprano TTS, RVC WebUI integration, Docker configs,
WebSocket API, and benchmark scripts.
Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index).
287 files (3.1GB of ML weights properly excluded via gitignore).
This commit is contained in:
2026-03-04 00:24:53 +02:00
parent 34b184a05a
commit 8ca716029e
287 changed files with 47102 additions and 0 deletions

View File

@@ -0,0 +1,89 @@
# Dual-GPU Setup: GTX 1660 (CUDA) + RX 6800 (ROCm)
## Architecture
```
┌─────────────────────────────────────────────────────────┐
│ HTTP Request → FastAPI Main Process │
└────────────────┬────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────┐
│ Process 1: Soprano Generator (GTX 1660 CUDA) │
│ - Runs in CUDA venv │
│ - Soprano TTS: text → audio (32kHz) │
│ - Outputs to Redis/ZMQ queue │
│ Device: cuda:0 (GTX 1660) │
└────────────────┬────────────────────────────────────────┘
│ Audio chunks (numpy arrays)
┌─────────────────────────────────────────────────────────┐
│ Process 2: RVC Processor (RX 6800 ROCm) │
│ - Runs in ROCm venv │
│ - RVC: 32kHz → 48kHz converted audio │
│ - Outputs to client HTTP streams │
│ Device: cuda:0 (RX 6800, but seen as cuda:0) │
└─────────────────────────────────────────────────────────┘
```
## Virtual Environment Setup
### 1. CUDA venv for Soprano (GTX 1660)
```bash
cd /home/koko210Serve/docker/miku-discord/soprano_to_rvc
python -m venv .venv-cuda
source .venv-cuda/bin/activate
# Install CUDA PyTorch
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Install Soprano dependencies
pip install -r soprano/requirements.txt
pip install lmdeploy # or skip if using transformers backend only
pip install redis pyzmq # for IPC (the ZeroMQ binding on PyPI is "pyzmq", imported as "zmq")
```
### 2. ROCm venv for RVC (RX 6800) - Already exists
```bash
# Current .venv has ROCm PyTorch
source .venv/bin/activate
pip install redis pyzmq # add IPC libraries (PyPI package is "pyzmq")
```
## Implementation Files
### soprano_server.py (Process 1 - CUDA)
Runs Soprano TTS on GTX 1660, outputs to Redis queue.
### rvc_server.py (Process 2 - ROCm)
Runs RVC on RX 6800, reads from Redis queue, serves HTTP.
### coordinator.py
Manages both processes, handles restarts.
## Benefits
1. **No GPU contention** - Each model has dedicated GPU
2. **No kernel warmup issues** - GTX 1660 has mature CUDA with pre-compiled kernels
3. **Better utilization** - Both GPUs work in parallel
4. **Faster overall** - No waiting for shared GPU resources
## Expected Performance
- **Soprano on GTX 1660**: ~1.5x realtime (similar to RX 6800 isolated)
- **RVC on RX 6800**: ~1.56x realtime (isolated, no contention)
- **Total pipeline**: ~1.5x realtime (limited by slower component)
- **No warmup needed**: CUDA kernels pre-compiled
## Testing Approach
1. Test Soprano alone on GTX 1660
2. Test RVC alone on RX 6800
3. Test full pipeline with Redis communication
4. Verify no GPU contention with monitoring
## Fallback
If GTX 1660 is too slow for Soprano, reverse the assignment:
- Soprano on RX 6800 (faster GPU)
- RVC on GTX 1660 (simpler model)

View File

@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""
Benchmark Soprano TTS on GTX 1660 (CUDA)
Tests isolated performance before dual-GPU setup

For each of four texts of increasing length, runs three timed
inferences (after a one-time warmup on the shortest text) and reports
the realtime factor: seconds of audio produced per wall-clock second.
A factor >= 1.0 means generation keeps up with playback.
"""
import sys
import time
import torch
from soprano import SopranoTTS

print("="*60)
print("Soprano Benchmark on GTX 1660")
print("="*60)
# Verify GPU — fail fast with a clear message instead of the opaque
# RuntimeError that get_device_name() raises on CPU-only hosts.
if not torch.cuda.is_available():
    sys.exit("ERROR: CUDA is not available; this benchmark requires a GPU.")
print(f"\nGPU: {torch.cuda.get_device_name(0)}")
print(f"CUDA Version: {torch.version.cuda}")
print(f"PyTorch: {torch.__version__}")
# Load model
print("\nLoading Soprano with lmdeploy backend...")
model = SopranoTTS(
    backend='lmdeploy',
    device='cuda',
    cache_size_mb=500,
    decoder_batch_size=2
)
print(f"✓ Loaded on {model.device}")
# Test texts of varying lengths
test_cases = [
    ("Short test for quick generation.", "Short"),
    ("This is a medium length test to measure the performance of Soprano text to speech on the GTX 1660 GPU.", "Medium"),
    ("This is a longer test sentence designed to evaluate sustained performance of the Soprano text to speech model running on the NVIDIA GTX 1660 graphics card with CUDA acceleration to see if it can achieve realtime or better performance.", "Long"),
    ("Extended performance test with even more text to really push the limits and see how the GTX 1660 handles longer generation tasks with the Soprano model using the lmdeploy backend which should provide optimized inference performance compared to the standard transformers backend.", "Very Long"),
]
print("\n" + "="*60)
print("Running Benchmarks...")
print("="*60)
results = []
for text, label in test_cases:
    print(f"\n[{label}] Text: \"{text[:50]}...\"")
    print(f" Length: {len(text)} chars")
    # Warm up on the first (shortest) case only, so one-time kernel
    # compilation / allocator costs don't pollute the timed runs.
    if label == "Short":
        print(" (Warming up...)")
        _ = model.infer(text)
        time.sleep(1)
    # Actual benchmark - 3 runs
    times = []
    audio_lengths = []
    for run in range(3):
        start = time.time()
        audio = model.infer(text)
        elapsed = time.time() - start
        times.append(elapsed)
        # Calculate audio duration — assumes `audio` is a 1-D array of
        # mono samples at 32 kHz (Soprano's output rate) — TODO confirm.
        audio_duration = len(audio) / 32000
        audio_lengths.append(audio_duration)
        realtime_factor = audio_duration / elapsed
        print(f" Run {run+1}: {elapsed:.2f}s for {audio_duration:.2f}s audio = {realtime_factor:.2f}x realtime")
    # Average results over the 3 timed runs
    avg_time = sum(times) / len(times)
    avg_audio = sum(audio_lengths) / len(audio_lengths)
    avg_realtime = avg_audio / avg_time
    results.append({
        'label': label,
        'text_len': len(text),
        'avg_time': avg_time,
        'avg_audio': avg_audio,
        'avg_realtime': avg_realtime
    })
    print(f" → Average: {avg_time:.2f}s for {avg_audio:.2f}s audio = {avg_realtime:.2f}x realtime")
# Summary table
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"{'Text Length':<15} {'Gen Time':<12} {'Audio':<10} {'Realtime Factor':<15}")
print("-"*60)
for r in results:
    print(f"{r['label']:<15} {r['avg_time']:>8.2f}s {r['avg_audio']:>6.2f}s {r['avg_realtime']:>12.2f}x")
overall_avg = sum(r['avg_realtime'] for r in results) / len(results)
print("-"*60)
print(f"Overall Average: {overall_avg:.2f}x realtime")
# Verdict: thresholds chosen around 1.0x (realtime) with margin bands
print("\n" + "="*60)
print("VERDICT")
print("="*60)
if overall_avg >= 1.5:
    print("✅ EXCELLENT - GTX 1660 is fast enough!")
    print(f" {overall_avg:.2f}x realtime is plenty for live generation")
elif overall_avg >= 1.0:
    print("✅ GOOD - GTX 1660 can handle realtime")
    print(f" {overall_avg:.2f}x realtime is sufficient")
elif overall_avg >= 0.8:
    print("⚠️ MARGINAL - GTX 1660 is borderline")
    print(f" {overall_avg:.2f}x realtime might work with optimization")
else:
    print("❌ TOO SLOW - GTX 1660 cannot keep up")
    print(f" {overall_avg:.2f}x realtime is insufficient")
    print(" Consider using RX 6800 for Soprano instead")
print("="*60)

View File

@@ -0,0 +1,55 @@
====== BOTTLENECK C1/C2 CONFIRMED: SYNCHRONOUS BLOCKING ======
TEST METHODOLOGY:
- Added detailed timing to broadcast_worker() loop
- Measured: Soprano wait time, RVC processing time, loop iteration time
- Test text: "This is a comprehensive test of the full Soprano plus RVC pipeline..."
RAW DATA:
- Total elapsed: 14.49s
- Blocks processed: 26
- Audio duration: 6.50s
- Realtime factor: 0.45x (should be 1.49x if parallel)
- Performance loss: 69.9%
TIME BREAKDOWN:
- Total RVC blocking time: 10.07s (69.4% of elapsed)
- Total Soprano waiting time: 4.38s (30.2% of elapsed)
CRITICAL FINDING - FIRST BLOCK PENALTY:
- Block 1 RVC: 6107.8ms (6.1 SECONDS!)
- Block 2 RVC: 163.2ms
- Block 3+ RVC: ~155-165ms each
ROOT CAUSE: ROCm/HIP kernel JIT compilation + first-time memory allocation
This 6.1s penalty is ONE-TIME per server start, not per request.
STEADY-STATE BEHAVIOR (blocks 2-26):
- Soprano wait per chunk: ~400ms (generation time)
- RVC processes 2-3 blocks per chunk: 320-480ms total
- Loop iteration: 720-880ms (400ms + 320-480ms)
- Pattern: SYNCHRONOUS - Soprano waits for RVC to complete all blocks
EXPECTED BEHAVIOR (65.8 - 48.3 = 17.5%, likely due to different test length)
If fixed: Loop iteration = max(400ms Soprano, 160ms RVC) = 400ms
Pipeline limited by Soprano at 1.49x realtime (isolated component speed)
CONFIRMATION OF HYPOTHESIS C1/C2:
✅ Soprano generation BLOCKS while waiting for RVC processing
✅ RVC processing BLOCKS Soprano from generating next chunk
✅ No overlap/parallelism between components
✅ Measured loop iteration (780ms avg) matches expected additive behavior (785ms)
✅ 69.4% of time spent in RVC blocking = Soprano sitting idle
NEXT STEP:
Implement async producer-consumer pattern:
1. Soprano generates chunks → queue (continuous, non-blocking)
2. RVC pulls from queue → processes (continuous, non-blocking)
3. Both components run in parallel
EXPECTED IMPROVEMENT:
Current: 0.45x realtime (with 6.1s first-block penalty)
Steady-state current: ~0.78x realtime (blocks 2-26 average)
After fix: ~1.49x realtime (limited by Soprano generation speed)
Improvement: 0.78x → 1.49x = +91% performance, achieves >1.0x target! ✅

View File

@@ -0,0 +1,68 @@
====== VISUALIZATION OF SYNCHRONOUS BLOCKING PROBLEM ======
CURRENT IMPLEMENTATION (Synchronous - Sequential Processing):
Timeline for steady-state blocks (ignoring 6.1s first-block warmup):
0ms ──────────────────────────────────────────────────────────────────
[Soprano generating chunk 1 - 400ms]
400ms ────────────────────────────────────────────────────────────────
[RVC block 1 - 160ms]
560ms ────────────────────────────────────────────────────────────────
[RVC block 2 - 160ms]
720ms ────────────────────────────────────────────────────────────────
[Soprano generating chunk 2 - 400ms]
1120ms ───────────────────────────────────────────────────────────────
[RVC block 3 - 160ms]
1280ms ───────────────────────────────────────────────────────────────
[RVC block 4 - 160ms]
1440ms ───────────────────────────────────────────────────────────────
[RVC block 5 - 160ms]
1600ms ───────────────────────────────────────────────────────────────
PROBLEM:
- Soprano IDLES during RVC processing (wasted time: 320-480ms per iteration)
- RVC IDLES during Soprano generation (wasted time: 400ms per iteration)
- Total time per 2 blocks: 720ms
- Effective realtime: 0.78x (320ms audio in 720ms) ❌
PROPOSED IMPLEMENTATION (Async - Parallel Processing):
Timeline for steady-state blocks:
0ms ──────────────────────────────────────────────────────────────────
[Soprano gen chunk 1 - 400ms]
[RVC block 1 - 160ms]────────────────────────────────────────
400ms ────────────────────────────────────────────────────────────────
[Soprano gen chunk 2 - 400ms]
[RVC block 2 - 160ms]─────────────
560ms ────────────────────────────────────────────────────────────────
[RVC block 3 - 160ms]
800ms ────────────────────────────────────────────────────────────────
[Soprano gen chunk 3 - 400ms]
[RVC block 4 - 160ms]
1200ms ───────────────────────────────────────────────────────────────
SOLUTION:
- Soprano and RVC run IN PARALLEL (no idle time!)
- Queue between them buffers chunks (decouples timing)
- RVC processes faster than Soprano generates (160ms < 400ms ✅)
- Pipeline limited by SLOWER component (Soprano at 1.49x realtime)
- Total time per 2 blocks: 400ms (RVC finishes before next Soprano chunk ready)
- Effective realtime: 1.49x (320ms audio in ~215ms) ✅
IMPLEMENTATION PATTERN:
┌──────────────┐ Queue ┌──────────────┐
│ Soprano │──────────────>│ RVC │
│ Generator │ │ Processor │
│ (Thread 1) │ │ (Thread 2) │
└──────────────┘ └──────────────┘
Runs at Runs at
400ms/chunk 160ms/block
(1.49x realtime) (1.56x realtime)
Both threads run continuously!
Soprano doesn't wait for RVC!
RVC pulls from queue as ready!

View File

@@ -0,0 +1,205 @@
this is the pipeline i ended up with;
right now i get 0.76x realtime with it;
do you see any obvious ways to improve the latency?
Yes — thanks for sharing the full pipeline. This is actually a very solid, correct implementation, and getting ~0.76× realtime with Soprano → RVC → HTTP streaming on consumer hardware is already respectable.
That said, there are several clear, concrete latency wins you can still take without breaking correctness. I'll separate them into “guaranteed wins”, “conditional wins”, and “don't do this (it breaks quality)”.
TL;DR Biggest Wins First
If you do only three things, do these:
1. Reduce block_time from 0.25s → 0.16s
2. Pin HuBERT + f0 to FP16 and disable gradients everywhere
3. Stop resampling Soprano → 48k → 16k (go straight 32k → 16k)
That alone should move you from 0.76× → ~1.05–1.15× realtime.
Block size is your #1 latency lever (and you're playing it safe)
Right now:
block_time = 0.25
At 48 kHz that's 12,000 samples per block, which is very conservative.
What GUI actually tolerates
RVC GUI users regularly run:
0.16 s (stable)
0.128 s (borderline but OK)
0.10 s (risky unless GPU is strong)
Recommended change
In PipelineConfig:
"block_time": 0.16,
"crossfade_time": 0.04,
"extra_time": 1.8
Why this helps:
Less audio buffered before inference
HuBERT + f0 runs on smaller windows
SOLA still works correctly
This reduces both latency and total compute per second.
You are resampling too much (this is real overhead)
Current path:
Soprano (32k)
→ resample → 48k
→ buffer
→ resample → 16k
→ RVC
That is two resamplers per block, every block.
Correct and faster path
RVC only cares about 16k for inference.
You do not need 48k until after RVC.
Replace this logic
In _initialize_buffers:
Remove:
self.resampler_soprano_to_48k
self.resampler_48k_to_16k
Keep only:
self.resampler_soprano_to_16k
self.resampler_rvc_to_target
Then change:
process_soprano_chunk()
to directly emit 16k audio, not 48k.
Why this helps
Resampling is surprisingly expensive
This removes ~10–15% of your per-block cost
Also reduces numerical drift
This alone is worth ~0.1–0.15× realtime.
Your RVC worker threading is good — but under-utilized
You already did the right thing:
Soprano generation
Accumulation
RVC inference in a worker thread
But this line limits throughput:
rvc_input_queue = Queue(maxsize=2)
Increase queue depth slightly:
rvc_input_queue = Queue(maxsize=4)
rvc_output_queue = Queue(maxsize=4)
Why:
HuBERT + f0 have bursty latency
This prevents pipeline stalls
Doesn't increase end-to-end latency meaningfully
Force FP16 everywhere (you are almost doing it)
You already rely on:
self.rvc_config.is_half
But some parts still default to FP32 unless forced.
Add these once, after model load:
torch.set_grad_enabled(False)
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
And explicitly cast HuBERT + RMVPE:
self.rvc.model = self.rvc.model.half()
self.rvc.net_g = self.rvc.net_g.half()
This typically gives ~5–10% speedup.
RMVPE vs FCPE: latency reality check
f0 method | Latency | Stability | Recommendation
rmvpe | slower | best | Use for quality
fcpe | fastest | noisy | Use for realtime
Best compromise
"f0method": "fcpe",
"index_rate": 0.0,
"rms_mix_rate": 0.0
This removes expensive pitch/index blending.
Soprano is not your bottleneck (good news)
From your timing logs:
Soprano ≈ 15–25%
RVC ≈ 60–70%
Everything else ≈ 10–15%
That means:
Optimizing RVC is the correct focus
Switching TTS backends wont help much
Why you should not reduce extra_time too much
You might be tempted to do:
"extra_time": 0.5
Don't.
Below ~1.5s:
HuBERT context breaks
Metallic artifacts return
Pitch stability collapses
Your current 2.5s → 1.8s is the safe minimum.
Expected Result After These Changes
Metric | Before | After
Realtime factor | 0.76× | 1.05–1.25×
First audio latency | ~400 ms | ~220–280 ms
Final sanity check (important)
Your pipeline is architecturally correct.
The problems you had earlier (metallic noise, speedup, segfaults) were:
incorrect buffer sliding
missing inp_q/opt_q
resampling mismatches
overlapping tensor writes
Those are now fixed.
What remains is pure performance tuning, not correctness.

View File

@@ -0,0 +1,24 @@
# ==========================================================
# HARD CONSTRAINTS — DO NOT UPGRADE CASUALLY
# ==========================================================
python_version == "3.10.19"
# Torch / ROCm ABI lock
torch == 2.5.1+rocm6.2
torchaudio == 2.5.1+rocm6.2
torchvision == 0.20.1+rocm6.2
pytorch-triton-rocm == 3.1.0
# NumPy / Numba compatibility
numpy < 1.24
numba == 0.56.4
llvmlite == 0.39.0
# RVC core
fairseq == 0.12.2
faiss-cpu == 1.7.3
pyworld < 0.4
# Gradio pin (RVC WebUI tested)
gradio == 3.48.0

View File

@@ -0,0 +1,211 @@
#!/usr/bin/env python3
"""
Deep diagnostic tool to understand what causes <8s gap failures.
Monitors: GPU memory, timing, queue states, torch internals.
"""
import torch
import time
import requests
import json
import subprocess
import threading
from collections import defaultdict
def get_gpu_memory():
    """Return (allocated_gb, reserved_gb) for the current CUDA device.

    Falls back to (0, 0) on hosts without a usable CUDA/ROCm device.
    """
    if not torch.cuda.is_available():
        return 0, 0
    gib = 1024 ** 3  # bytes per GiB
    return torch.cuda.memory_allocated() / gib, torch.cuda.memory_reserved() / gib
def get_gpu_clock_speeds():
    """Return (sclk_mhz, mclk_mhz) parsed from `rocm-smi --showclocks`.

    Returns (0, 0) when rocm-smi is missing, times out, or its output
    cannot be parsed.  Expects lines like "sclk clock level: 2: (2151Mhz)";
    the exact layout is version-dependent — re-check after ROCm upgrades.
    """
    try:
        result = subprocess.run(['rocm-smi', '--showclocks'],
                                capture_output=True, text=True, timeout=1)
        sclk = mclk = 0
        for line in result.stdout.split('\n'):
            lowered = line.lower()
            if 'level' not in lowered:
                continue
            # Parse "level: X: (YYYYMhz)" — take the number inside "(...Mhz)"
            parts = line.split('(')
            if len(parts) < 2:
                continue
            mhz = parts[1].split('Mhz')[0]
            if not mhz.isdigit():
                continue
            if 'sclk' in lowered:
                sclk = int(mhz)
            elif 'mclk' in lowered:
                mclk = int(mhz)
        return sclk, mclk
    except Exception:
        # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # still propagate.  Covers FileNotFoundError, TimeoutExpired, etc.
        return 0, 0
def monitor_job_execution(job_id, log_file):
    """Monitor a single job execution in detail.

    Appends a GPU timeline to *log_file*: one INITIAL STATE snapshot,
    then a memory/clock sample every 0.5 s for up to ~10 s (stopping
    early once the job looks finished), then a FINAL STATE snapshot.
    Blocks the calling thread for the whole duration.

    Args:
        job_id: Identifier used only for labelling log/console output.
        log_file: Path to the log; opened in append mode.
    """
    start_time = time.time()
    last_check = start_time  # NOTE(review): never read again — dead variable
    print(f"\n{'='*60}")
    print(f"Monitoring Job: {job_id}")
    print(f"{'='*60}")
    with open(log_file, 'a') as f:
        f.write(f"\n{'='*60}\n")
        f.write(f"Job: {job_id} | Started: {time.strftime('%H:%M:%S')}\n")
        f.write(f"{'='*60}\n")
        # Initial state
        mem_alloc, mem_reserved = get_gpu_memory()
        sclk, mclk = get_gpu_clock_speeds()
        f.write(f"INITIAL STATE:\n")
        f.write(f" GPU Mem: {mem_alloc:.2f}GB allocated, {mem_reserved:.2f}GB reserved\n")
        f.write(f" GPU Clocks: sclk={sclk}MHz, mclk={mclk}MHz\n")
        f.write(f" PyTorch cache: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n\n")
        # Monitor during execution (poll every 0.5s for ~10s)
        for i in range(20):
            time.sleep(0.5)
            elapsed = time.time() - start_time
            mem_alloc, mem_reserved = get_gpu_memory()
            sclk, mclk = get_gpu_clock_speeds()
            f.write(f"T+{elapsed:.1f}s: mem={mem_alloc:.2f}GB, "
                    f"sclk={sclk}MHz, mclk={mclk}MHz\n")
            # If job seems done (very rough heuristic), break
            # — low allocation after 8s is taken to mean buffers were freed
            if elapsed > 8 and mem_alloc < 0.5:
                break
        # Final state (extra second lets the last kernels/frees settle)
        time.sleep(1)
        elapsed = time.time() - start_time
        mem_alloc, mem_reserved = get_gpu_memory()
        sclk, mclk = get_gpu_clock_speeds()
        f.write(f"\nFINAL STATE (T+{elapsed:.1f}s):\n")
        f.write(f" GPU Mem: {mem_alloc:.2f}GB allocated, {mem_reserved:.2f}GB reserved\n")
        f.write(f" GPU Clocks: sclk={sclk}MHz, mclk={mclk}MHz\n")
        f.write(f" PyTorch cache: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n")
        f.write(f"{'='*60}\n\n")
    print(f"Job {job_id} monitoring complete (T+{elapsed:.1f}s)")
def test_gap_with_monitoring(gap_seconds, num_jobs=3):
    """Test a specific gap with deep monitoring.

    Submits *num_jobs* TTS jobs to the local API, waiting *gap_seconds*
    between submissions, while a background thread logs GPU state for
    each job (see monitor_job_execution).  Writes to
    /tmp/gpu_diagnostic_<gap>s.log.

    Args:
        gap_seconds: Idle time between consecutive job submissions.
        num_jobs: Number of jobs to submit.
    """
    log_file = f'/tmp/gpu_diagnostic_{gap_seconds}s.log'
    print(f"\n{'#'*60}")
    print(f"# Testing {gap_seconds}s gap with {num_jobs} jobs")
    print(f"# Log file: {log_file}")
    print(f"{'#'*60}")
    # Clear log (mode 'w' truncates any previous run's contents)
    with open(log_file, 'w') as f:
        f.write(f"GPU Diagnostic Test - {gap_seconds}s gap\n")
        f.write(f"Started: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"{'='*60}\n\n")
    for i in range(num_jobs):
        print(f"\n[Job {i+1}/{num_jobs}]")
        # Submit job
        text = f"Diagnostic test job {i+1} with {gap_seconds} second gap for deep analysis."
        response = requests.post(
            'http://localhost:8765/api/speak',
            headers={'Content-Type': 'application/json'},
            json={'text': text}
        )
        if response.status_code == 200:
            data = response.json()
            job_id = data.get('job_id', 'unknown')[:8]  # short id for display
            print(f" Job ID: {job_id}")
            print(f" Queue size: {data.get('queue_size', '?')}")
            # Monitor this job in the background while we wait out the gap
            monitor_thread = threading.Thread(
                target=monitor_job_execution,
                args=(job_id, log_file)
            )
            monitor_thread.start()
            # Wait for gap
            print(f" Waiting {gap_seconds}s before next job...")
            time.sleep(gap_seconds)
            monitor_thread.join(timeout=1) # Don't block too long
        else:
            print(f" ERROR: {response.status_code}")
    # Wait for last job (fixed 10s grace period)
    print(f"\nWaiting 10s for last job to complete...")
    time.sleep(10)
    print(f"\nDiagnostic log saved to: {log_file}")
    print(f"Timing results in: /tmp/soprano_timing_debug.txt")
def compare_torch_state():
    """Print PyTorch/CUDA internal state for comparing good vs. bad runs.

    Output goes to stdout; returns None.  Safe to call on CPU-only
    hosts (prints availability and returns early instead of crashing
    in get_device_name()).
    """
    print("\n=== PyTorch State Analysis ===")
    available = torch.cuda.is_available()
    print(f"CUDA available: {available}")
    if not available:
        return
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"Current stream: {torch.cuda.current_stream()}")
    print(f"Memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB")
    print(f"Memory reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB")
    print(f"Max memory allocated: {torch.cuda.max_memory_allocated()/1024**3:.2f}GB")
    print(f"Max memory reserved: {torch.cuda.max_memory_reserved()/1024**3:.2f}GB")
    # Bug fix: the original used a conditional *expression* whose else
    # branch evaluated to the string "N/A" without ever printing it.
    if hasattr(torch.cuda, 'memory_cached'):
        print(f"Memory cached: {torch.cuda.memory_cached()/1024**3:.2f}GB")
    else:
        print("Memory cached: N/A")
if __name__ == '__main__':
    # Entry point: compare job-spacing behaviour at an 8s gap (expected
    # good) vs a 7s gap (expected bad), with deep GPU monitoring per job.
    print("="*60)
    print("GPU Diagnostic Tool for Job Spacing Investigation")
    print("="*60)
    # Clear timing log — the API server appends per-job timing lines here
    import os
    if os.path.exists('/tmp/soprano_timing_debug.txt'):
        os.remove('/tmp/soprano_timing_debug.txt')
    # Show initial state
    compare_torch_state()
    # Test 8s gap (should be good)
    print("\n" + "="*60)
    print("TEST 1: 8s gap (expected: GOOD)")
    print("="*60)
    test_gap_with_monitoring(8, num_jobs=3)
    print("\n" + "="*60)
    print("Waiting 15s to let GPU fully settle...")
    print("="*60)
    time.sleep(15)
    # Test 7s gap (should be bad)
    print("\n" + "="*60)
    print("TEST 2: 7s gap (expected: BAD)")
    print("="*60)
    test_gap_with_monitoring(7, num_jobs=3)
    print("\n" + "="*60)
    print("Analysis Complete!")
    print("="*60)
    # Show results.
    # NOTE(review): the i//6 job index assumes the server writes 6 lines
    # per job to the timing file — confirm against the server's format.
    print("\nPerformance Results:")
    with open('/tmp/soprano_timing_debug.txt', 'r') as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        if 'Realtime' in line:
            print(f" Job {(i//6)+1}: {line.strip()}")
            if i+1 < len(lines):
                print(f" {lines[i+1].strip()}")
    print("\nCheck detailed logs:")
    print(" - /tmp/gpu_diagnostic_8s.log")
    print(" - /tmp/gpu_diagnostic_7s.log")

View File

@@ -0,0 +1,94 @@
#!/bin/bash
# Diagnostic test to understand 7s vs 10s gap difference
#
# Submits TTS jobs to the local voice API with different idle gaps
# between submissions, sampling rocm-smi once per second during each
# gap, so GPU clock/usage patterns can be compared between "good"
# (long-gap) and "bad" (short-gap) runs.
# Output: console + /tmp/job_transition_log.txt; per-job performance
# lines are read from /tmp/soprano_timing_debug.txt (written server-side).
echo "=== JOB TRANSITION DIAGNOSTIC TEST ==="
echo "Testing with different gap times while monitoring GPU state"
echo ""
# Start from clean logs so the summaries below only reflect this run.
rm -f /tmp/soprano_timing_debug.txt
rm -f /tmp/job_transition_log.txt
# Function to test with specific gap
# Usage: test_gap <gap_seconds> <num_runs>
test_gap() {
local gap=$1
local num_tests=$2
echo "========================================" | tee -a /tmp/job_transition_log.txt
echo "Testing with ${gap}s gap between jobs (${num_tests} runs)" | tee -a /tmp/job_transition_log.txt
echo "========================================" | tee -a /tmp/job_transition_log.txt
for i in $(seq 1 $num_tests); do
echo -n "Job $i: Submitting..." | tee -a /tmp/job_transition_log.txt
# Record start time
# NOTE(review): start_time is never used afterwards — dead variable
start_time=$(date +%s.%N)
# Submit job (fire-and-forget; the server queues it)
curl -s -X POST http://localhost:8765/api/speak \
-H "Content-Type: application/json" \
-d "{\"text\": \"Test job number $i with ${gap} second gap to measure performance consistency.\"}" \
> /dev/null
echo " submitted at $(date +%H:%M:%S.%N)" | tee -a /tmp/job_transition_log.txt
# Monitor GPU during the gap (one rocm-smi sample per second)
echo " Monitoring GPU during ${gap}s wait..." | tee -a /tmp/job_transition_log.txt
for t in $(seq 1 $gap); do
sleep 1
gpu_state=$(rocm-smi --showclocks --showuse 2>/dev/null | grep -E "sclk|mclk|GPU use" | tr '\n' ' ')
echo " T+${t}s: $gpu_state" | tee -a /tmp/job_transition_log.txt
done
done
# Wait for last job to complete
echo "Waiting 10s for last job to complete..." | tee -a /tmp/job_transition_log.txt
sleep 10
# Show results.
# NOTE(review): the tail/paste pairing assumes the server writes 6
# timing lines per job — confirm against the server's log format.
echo "" | tee -a /tmp/job_transition_log.txt
echo "Results for ${gap}s gap:" | tee -a /tmp/job_transition_log.txt
tail -$((num_tests * 6)) /tmp/soprano_timing_debug.txt | grep -E "Realtime|Avg RVC" | \
paste - - | awk -v gap=$gap '{print " Job " NR ": " $3 " realtime, RVC: " $6}' | tee -a /tmp/job_transition_log.txt
echo "" | tee -a /tmp/job_transition_log.txt
}
# Test 1: 10 second gap (known good)
echo "TEST 1: 10 second gap (baseline - should be good)"
test_gap 10 3
sleep 5
# Test 2: 7 second gap (known bad)
echo ""
echo "TEST 2: 7 second gap (should show degradation)"
test_gap 7 3
sleep 5
# Test 3: 5 second gap (extreme - what happens?)
echo ""
echo "TEST 3: 5 second gap (extreme test)"
test_gap 5 3
sleep 5
# Test 4: 8 second gap (boundary test)
echo ""
echo "TEST 4: 8 second gap (finding the threshold)"
test_gap 8 3
echo ""
echo "========================================"
echo "ANALYSIS"
echo "========================================"
echo ""
echo "Full log saved to: /tmp/job_transition_log.txt"
echo ""
echo "Summary of results:"
# 4 tests x 3 jobs = 12 summary lines
grep "Job [0-9]:" /tmp/job_transition_log.txt | tail -12
echo ""
echo "GPU state patterns during gaps:"
echo "Look for GPU clock/memory differences between good and bad runs"

View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Kill any running voice-API processes (the launcher script or the API
# itself).  Pattern is an extended regex matched against full command lines.
PATTERNS="start_voice.api.sh|voice_api.py"
echo "Searching for processes matching: $PATTERNS"
# pgrep -f matches the full command line and, unlike the previous
# `ps aux | grep | grep -v grep` pipeline, never matches itself.
PIDS=$(pgrep -f "$PATTERNS")
if [ -z "$PIDS" ]; then
echo "No matching processes found."
exit 0
fi
echo "Killing the following PIDs:"
echo "$PIDS"
# SIGKILL (unquoted $PIDS intentionally word-splits into one arg per PID)
kill -9 $PIDS

View File

@@ -0,0 +1,28 @@
#!/bin/bash
# Detailed GPU monitoring during job execution
#
# Prints one table row per 0.5s sample: clocks, temperature,
# utilization and VRAM parsed from plain `rocm-smi` output.
# NOTE(review): the grep/awk field positions below depend on the exact
# rocm-smi output layout for this ROCm version — re-check after upgrades.
echo "=== GPU STATE MONITORING ==="
echo "Monitoring: clocks, memory, temperature, utilization"
echo "Press Ctrl+C to stop"
echo ""
echo "Time | sclk (MHz) | mclk (MHz) | Temp (C) | GPU% | VRAM Used"
echo "----------|------------|------------|----------|------|------------"
# Sample forever; terminate with Ctrl+C.
while true; do
timestamp=$(date +%H:%M:%S)
# Get GPU stats (one rocm-smi call per sample; errors silenced)
stats=$(rocm-smi 2>/dev/null)
# Extract values by position from the rocm-smi table
sclk=$(echo "$stats" | grep -A1 "sclk" | tail -1 | awk '{print $1}')
mclk=$(echo "$stats" | grep -A1 "mclk" | tail -1 | awk '{print $1}')
temp=$(echo "$stats" | grep "Temperature" | awk '{print $3}')
gpu_use=$(echo "$stats" | grep "GPU use" | awk '{print $4}')
vram=$(echo "$stats" | grep "GTT use" -A1 | tail -1 | grep -oP '\d+MiB')
printf "%s | %10s | %10s | %8s | %4s | %s\n" \
"$timestamp" "$sclk" "$mclk" "$temp" "$gpu_use" "$vram"
sleep 0.5
done

View File

@@ -0,0 +1,44 @@
#!/bin/bash
# Monitor GPU clocks during inference test
#
# Starts a background rocm-smi sampler, submits one TTS job to the
# local API, waits for it to finish, then dumps both the GPU samples
# and the server-side timing results.
echo "Starting GPU monitoring during inference..."
echo "=========================================="
# Start GPU monitoring in background
# (subshell loop samples clocks/utilization every 0.5s into a log)
(
while true; do
echo "--- $(date +%H:%M:%S) ---"
rocm-smi --showclocks --showuse 2>/dev/null | grep -E "sclk|mclk|GPU use"
sleep 0.5
done
) > /tmp/gpu_monitor.log &
MONITOR_PID=$!
echo "Monitor started (PID: $MONITOR_PID)"
echo ""
# Run test (the job executes asynchronously on the server)
echo "Submitting test job..."
curl -s -X POST http://localhost:8765/api/speak \
-H "Content-Type: application/json" \
-d '{"text": "Comprehensive performance test to monitor GPU clock speeds and utilization during Soprano and RVC processing."}' \
> /dev/null
# 10s is an empirical upper bound for this test sentence
echo "Job submitted, waiting 10 seconds..."
sleep 10
# Stop monitoring (ignore errors if the subshell already exited)
kill $MONITOR_PID 2>/dev/null
echo ""
echo "=========================================="
echo "GPU Monitoring Results:"
echo "=========================================="
cat /tmp/gpu_monitor.log
echo ""
echo "=========================================="
echo "Performance Results:"
echo "=========================================="
# Give the server a moment to flush its timing log before reading it
sleep 1
tail -10 /tmp/soprano_timing_debug.txt

View File

@@ -0,0 +1 @@
3.10.19

View File

@@ -0,0 +1,25 @@
#!/bin/bash
# Launcher for the Soprano + RVC streaming API.
# Verifies the project virtualenv exists, activates it, and starts the server.

banner() {
    echo "========================================"
    echo " Soprano + RVC Streaming API"
    echo "========================================"
    echo ""
}

banner

# Refuse to start without the project virtualenv.
if [ ! -d ".venv" ]; then
    echo "✗ Virtual environment not found"
    echo "Please run setup first"
    exit 1
fi
echo "✓ Virtual environment found"

source .venv/bin/activate

echo "Starting Soprano + RVC Streaming API..."
echo "Models will load on first request (15-20 seconds)"
echo ""

exec python soprano_rvc_api.py

View File

@@ -0,0 +1,42 @@
#!/bin/bash
# Extended test: more runs per gap to see steady-state behavior
#
# Submits 5 jobs per gap interval (vs 3 in the shorter tests) so that
# warmup effects wash out and the steady-state realtime factor is
# visible.  Timing lines come from /tmp/soprano_timing_debug.txt,
# which the API server writes.
echo "=== EXTENDED GAP TIMING TEST ==="
echo "Running 5 jobs per gap interval to see steady-state"
echo ""
# Start from a clean timing log so the summary only covers this run.
rm -f /tmp/soprano_timing_debug.txt
# test_gap <gap_seconds>: submit 5 jobs, sleeping <gap_seconds> between them.
test_gap() {
local gap=$1
echo "========================================"
echo "Testing ${gap}s gap (5 runs)"
echo "========================================"
for i in {1..5}; do
echo "Run $i at $(date +%H:%M:%S)"
curl -s -X POST http://localhost:8765/api/speak \
-H "Content-Type: application/json" \
-d "{\"text\": \"Extended test run $i with ${gap} second gap to find steady state performance.\"}" \
> /dev/null
sleep $gap
done
echo "Waiting 10s for last job..."
sleep 10
echo ""
}
# Test 9s gap (between known good 10s and bad 8s)
echo "TEST 1: 9s gap"
test_gap 9
# Test 7s gap
echo "TEST 2: 7s gap"
test_gap 7
echo ""
echo "========================================"
echo "RESULTS"
echo "========================================"
# Pair each job's "Realtime" line with its "Avg RVC" line and number them
cat /tmp/soprano_timing_debug.txt | grep -E "Realtime|Avg RVC" | paste - - | awk '{print NR ". " $3 " realtime, RVC: " $6}'

View File

@@ -0,0 +1,65 @@
#!/bin/bash
# Simple test: measure actual job completion times and GPU state
#
# Sweeps submission gaps (10s, 8s, 6s, 5s) to find the minimum spacing
# between TTS jobs that still maintains performance.  Per-job timing is
# read from /tmp/soprano_timing_debug.txt (written by the API server).
echo "=== GAP TIMING TEST ==="
echo "Testing to find minimum gap that maintains performance"
echo ""
# Clean timing log so the summary reflects only this run.
rm -f /tmp/soprano_timing_debug.txt
# test_with_gap <gap_seconds> <num_runs>: submit jobs with the given
# spacing, then print paired Realtime / Avg-RVC lines for each run.
test_with_gap() {
local gap=$1
local runs=$2
echo "========================================"
echo "Testing ${gap}s gap (${runs} runs)"
echo "========================================"
for i in $(seq 1 $runs); do
echo "Run $i: Starting at $(date +%H:%M:%S)"
# Submit job and measure response time
# (this covers only the HTTP round-trip; the job itself runs
# asynchronously on the server)
start=$(date +%s.%N)
curl -s -X POST http://localhost:8765/api/speak \
-H "Content-Type: application/json" \
-d "{\"text\": \"Gap test run number $i with ${gap} second interval to verify performance consistency.\"}" \
> /dev/null
end=$(date +%s.%N)
response_time=$(echo "$end - $start" | bc)
echo " Response time: ${response_time}s"
# Wait for specified gap
echo " Waiting ${gap}s before next run..."
sleep $gap
done
echo ""
echo "Waiting 8s for last job to complete..."
sleep 8
echo ""
echo "Results:"
# NOTE(review): assumes 6 timing lines per job in the debug file —
# confirm against the server's log format.
tail -$((runs * 6)) /tmp/soprano_timing_debug.txt | grep -E "Realtime|Avg RVC" | \
paste - - | awk '{print " Run " NR ": " $3 " realtime, RVC: " $6}'
echo ""
}
# Test different gaps
echo "TEST 1: 10s gap (baseline)"
test_with_gap 10 3
echo "TEST 2: 8s gap"
test_with_gap 8 3
echo "TEST 3: 6s gap"
test_with_gap 6 3
echo "TEST 4: 5s gap"
test_with_gap 5 3
echo ""
echo "========================================"
echo "SUMMARY"
echo "========================================"
# 4 gaps x 3 runs = 12 realtime lines
cat /tmp/soprano_timing_debug.txt | grep -E "Realtime factor" | tail -12

View File

@@ -0,0 +1,207 @@
#!/usr/bin/env python3
"""
Test for GPU contention between Soprano and RVC
Measures if running both models simultaneously slows them down
"""
import time
import threading
import requests
from statistics import mean
def test_soprano_solo(num_runs=3):
    """Benchmark the Soprano-only streaming endpoint in isolation.

    Issues `num_runs` sequential requests, fully draining each response
    stream, and returns the mean wall-clock seconds per request.
    """
    print("=" * 60)
    print("TEST 1: Soprano Solo (baseline)")
    print("=" * 60)
    text = "This is a comprehensive test of Soprano TTS performance to measure the true speed without any interference."
    durations = []
    for run_idx in range(num_runs):
        print(f"\nRun {run_idx+1}/{num_runs}...")
        started = time.time()
        resp = requests.get(
            "http://localhost:8765/api/test_soprano_stream",
            params={"text": text},
            stream=True,
        )
        # Drain the stream so timing covers the full generation.
        for _ in resp.iter_content(chunk_size=8192):
            pass
        took = time.time() - started
        durations.append(took)
        print(f" Elapsed: {took:.2f}s")
        time.sleep(1)  # brief settle between runs
    avg = mean(durations)
    print(f"\n✓ Soprano Solo Average: {avg:.2f}s")
    return avg
def test_rvc_solo(num_runs=3):
    """Benchmark RVC conversion on its own and return the mean wall-clock time.

    Issues `num_runs` streaming requests against the RVC-only test endpoint
    using a fixed pre-generated input file, drains each response fully, and
    prints per-run and average elapsed seconds.
    """
    print("\n" + "=" * 60)
    print("TEST 2: RVC Solo (baseline)")
    print("=" * 60)
    durations = []
    for run_idx in range(num_runs):
        print(f"\nRun {run_idx + 1}/{num_runs}...")
        t0 = time.time()
        resp = requests.get(
            "http://localhost:8765/api/test_rvc_only",
            params={"input_file": "/tmp/soprano_test_3.wav"},
            stream=True,
        )
        # Drain the streamed body; we only care about timing, not the bytes.
        for _ in resp.iter_content(chunk_size=8192):
            pass
        took = time.time() - t0
        durations.append(took)
        print(f" Elapsed: {took:.2f}s")
        time.sleep(1)
    avg = mean(durations)
    print(f"\n✓ RVC Solo Average: {avg:.2f}s")
    return avg
def concurrent_soprano():
    """Run one Soprano request (worker-thread helper); return elapsed seconds."""
    text = ("This is a comprehensive test of Soprano TTS performance "
            "to measure the true speed without any interference.")
    t0 = time.time()
    resp = requests.get(
        "http://localhost:8765/api/test_soprano_stream",
        params={"text": text},
        stream=True,
    )
    # Consume the full stream so the measurement covers generation end-to-end.
    for _ in resp.iter_content(chunk_size=8192):
        pass
    return time.time() - t0
def concurrent_rvc():
    """Run one RVC-only request (worker-thread helper); return elapsed seconds."""
    t0 = time.time()
    resp = requests.get(
        "http://localhost:8765/api/test_rvc_only",
        params={"input_file": "/tmp/soprano_test_3.wav"},
        stream=True,
    )
    # Consume the full stream so the measurement covers conversion end-to-end.
    for _ in resp.iter_content(chunk_size=8192):
        pass
    return time.time() - t0
def test_concurrent(num_runs=3):
    """Benchmark Soprano and RVC launched together to expose GPU contention.

    Each run starts both requests on their own threads (slightly staggered),
    waits for both to finish, and records per-model elapsed times.

    Returns:
        Tuple of (avg_soprano_seconds, avg_rvc_seconds).
    """
    print("\n" + "=" * 60)
    print("TEST 3: Soprano + RVC CONCURRENT (contention test)")
    print("=" * 60)
    soprano_times = []
    rvc_times = []
    for run_idx in range(num_runs):
        print(f"\nRun {run_idx + 1}/{num_runs}...")
        print(" Starting both simultaneously...")
        # Mutable holder so the worker closures can report their timings back.
        outcome = {"soprano": None, "rvc": None}

        def run_soprano():
            outcome["soprano"] = concurrent_soprano()

        def run_rvc():
            outcome["rvc"] = concurrent_rvc()

        soprano_worker = threading.Thread(target=run_soprano)
        rvc_worker = threading.Thread(target=run_rvc)
        t0 = time.time()
        soprano_worker.start()
        time.sleep(0.1)  # Slight stagger to ensure both are running
        rvc_worker.start()
        # Block until both requests complete.
        soprano_worker.join()
        rvc_worker.join()
        total = time.time() - t0
        soprano_times.append(outcome["soprano"])
        rvc_times.append(outcome["rvc"])
        print(f" Soprano: {outcome['soprano']:.2f}s")
        print(f" RVC: {outcome['rvc']:.2f}s")
        print(f" Total: {total:.2f}s")
        time.sleep(2)
    avg_soprano = mean(soprano_times)
    avg_rvc = mean(rvc_times)
    print(f"\n✓ Concurrent Soprano Average: {avg_soprano:.2f}s")
    print(f"✓ Concurrent RVC Average: {avg_rvc:.2f}s")
    return avg_soprano, avg_rvc
def main():
    """Run the three contention benchmarks and print a slowdown analysis.

    Flow: measure Soprano alone, RVC alone, then both concurrently, and
    classify the result as significant (>10% slowdown on either model),
    moderate (>5%), or negligible GPU contention.
    """
    print("\n" + "=" * 60)
    print("GPU CONTENTION TEST")
    print("=" * 60)
    print("\nThis test will:")
    print("1. Measure Soprano speed alone")
    print("2. Measure RVC speed alone")
    print("3. Measure both running simultaneously")
    print("4. Compare to detect GPU contention")
    print("\nStarting in 3 seconds...")
    time.sleep(3)
    # Run tests
    soprano_solo = test_soprano_solo(num_runs=3)
    rvc_solo = test_rvc_solo(num_runs=3)
    soprano_concurrent, rvc_concurrent = test_concurrent(num_runs=3)
    # Analysis
    print("\n" + "=" * 60)
    print("ANALYSIS")
    print("=" * 60)
    # Percentage slowdown relative to each model's solo baseline (positive
    # means concurrent execution is slower). Solo times are wall-clock HTTP
    # round-trips, so they are always > 0.
    soprano_slowdown = ((soprano_concurrent - soprano_solo) / soprano_solo) * 100
    rvc_slowdown = ((rvc_concurrent - rvc_solo) / rvc_solo) * 100
    print(f"\nSoprano Solo: {soprano_solo:.2f}s")
    print(f"Soprano Concurrent: {soprano_concurrent:.2f}s")
    print(f"Slowdown: {soprano_slowdown:+.1f}%")
    print(f"\nRVC Solo: {rvc_solo:.2f}s")
    print(f"RVC Concurrent: {rvc_concurrent:.2f}s")
    print(f"Slowdown: {rvc_slowdown:+.1f}%")
    print("\n" + "=" * 60)
    print("CONCLUSION")
    print("=" * 60)
    # Thresholds: >10% slowdown on either model counts as significant
    # contention, >5% as moderate; anything else is considered noise.
    if soprano_slowdown > 10 or rvc_slowdown > 10:
        print("⚠️ SIGNIFICANT GPU CONTENTION DETECTED!")
        print(f" Running both models simultaneously causes:")
        print(f" - Soprano: {soprano_slowdown:+.1f}% slower")
        print(f" - RVC: {rvc_slowdown:+.1f}% slower")
        print("\n This explains why async pipeline doesn't improve performance.")
        print(" The GPU cannot efficiently run both models at full speed.")
    elif soprano_slowdown > 5 or rvc_slowdown > 5:
        print("⚠️ MODERATE GPU CONTENTION DETECTED")
        print(f" Running both models simultaneously causes:")
        print(f" - Soprano: {soprano_slowdown:+.1f}% slower")
        print(f" - RVC: {rvc_slowdown:+.1f}% slower")
        print("\n Some contention exists but may not be the main bottleneck.")
    else:
        print("✓ NO SIGNIFICANT GPU CONTENTION")
        print(f" - Soprano: {soprano_slowdown:+.1f}% change")
        print(f" - RVC: {rvc_slowdown:+.1f}% change")
        print("\n GPU can handle both models simultaneously!")
        print(" The async pipeline slowdown has a different cause.")
    print("\n" + "=" * 60)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,211 @@
#!/usr/bin/env python3
"""
PROPER GPU Contention Test v2
Tests the ACTUAL async pipeline vs isolated components to find the real bottleneck
"""
import time
import requests
import subprocess
from statistics import mean
def test_soprano_isolated(num_runs=3):
    """Benchmark the raw Soprano streaming endpoint (no RVC processing).

    For each run: downloads the generated WAV, measures wall-clock time,
    probes the audio duration with ffprobe, and reports the realtime factor
    (audio seconds produced per wall-clock second).

    Args:
        num_runs: Number of benchmark iterations to average over.

    Returns:
        Tuple of (avg_elapsed_s, avg_audio_duration_s, avg_realtime_factor).
    """
    print("=" * 70)
    print("TEST 1: Soprano ISOLATED (baseline)")
    print("=" * 70)
    text = "This is a comprehensive test of raw Soprano TTS streaming performance without any RVC processing to isolate the true bottleneck and measure generation speed."
    times = []
    audio_durations = []
    for i in range(num_runs):
        print(f"\nRun {i+1}/{num_runs}...")
        start = time.time()
        # Download the audio file
        response = requests.get(
            "http://localhost:8765/api/test_soprano_stream",
            params={"text": text},
            stream=True
        )
        output_file = f"/tmp/soprano_isolated_{i}.wav"
        with open(output_file, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        elapsed = time.time() - start
        times.append(elapsed)
        # Get audio duration using ffprobe
        result = subprocess.run(
            ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
             '-of', 'default=noprint_wrappers=1:nokey=1', output_file],
            capture_output=True, text=True
        )
        # Guard against ffprobe failure (empty stdout) instead of crashing
        # on float("") — matches the handling in test_async_pipeline.
        duration = float(result.stdout.strip()) if result.stdout.strip() else 0
        audio_durations.append(duration)
        realtime = duration / elapsed if elapsed > 0 else 0
        print(f" Elapsed: {elapsed:.2f}s, Audio: {duration:.2f}s, Realtime: {realtime:.2f}x")
        time.sleep(1)
    avg_elapsed = mean(times)
    avg_duration = mean(audio_durations)
    avg_realtime = avg_duration / avg_elapsed if avg_elapsed > 0 else 0
    print("\n✓ Soprano Isolated Average:")
    print(f" Elapsed: {avg_elapsed:.2f}s")
    print(f" Audio: {avg_duration:.2f}s")
    print(f" Realtime: {avg_realtime:.2f}x")
    return avg_elapsed, avg_duration, avg_realtime
def test_async_pipeline(num_runs=3):
    """Benchmark the ACTUAL integrated async pipeline (Soprano + RVC).

    For each run: POSTs a job to /api/speak, then reads the continuous
    output stream until roughly one job's worth of bytes has arrived,
    saves it to a WAV file, and probes its duration with ffprobe to
    compute a realtime factor.

    Args:
        num_runs: Number of benchmark iterations to average over.

    Returns:
        Tuple of (avg_elapsed_s, avg_audio_duration_s, avg_realtime_factor).

    NOTE(review): /api/stream/continuous appears to be a shared continuous
    stream — this assumes the just-queued job's audio is what arrives next;
    confirm there is no interleaving with other jobs.
    """
    print("\n" + "=" * 70)
    print("TEST 2: ASYNC PIPELINE (Soprano + RVC integrated)")
    print("=" * 70)
    text = "This is a comprehensive test of raw Soprano TTS streaming performance without any RVC processing to isolate the true bottleneck and measure generation speed."
    times = []
    audio_durations = []
    # First, submit job and connect to stream
    for i in range(num_runs):
        print(f"\nRun {i+1}/{num_runs}...")
        # Submit the job
        job_response = requests.post(
            "http://localhost:8765/api/speak",
            json={"text": text}
        )
        job_data = job_response.json()
        print(f" Job queued: {job_data.get('job_id', 'unknown')[:8]}")
        # Connect to streaming endpoint and consume audio
        start = time.time()
        response = requests.get(
            "http://localhost:8765/api/stream/continuous",
            stream=True
        )
        output_file = f"/tmp/async_pipeline_{i}.wav"
        bytes_received = 0
        with open(output_file, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    bytes_received += len(chunk)
                    # Stop after receiving enough data (roughly one job's worth)
                    # Assuming 48kHz 16-bit mono: ~96KB per second of audio
                    # For ~6s audio, that's ~576KB
                    if bytes_received > 600000:  # 600KB threshold
                        break
        elapsed = time.time() - start
        times.append(elapsed)
        # Close the connection
        response.close()
        # Get audio duration; duration falls back to 0 when ffprobe
        # produces no output (e.g. truncated/invalid WAV).
        result = subprocess.run(
            ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
             '-of', 'default=noprint_wrappers=1:nokey=1', output_file],
            capture_output=True, text=True
        )
        duration = float(result.stdout.strip()) if result.stdout.strip() else 0
        audio_durations.append(duration)
        realtime = duration / elapsed if elapsed > 0 else 0
        print(f" Elapsed: {elapsed:.2f}s, Audio: {duration:.2f}s, Realtime: {realtime:.2f}x")
        print(f" Bytes received: {bytes_received}")
        time.sleep(2)
    avg_elapsed = mean(times)
    avg_duration = mean(audio_durations)
    avg_realtime = avg_duration / avg_elapsed if avg_elapsed > 0 else 0
    print(f"\n✓ Async Pipeline Average:")
    print(f" Elapsed: {avg_elapsed:.2f}s")
    print(f" Audio: {avg_duration:.2f}s")
    print(f" Realtime: {avg_realtime:.2f}x")
    return avg_elapsed, avg_duration, avg_realtime
def main():
    """Compare isolated-Soprano throughput against the full async pipeline.

    Runs both benchmarks, computes the pipeline's performance loss relative
    to the isolated baseline, and prints a tiered conclusion plus a target
    assessment (>1.0x realtime).

    The slowdown/needed-improvement ratios are guarded against zero realtime
    factors, which the benchmarks record when ffprobe fails to report a
    duration, so the report degrades gracefully instead of raising
    ZeroDivisionError.
    """
    print("\n" + "=" * 70)
    print("PROPER GPU CONTENTION TEST v2")
    print("=" * 70)
    print("\nThis test compares:")
    print("1. Soprano isolated (no RVC)")
    print("2. Full async pipeline (Soprano + RVC integrated)")
    print("\nIf GPU contention is the issue:")
    print("- Soprano should be slower in the pipeline due to RVC competing")
    print("- Performance loss should correlate with GPU memory pressure")
    print("\nStarting in 3 seconds...")
    time.sleep(3)
    # Run tests
    soprano_elapsed, soprano_duration, soprano_realtime = test_soprano_isolated(num_runs=3)
    pipeline_elapsed, pipeline_duration, pipeline_realtime = test_async_pipeline(num_runs=3)
    # Analysis
    print("\n" + "=" * 70)
    print("ANALYSIS")
    print("=" * 70)
    # Calculate how much slower the pipeline is
    slowdown_factor = soprano_realtime / pipeline_realtime if pipeline_realtime > 0 else float('inf')
    slowdown_percent = ((soprano_realtime - pipeline_realtime) / soprano_realtime) * 100 if soprano_realtime > 0 else 0
    print(f"\nSoprano Isolated: {soprano_realtime:.2f}x realtime")
    print(f"Async Pipeline: {pipeline_realtime:.2f}x realtime")
    print(f"Performance Loss: {slowdown_percent:.1f}%")
    print(f"Slowdown Factor: {slowdown_factor:.2f}x")
    print("\n" + "=" * 70)
    print("CONCLUSION")
    print("=" * 70)
    if slowdown_percent > 40:
        print("⚠️ MAJOR BOTTLENECK DETECTED!")
        print(f"\n The pipeline runs at {slowdown_percent:.1f}% slower than Soprano alone.")
        print("\n Possible causes:")
        print(" 1. GPU contention (RVC competing with Soprano)")
        print(" 2. Memory bandwidth limitations")
        print(" 3. Pipeline overhead (queue, threading)")
        print(" 4. CPU bottleneck (PCIe lanes, context switching)")
        # Check logs for timing breakdown
        print("\n Recommendation: Check server logs for detailed timing breakdown")
        print(" Look for '[Soprano]' and '[RVC]' timing to see where time is spent")
    elif slowdown_percent > 20:
        print("⚠️ MODERATE PERFORMANCE LOSS")
        print(f"\n The pipeline runs at {slowdown_percent:.1f}% slower than Soprano alone.")
        print(" This is expected overhead from RVC processing")
    elif slowdown_percent > 10:
        print("✓ MINOR OVERHEAD (ACCEPTABLE)")
        print(f"\n The pipeline runs at {slowdown_percent:.1f}% slower than Soprano alone.")
        print(" This is normal for integrated pipeline processing")
    else:
        print("✓ EXCELLENT PERFORMANCE!")
        print(f"\n Pipeline performance: {pipeline_realtime:.2f}x realtime")
        print(" Minimal overhead from integration")
    # Target assessment
    print("\n Target: >1.0x realtime")
    if pipeline_realtime >= 1.0:
        print(f" Status: ✓ ACHIEVED ({pipeline_realtime:.2f}x)")
    else:
        # Guard: pipeline_realtime can be 0 (ffprobe failure path records 0),
        # matching the slowdown_factor guard above.
        needed_improvement = (1.0 / pipeline_realtime - 1.0) * 100 if pipeline_realtime > 0 else float('inf')
        print(f" Status: ✗ MISSED (need {needed_improvement:.1f}% improvement)")
    print("\n" + "=" * 70)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""
Simple performance comparison test
"""
import time
import requests
import subprocess
def test_isolated():
    """Benchmark Soprano in isolation over 5 runs.

    Downloads the generated WAV for each run, probes its duration with
    ffprobe, and prints per-run elapsed time, audio length and realtime
    factor.

    Returns:
        The average realtime factor across the 5 runs.
    """
    print("=" * 60)
    print("Testing Soprano ISOLATED (5 runs)")
    print("=" * 60)
    text = "This is a comprehensive test of Soprano TTS performance measuring realtime factor and generation speed across multiple runs to establish a reliable baseline."
    times = []
    for i in range(5):
        print(f"\nRun {i+1}/5...", end=" ", flush=True)
        start = time.time()
        response = requests.get(
            "http://localhost:8765/api/test_soprano_stream",
            params={"text": text},
            stream=True
        )
        output = f"/tmp/iso_{i}.wav"
        with open(output, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        elapsed = time.time() - start
        # Get duration via ffprobe; guard against empty output (e.g. probe
        # failure on a truncated file) instead of crashing on float("") —
        # test_pipeline below already handles this case.
        result = subprocess.run(
            ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
             '-of', 'default=noprint_wrappers=1:nokey=1', output],
            capture_output=True, text=True
        )
        duration = float(result.stdout.strip()) if result.stdout.strip() else 0
        realtime = duration / elapsed if elapsed > 0 else 0
        times.append((elapsed, duration, realtime))
        print(f"Elapsed: {elapsed:.2f}s, Audio: {duration:.2f}s, RT: {realtime:.2f}x")
        time.sleep(0.5)
    avg_rt = sum(t[2] for t in times) / len(times)
    print(f"\n✓ Average Realtime: {avg_rt:.2f}x")
    return avg_rt
def test_pipeline():
    """Benchmark the integrated async pipeline (Soprano + RVC) over 5 runs.

    Each run queues a job via /api/speak, reads the continuous output stream
    until roughly one job's worth of bytes has arrived, then probes the saved
    WAV's duration with ffprobe.

    Returns:
        The average realtime factor across the 5 runs.
    """
    print("\n" + "=" * 60)
    print("Testing ASYNC PIPELINE (5 runs)")
    print("=" * 60)
    text = ("This is a comprehensive test of Soprano TTS performance measuring "
            "realtime factor and generation speed across multiple runs to establish a reliable baseline.")
    results = []
    for run_idx in range(5):
        print(f"\nRun {run_idx+1}/5...", end=" ", flush=True)
        # Queue the synthesis job.
        requests.post("http://localhost:8765/api/speak", json={"text": text})
        # Attach to the continuous stream and time the download.
        t0 = time.time()
        stream = requests.get("http://localhost:8765/api/stream/continuous", stream=True)
        out_path = f"/tmp/pipe_{run_idx}.wav"
        received = 0
        with open(out_path, 'wb') as fh:
            for chunk in stream.iter_content(chunk_size=8192):
                if not chunk:
                    continue
                fh.write(chunk)
                received += len(chunk)
                if received > 700000:  # ~7s audio
                    break
        elapsed = time.time() - t0
        stream.close()
        # Probe the saved file's duration (0 if ffprobe reported nothing).
        probe = subprocess.run(
            ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
             '-of', 'default=noprint_wrappers=1:nokey=1', out_path],
            capture_output=True, text=True
        )
        raw = probe.stdout.strip()
        duration = float(raw) if raw else 0
        realtime = duration / elapsed if elapsed > 0 else 0
        results.append((elapsed, duration, realtime))
        print(f"Elapsed: {elapsed:.2f}s, Audio: {duration:.2f}s, RT: {realtime:.2f}x")
        time.sleep(1)
    avg_rt = sum(entry[2] for entry in results) / len(results)
    print(f"\n✓ Average Realtime: {avg_rt:.2f}x")
    return avg_rt
def main():
    """Run both benchmarks and print a comparison summary.

    The ratio computations are guarded against zero realtime factors, which
    occur when ffprobe fails to report a duration (the benchmarks record 0
    in that case), so the summary degrades gracefully instead of raising
    ZeroDivisionError.
    """
    print("\nLMDEPLOY BACKEND PERFORMANCE TEST")
    print("=" * 60)
    iso_rt = test_isolated()
    pipe_rt = test_pipeline()
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Soprano Isolated: {iso_rt:.2f}x realtime")
    print(f"Async Pipeline: {pipe_rt:.2f}x realtime")
    # iso_rt is 0 when every ffprobe probe failed; avoid dividing by it.
    loss = ((iso_rt - pipe_rt) / iso_rt) * 100 if iso_rt > 0 else 0
    print(f"Performance Loss: {loss:.1f}%")
    if pipe_rt >= 1.0:
        print(f"\n✓ TARGET ACHIEVED: {pipe_rt:.2f}x >= 1.0x")
    else:
        # pipe_rt can be 0 for the same reason; report an unbounded need.
        needed = ((1.0 / pipe_rt) - 1.0) * 100 if pipe_rt > 0 else float('inf')
        print(f"\n✗ Target missed, need {needed:.1f}% improvement")
    print("=" * 60)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,42 @@
#!/usr/bin/env python3
"""
Test client for Soprano server
"""
import zmq
import json
import uuid
import numpy as np

# Open a REQ socket to the Soprano server.
ctx = zmq.Context()
sock = ctx.socket(zmq.REQ)
sock.connect("tcp://localhost:5555")

print("Testing Soprano server...")

# Build and send a single synthesis request.
payload = {
    'job_id': str(uuid.uuid4()),
    'text': 'This is a test of the Soprano server running on the GTX 1660 GPU.'
}
print(f"Sending request: {payload['text']}")
sock.send_json(payload)

# REQ/REP: block until the server replies.
print("Waiting for response...")
reply = sock.recv_json()

if 'error' in reply:
    print(f"Error: {reply['error']}")
else:
    samples = np.array(reply['audio'])
    print(f"✓ Received audio: {len(samples)} samples @ {reply['sample_rate']}Hz")
    print(f" Duration: {reply['audio_duration']:.2f}s")
    print(f" Generation time: {reply['elapsed']:.2f}s")
    print(f" Realtime factor: {reply['audio_duration'] / reply['elapsed']:.2f}x")

sock.close()
ctx.term()