#!/usr/bin/env python3
"""
Unified Soprano TTS + RVC Pipeline
Combines soprano_to_virtual_sink.py and headless_rvc.py into a single interface
"""
|
||
|
|
|
||
|
|
import argparse
import json
import logging
import os
import subprocess
import sys
import threading
import time
import warnings
from contextlib import redirect_stdout, redirect_stderr
from dataclasses import dataclass, asdict, fields
from pathlib import Path
from typing import Optional
|
||
|
|
|
||
|
|
# Root logger: INFO and above, rendered as "LEVEL: message".
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
# basicConfig() does nothing when the root logger already has handlers, so the
# level is pinned explicitly as well.
logging.root.setLevel(logging.INFO)

# Silence Python warnings and the native TensorFlow / MIOpen log output.
warnings.filterwarnings('ignore')
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'MIOPEN_LOG_LEVEL': '1',  # Suppress MIOpen warnings
})

# Make the "soprano" directory that sits next to this file importable.
sys.path.insert(0, str(Path(__file__).with_name("soprano")))
|
||
|
|
|
||
|
|
import numpy as np
|
||
|
|
import sounddevice as sd
|
||
|
|
from scipy import signal as scipy_signal
|
||
|
|
import torch
|
||
|
|
|
||
|
|
# Import soprano
|
||
|
|
from soprano import SopranoTTS
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class RVCConfig:
    """RVC configuration parameters.

    Only ``pth`` and ``index`` are required; every other field has a default.
    Instances can be round-tripped through JSON with :meth:`to_file` and
    :meth:`from_file`.
    """
    pth: str  # path to the RVC model (.pth file)
    index: str  # path to the index file
    pitch: int = 0  # pitch shift in semitones
    formant: float = 0.0  # formant shift
    index_rate: float = 0.75
    filter_radius: int = 3
    rms_mix_rate: float = 0.25
    protect: float = 0.33  # protection for voiceless consonants
    f0method: str = "rmvpe"  # F0 extraction method
    input_device: str = "soprano_rvc"  # audio device RVC reads from
    output_device: Optional[str] = None  # None means the system default
    samplerate: int = 48000
    channels: int = 2
    block_time: float = 0.25  # presumably seconds - TODO confirm against headless_rvc
    crossfade_time: float = 0.04  # presumably seconds - TODO confirm
    extra_time: float = 2.5  # presumably seconds - TODO confirm
    n_cpu: int = 4  # number of CPU cores used for F0 extraction
    I_noise_reduce: bool = False  # input noise reduction
    O_noise_reduce: bool = False  # output noise reduction
    use_pv: bool = True  # phase vocoder enabled
    threshold: float = -60.0  # silence threshold in dB

    @classmethod
    def from_file(cls, config_path: str) -> 'RVCConfig':
        """Load configuration from a JSON file.

        Args:
            config_path: Path of a UTF-8 JSON file holding an object whose
                keys are field names of this class.

        Returns:
            The populated configuration.

        Raises:
            OSError: The file cannot be opened.
            json.JSONDecodeError: The file is not valid JSON.
            TypeError: The JSON is not an object, contains unknown keys, or
                lacks a required field (``pth`` / ``index``).
        """
        # Explicit encoding: the locale default may not be UTF-8, which would
        # break hand-edited configs containing non-ASCII paths.
        with open(config_path, 'r', encoding='utf-8') as fh:
            data = json.load(fh)

        if not isinstance(data, dict):
            raise TypeError(
                f"RVC config '{config_path}' must contain a JSON object, "
                f"got {type(data).__name__}"
            )

        # Name the offending keys and the file instead of surfacing the bare
        # "unexpected keyword argument" error from the generated __init__.
        known = {spec.name for spec in fields(cls)}
        unknown = sorted(key for key in data if key not in known)
        if unknown:
            raise TypeError(
                f"RVC config '{config_path}' has unknown key(s): {', '.join(unknown)}"
            )

        return cls(**data)

    def to_file(self, config_path: str):
        """Save configuration to a JSON file (UTF-8, 2-space indent).

        Args:
            config_path: Destination path; an existing file is overwritten.
        """
        with open(config_path, 'w', encoding='utf-8') as fh:
            json.dump(asdict(self), fh, indent=2)
|
||
|
|
|
||
|
|
|
||
|
|
class UnifiedPipeline:
    """Unified Soprano TTS + RVC pipeline.

    Text entered at the prompt is synthesized by Soprano, resampled to the
    virtual sink's sample rate and written to a PulseAudio null sink, while a
    background thread runs headless RVC with the supplied configuration.
    """

    # RVCConfig fields handed to HeadlessRVCConfig under their own names
    # (``pth`` / ``index`` are renamed separately).
    _RVC_PASSTHROUGH_FIELDS = (
        'pitch', 'formant', 'index_rate', 'filter_radius', 'rms_mix_rate',
        'protect', 'f0method', 'input_device', 'output_device', 'samplerate',
        'channels', 'block_time', 'crossfade_time', 'extra_time', 'n_cpu',
        'I_noise_reduce', 'O_noise_reduce', 'use_pv', 'threshold',
    )

    def __init__(self, rvc_config: 'RVCConfig', virtual_sink_name: str = "soprano_to_rvc"):
        """Store the configuration; no audio or model resources are opened here.

        Args:
            rvc_config: Settings forwarded to headless RVC.
            virtual_sink_name: Name of the PulseAudio sink Soprano writes to.
        """
        self.rvc_config = rvc_config
        self.virtual_sink_name = virtual_sink_name
        self.soprano = None
        self.rvc = None  # set by the RVC worker thread once constructed
        self.rvc_process = None
        self.rvc_thread = None
        self.soprano_stream = None
        self.running = False

        # Soprano audio parameters
        self.soprano_sample_rate = 32000
        self.virtual_sink_sample_rate = 48000

    def ensure_virtual_sink(self):
        """Ensure the PulseAudio virtual sink exists, creating it if needed."""
        print("Checking virtual sink...")

        listing = subprocess.run(
            ["pactl", "list", "sinks", "short"],
            capture_output=True,
            text=True
        )

        # Compare whole whitespace-separated fields rather than substrings so a
        # sink such as "<name>_2" is not mistaken for "<name>".
        sink_present = any(
            self.virtual_sink_name in line.split()
            for line in listing.stdout.splitlines()
        )

        if sink_present:
            print(f"✓ Virtual sink '{self.virtual_sink_name}' already exists")
            return

        print(f"Creating virtual sink: {self.virtual_sink_name}")
        created = subprocess.run([
            "pactl", "load-module", "module-null-sink",
            f"sink_name={self.virtual_sink_name}",
            "sink_properties=device.description='Soprano_to_RVC_Virtual_Sink'",
            f"rate={self.virtual_sink_sample_rate}",
            "channels=2"
        ])
        if created.returncode != 0:
            # Keep going: opening the output stream will report the hard
            # failure, but say why it is about to happen.
            print(f"Warning: pactl could not create the sink (exit code {created.returncode})")
        time.sleep(0.5)  # short pause before the sink is opened

    def initialize_soprano(self):
        """Load Soprano TTS and open the output stream to the virtual sink.

        Raises:
            Exception: Whatever opening or starting the output stream raised,
                re-raised after a short message is printed.
        """
        print("\n" + "="*70)
        print("Initializing Soprano TTS...")
        print("="*70)

        # Suppress verbose output during initialization
        with open(os.devnull, 'w') as devnull:
            with redirect_stdout(devnull), redirect_stderr(devnull):
                self.soprano = SopranoTTS(device="cuda")

        # Open audio stream to virtual sink
        try:
            self.soprano_stream = sd.OutputStream(
                device=self.virtual_sink_name,
                samplerate=self.virtual_sink_sample_rate,
                channels=2,
                dtype='float32',
                blocksize=1024
            )
            self.soprano_stream.start()
            print("✓ Soprano TTS initialized successfully")
            print(f" Output: {self.virtual_sink_name} ({self.virtual_sink_sample_rate}Hz, stereo)")
        except Exception as e:
            print(f"✗ Failed to open audio stream: {e}")
            raise

    def _build_rvc_config_dict(self):
        """Translate ``self.rvc_config`` into the dict HeadlessRVCConfig receives."""
        cfg = self.rvc_config
        config_dict = {'pth_path': cfg.pth, 'index_path': cfg.index}
        for name in self._RVC_PASSTHROUGH_FIELDS:
            config_dict[name] = getattr(cfg, name)
        return config_dict

    def start_rvc(self, startup_timeout: float = 60.0):
        """Start headless RVC in a daemon thread and wait for it to come up.

        Args:
            startup_timeout: Maximum seconds to wait for the worker to finish
                (or fail) initialization before continuing anyway.

        Raises:
            RuntimeError: The worker thread failed while importing,
                constructing or starting RVC.
        """
        print("\n" + "="*70)
        print("Starting RVC Voice Conversion...")
        print("="*70)

        ready = threading.Event()
        failures = []  # filled by the worker if startup raises

        def run_rvc():
            try:
                # Suppress logging from RVC
                logging.getLogger('faiss').setLevel(logging.ERROR)
                logging.getLogger('fairseq').setLevel(logging.ERROR)

                # Import here to avoid conflicts
                from headless_rvc import HeadlessRVC, HeadlessRVCConfig

                # Redirect RVC output
                with open(os.devnull, 'w') as devnull:
                    with redirect_stdout(devnull), redirect_stderr(devnull):
                        gui_config = HeadlessRVCConfig(self._build_rvc_config_dict())
                        self.rvc = HeadlessRVC(gui_config)
                        self.rvc.start()
            except Exception as exc:
                # stderr is redirected above, so hand the error to the caller
                # instead of letting it vanish with the thread.
                failures.append(exc)
                return
            finally:
                # Signalled only after the redirect has been undone:
                # redirect_stdout swaps sys.stdout process-wide, so anything
                # the main thread printed earlier would have been discarded.
                ready.set()

            # Keep running until stopped
            while self.running:
                time.sleep(0.1)

            # Suppress stop output too
            with open(os.devnull, 'w') as devnull:
                with redirect_stdout(devnull), redirect_stderr(devnull):
                    self.rvc.stop()

        self.rvc_thread = threading.Thread(target=run_rvc, daemon=True)
        self.running = True
        self.rvc_thread.start()

        # Wait for RVC to initialize (model loading happens in the worker)
        if not ready.wait(timeout=startup_timeout):
            print(f"Warning: RVC still initializing after {startup_timeout:.0f}s; continuing")
            return

        if failures:
            self.running = False
            raise RuntimeError(f"RVC failed to start: {failures[0]}") from failures[0]

        print("✓ RVC initialized successfully")

    def stream_audio_chunk(self, audio_chunk):
        """Stream an audio chunk to the virtual sink.

        The chunk is treated as audio at ``soprano_sample_rate``. It is
        converted to float32, resampled to ``virtual_sink_sample_rate`` when
        the two rates differ, cleaned of NaN/inf, clipped to [-1, 1],
        duplicated to two channels when 1-D, and written to the output stream.

        Args:
            audio_chunk: ``None``, a torch tensor, or a numpy array. ``None``
                and empty chunks are ignored.
        """
        if audio_chunk is None:
            return

        # Convert torch tensor to numpy if needed; detach() so tensors that
        # still track gradients can be converted.
        if torch.is_tensor(audio_chunk):
            audio_chunk = audio_chunk.detach().cpu().numpy()

        if len(audio_chunk) == 0:
            return

        # Ensure float32
        audio_chunk = audio_chunk.astype(np.float32)

        # Resample to the sink rate
        # NOTE(review): scipy.signal.resample is FFT based and treats each
        # chunk as periodic; confirm chunk boundaries are artifact free.
        if self.soprano_sample_rate != self.virtual_sink_sample_rate:
            num_samples_output = int(
                len(audio_chunk) * self.virtual_sink_sample_rate / self.soprano_sample_rate
            )
            audio_chunk = scipy_signal.resample(audio_chunk, num_samples_output)

        # Clean audio (handle NaN/inf)
        audio_chunk = np.nan_to_num(audio_chunk, nan=0.0, posinf=0.0, neginf=0.0)
        audio_chunk = np.clip(audio_chunk, -1.0, 1.0)

        # Convert mono to stereo
        if audio_chunk.ndim == 1:
            audio_chunk = np.column_stack((audio_chunk, audio_chunk))

        # The stream is opened as float32 and its write() rejects other dtypes
        # and non-contiguous buffers, so normalise once more after resampling.
        audio_chunk = np.ascontiguousarray(audio_chunk, dtype=np.float32)

        # Write to stream
        try:
            self.soprano_stream.write(audio_chunk)
        except Exception as e:
            # Logged rather than printed: callers may have stdout/stderr
            # redirected to devnull, which would hide a print().
            logging.getLogger(__name__).warning("Failed to write audio chunk: %s", e)

    def process_text(self, text: str):
        """Process text through TTS and stream to virtual sink.

        Args:
            text: Text to speak; whitespace-only input is ignored.
        """
        if not text.strip():
            return

        print(f"\n🎤 Processing: {text}", flush=True)

        # Generate and stream audio (suppress Soprano's verbose output)
        with open(os.devnull, 'w') as devnull:
            with redirect_stdout(devnull), redirect_stderr(devnull):
                for audio_chunk in self.soprano.infer_stream(text):
                    if audio_chunk is not None:
                        self.stream_audio_chunk(audio_chunk)

        # Small silence at the end. Built at the Soprano rate because
        # stream_audio_chunk() resamples everything it receives; building it
        # at the sink rate stretched it to 0.15 s.
        silence = np.zeros(int(0.1 * self.soprano_sample_rate), dtype=np.float32)
        self.stream_audio_chunk(silence)
        print("✓ Done\n", flush=True)

    def run(self):
        """Run the unified pipeline: set up, then loop on interactive input.

        ``cleanup()`` is always called on the way out, including when setup
        raises.
        """
        try:
            # Setup
            self.ensure_virtual_sink()
            self.initialize_soprano()
            self.start_rvc()

            print("\n" + "="*70)
            print("UNIFIED SOPRANO TTS + RVC PIPELINE")
            print("="*70)
            print(f"\n✓ Ready! Voice conversion active (pitch: {self.rvc_config.pitch:+d} semitones)")
            print("\nCommands:")
            print(" - Type text and press Enter to generate speech")
            print(" - Type 'quit' or 'exit' to stop")
            print(" - Press Ctrl+C to stop")
            print("="*70 + "\n")

            # Interactive loop
            while self.running:
                try:
                    text = input("💬 > ").strip()

                    if text.lower() in ['quit', 'exit', 'q']:
                        break

                    if text:
                        self.process_text(text)

                except (EOFError, KeyboardInterrupt):
                    break

        finally:
            self.cleanup()

    def cleanup(self):
        """Clean up resources: stop the output stream and the RVC thread.

        Safe to call more than once; a failure while closing the stream is
        reported and does not prevent the thread from being joined.
        """
        print("\n\n⏹️ Stopping...")
        self.running = False  # also tells the RVC worker loop to exit

        stream, self.soprano_stream = self.soprano_stream, None
        if stream:
            try:
                stream.stop()
                stream.close()
            except Exception as e:
                print(f"Warning: Failed to close audio stream: {e}")

        if self.rvc_thread and self.rvc_thread.is_alive():
            self.rvc_thread.join(timeout=2)

        print("✓ Stopped")
        print("👋 Goodbye!\n")
|
||
|
|
|
||
|
|
|
||
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the unified pipeline."""
    parser = argparse.ArgumentParser(
        description="Unified Soprano TTS + RVC Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Run with command-line arguments
python unified_soprano_rvc.py --pth model.pth --index model.index --pitch 0

# Load from config file
python unified_soprano_rvc.py --config rvc_config.json

# Save current config
python unified_soprano_rvc.py --pth model.pth --index model.index --save-config my_config.json
"""
    )

    # Config file options
    parser.add_argument('--config', type=str, help='Load RVC configuration from JSON file')
    parser.add_argument('--save-config', type=str, help='Save configuration to JSON file and exit')

    # RVC parameters
    parser.add_argument('--pth', type=str, help='Path to RVC model (.pth file)')
    parser.add_argument('--index', type=str, help='Path to index file')
    parser.add_argument('--pitch', type=int, default=0, help='Pitch shift in semitones (default: 0)')
    parser.add_argument('--formant', type=float, default=0.0, help='Formant shift (default: 0.0)')
    parser.add_argument('--index-rate', type=float, default=0.75, help='Index rate (default: 0.75)')
    parser.add_argument('--filter-radius', type=int, default=3, help='Filter radius (default: 3)')
    parser.add_argument('--rms-mix-rate', type=float, default=0.25, help='RMS mix rate (default: 0.25)')
    parser.add_argument('--protect', type=float, default=0.33, help='Protect voiceless consonants (default: 0.33)')
    parser.add_argument('--f0method', type=str, default='rmvpe',
                        choices=['rmvpe', 'harvest', 'crepe', 'fcpe'],
                        help='F0 extraction method (default: rmvpe)')

    # Audio device settings
    parser.add_argument('--input-device', type=str, default='soprano_rvc',
                        help='Input audio device for RVC (default: soprano_rvc)')
    parser.add_argument('--output-device', type=str, help='Output audio device (default: system default)')
    parser.add_argument('--samplerate', type=int, default=48000, help='Sample rate (default: 48000)')

    # Advanced options
    parser.add_argument('--n-cpu', type=int, default=4, help='Number of CPU cores for F0 extraction (default: 4)')
    parser.add_argument('--threshold', type=float, default=-60.0, help='Silence threshold in dB (default: -60)')
    parser.add_argument('--I-noise-reduce', action='store_true', help='Enable input noise reduction')
    parser.add_argument('--O-noise-reduce', action='store_true', help='Enable output noise reduction')
    parser.add_argument('--no-use-pv', action='store_true', help='Disable phase vocoder')

    # Virtual sink name
    parser.add_argument('--virtual-sink', type=str, default='soprano_to_rvc',
                        help='Name of virtual sink (default: soprano_to_rvc)')

    return parser


def main():
    """Command-line entry point.

    Builds an RVCConfig either from ``--config`` (in which case the individual
    RVC options on the command line are not consulted) or from the
    command-line options, optionally saves it with ``--save-config`` and
    exits, and otherwise runs the interactive pipeline.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    # Load or create config
    if args.config:
        print(f"Loading configuration from: {args.config}")
        try:
            rvc_config = RVCConfig.from_file(args.config)
        except (OSError, ValueError, TypeError) as e:
            # Missing file, malformed JSON (JSONDecodeError is a ValueError)
            # or wrong keys: report as a usage error, not a traceback.
            parser.error(f"could not load config '{args.config}': {e}")
    else:
        # Validate required arguments
        if not args.pth or not args.index:
            parser.error("--pth and --index are required (or use --config)")

        rvc_config = RVCConfig(
            pth=args.pth,
            index=args.index,
            pitch=args.pitch,
            formant=args.formant,
            index_rate=args.index_rate,
            filter_radius=args.filter_radius,
            rms_mix_rate=args.rms_mix_rate,
            protect=args.protect,
            f0method=args.f0method,
            input_device=args.input_device,
            output_device=args.output_device,
            samplerate=args.samplerate,
            n_cpu=args.n_cpu,
            I_noise_reduce=args.I_noise_reduce,
            O_noise_reduce=args.O_noise_reduce,
            use_pv=not args.no_use_pv,
            threshold=args.threshold
        )

    # Save config if requested
    if args.save_config:
        rvc_config.to_file(args.save_config)
        print(f"✓ Configuration saved to: {args.save_config}")
        return

    # Run pipeline
    pipeline = UnifiedPipeline(rvc_config, virtual_sink_name=args.virtual_sink)
    pipeline.run()


if __name__ == "__main__":
    main()
|