Voice conversion pipeline (Soprano TTS → RVC) with Docker support. Previously tracked as bare gitlink; removed .git/ directories and absorbed into main repo for unified tracking. Includes: Soprano TTS, RVC WebUI integration, Docker configs, WebSocket API, and benchmark scripts. Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index). 287 files (3.1GB of ML weights properly excluded via gitignore).
134 lines
4.4 KiB
Python
134 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Soprano Server - Runs on GTX 1660 (CUDA)
|
|
Receives TTS jobs from the RVC server over ZMQ and replies with the generated audio
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
import logging
|
|
import zmq
|
|
import numpy as np
|
|
|
|
# Keep the working directory and this script's own directory off sys.path so
# that a local file or folder named "soprano" cannot shadow the installed
# soprano package imported below.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Entries that would make Python look next to this script / in the CWD first.
# The symlink-resolved directory is included as well because the interpreter
# may record the resolved script location rather than the path in __file__.
_shadowing_entries = {
    '',
    '.',
    current_dir,
    os.path.dirname(os.path.realpath(__file__)),
}

# Filter in place (same list object) and drop *every* occurrence;
# list.remove() would only drop the first one and leave duplicates behind.
sys.path[:] = [entry for entry in sys.path if entry not in _shadowing_entries]
|
|
|
|
from soprano import SopranoTTS
|
|
|
|
# Process-wide logging: INFO and above, each record prefixed with timestamp,
# logger name and level.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-scoped logger used by everything in this file.
logger = logging.getLogger(__name__)
|
|
|
|
class SopranoServer:
    """ZMQ request/reply server that synthesizes speech with Soprano TTS.

    The server binds a ``zmq.REP`` socket, loads the Soprano model on CUDA and
    then answers one JSON request at a time with one JSON reply.

    Requests:
        ``{'job_id': ..., 'text': ...}`` for a TTS job, or a message with a
        truthy ``'shutdown'`` key to stop the server.

    Replies:
        The dict built by :meth:`process_job`, ``{'status': 'shutting down'}``
        for a shutdown request, or ``{'error': <message>}`` when the main loop
        itself fails.
    """

    # Sample rate (Hz) reported in replies and used to derive audio duration.
    SAMPLE_RATE = 32000

    # Upper bound (ms) for flushing an already-queued reply when closing.
    _CLOSE_LINGER_MS = 1000

    def __init__(self, zmq_port=5555):
        """Bind the reply socket and load the TTS model.

        Args:
            zmq_port: TCP port the REP socket binds to on all interfaces.

        Raises:
            Exception: Whatever binding or model loading raises. The ZMQ
                socket and context are released before the error propagates.
        """
        logger.info("Initializing Soprano Server (GTX 1660 CUDA)...")

        # Setup ZMQ: a reply socket receives a request, then sends a response.
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REP)

        try:
            self.socket.bind(f"tcp://*:{zmq_port}")
            logger.info("✓ ZMQ listening on port %s", zmq_port)

            # Load Soprano model
            logger.info("Loading Soprano TTS model...")
            self.soprano = SopranoTTS(
                backend='lmdeploy',
                device='cuda',
                cache_size_mb=500,
                decoder_batch_size=2,
            )
        except Exception:
            # Do not leave the port bound if start-up fails half-way.
            self.close()
            raise

        logger.info(
            "✓ Soprano loaded on %s using %s",
            self.soprano.device,
            self.soprano.backend,
        )

    def close(self):
        """Release the ZMQ socket and context; safe to call more than once."""
        if not self.socket.closed:
            # Bounded linger: give a reply that was just queued (such as the
            # shutdown acknowledgement) a moment to go out, without blocking
            # indefinitely when the peer has gone away.
            self.socket.close(linger=self._CLOSE_LINGER_MS)
        if not self.context.closed:
            self.context.term()

    def process_job(self, job_data):
        """Synthesize one TTS job and build the reply payload.

        Args:
            job_data: Mapping with a ``'job_id'`` and the ``'text'`` to speak.

        Returns:
            On success, a dict with ``'job_id'``, ``'audio'`` (samples as a
            list), ``'sample_rate'``, ``'elapsed'`` (seconds spent
            synthesizing) and ``'audio_duration'`` (seconds of audio).
            If synthesis fails, ``{'job_id': ..., 'error': <message>}``.

        Raises:
            KeyError: If ``'job_id'`` or ``'text'`` is missing from job_data.
        """
        job_id = job_data['job_id']
        text = job_data['text']

        # Short tag for log lines; str() keeps non-string ids from breaking
        # the slice.
        tag = str(job_id)[:8]

        logger.info('[Job %s] Processing: "%s..."', tag, text[:50])
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        started = time.perf_counter()

        try:
            # The reply carries the whole utterance in one batch, so the text
            # is synthesized exactly once with the non-streaming call; also
            # draining infer_stream() would generate the same audio twice.
            full_audio = self.soprano.infer(text)

            elapsed = time.perf_counter() - started
            audio_duration = len(full_audio) / self.SAMPLE_RATE
            # Guard against a zero reading on coarse clocks.
            if elapsed > 0:
                realtime_factor = audio_duration / elapsed
            else:
                realtime_factor = float('inf')

            logger.info(
                "[Job %s] Complete: %.2fs audio in %.2fs (%.2fx realtime)",
                tag,
                audio_duration,
                elapsed,
                realtime_factor,
            )

            # Return audio to RVC server
            return {
                'job_id': job_id,
                'audio': full_audio.tolist(),
                'sample_rate': self.SAMPLE_RATE,
                'elapsed': elapsed,
                'audio_duration': audio_duration,
            }

        except Exception as e:
            logger.error("[Job %s] Error: %s", tag, e, exc_info=True)
            return {'job_id': job_id, 'error': str(e)}

    def run(self):
        """Serve requests until a shutdown message or Ctrl-C arrives.

        Each iteration receives one JSON request and sends exactly one JSON
        reply. The ZMQ socket and context are released when the loop exits,
        so the server instance cannot be restarted afterwards.
        """
        logger.info("Soprano Server ready - waiting for requests...")

        try:
            while True:
                try:
                    # Wait for request from RVC server
                    message = self.socket.recv_json()

                    if message.get('shutdown'):
                        logger.info("Shutdown signal received")
                        self.socket.send_json({'status': 'shutting down'})
                        break

                    # Process the TTS job and send the response back
                    response = self.process_job(message)
                    self.socket.send_json(response)

                except KeyboardInterrupt:
                    logger.info("Shutting down...")
                    break
                except Exception as e:
                    logger.error("Error in main loop: %s", e, exc_info=True)
                    try:
                        self.socket.send_json({'error': str(e)})
                    except Exception as send_err:
                        # Best effort only: the socket may not be able to
                        # send right now (for instance when the failure
                        # happened before a request was received). Catching
                        # Exception, not everything, lets Ctrl-C through.
                        logger.debug(
                            "Could not send error reply: %s", send_err
                        )
        finally:
            self.close()
|
|
|
|
if __name__ == '__main__':
    # Script entry point: read the ZMQ port from the command line, then
    # serve until the server's main loop returns.
    from argparse import ArgumentParser

    cli = ArgumentParser(description='Soprano TTS Server (CUDA)')
    cli.add_argument('--port', type=int, default=5555, help='ZMQ port')
    port = cli.parse_args().port

    SopranoServer(zmq_port=port).run()
|