Working with GUI, auto loopback creation, soprano streaming
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
.venv
|
||||||
|
*.pth
|
||||||
1
Retrieval-based-Voice-Conversion-WebUI
Submodule
1
Retrieval-based-Voice-Conversion-WebUI
Submodule
Submodule Retrieval-based-Voice-Conversion-WebUI added at 7ef1986778
29
cleanup_virtual_sinks.sh
Executable file
29
cleanup_virtual_sinks.sh
Executable file
@@ -0,0 +1,29 @@
|
|||||||
|
#!/bin/bash
# Cleanup script to remove any leftover virtual sinks.
# Run this if you encounter issues with the virtual sink.
#
# Unloads every PulseAudio module whose `pactl list modules short` line
# mentions "soprano" or "rvc", then prints the sinks that remain.

if ! command -v pactl &> /dev/null; then
    echo "✗ pactl (PulseAudio) not found. Cannot clean up virtual sinks." >&2
    exit 1
fi

echo "🧹 Cleaning up virtual audio sinks..."
echo ""

# Query the module list once so the modules shown are exactly the ones removed.
MATCHES=$(pactl list modules short | grep -E "soprano|rvc")

if [ -z "$MATCHES" ]; then
    echo "✓ No virtual sinks found. Nothing to clean up."
else
    echo "Found virtual sink modules to remove:"
    echo "$MATCHES"
    echo ""

    FAILED=0
    # The first column of each listing line is the numeric module ID.
    for MODULE in $(awk '{print $1}' <<< "$MATCHES"); do
        echo "Removing module $MODULE..."
        if ! pactl unload-module "$MODULE"; then
            echo "⚠ Could not unload module $MODULE"
            FAILED=1
        fi
    done

    echo ""
    # Only claim success when every unload actually succeeded.
    if [ "$FAILED" -eq 0 ]; then
        echo "✓ Cleanup complete!"
    else
        echo "⚠ Cleanup finished, but some modules could not be removed"
    fi
fi

echo ""
echo "Current audio sinks:"
pactl list sinks short
|
||||||
24
constraints.txt
Normal file
24
constraints.txt
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# ==========================================================
|
||||||
|
# HARD CONSTRAINTS — DO NOT UPGRADE CASUALLY
|
||||||
|
# ==========================================================
|
||||||
|
|
||||||
|
# python_version == "3.10.19"  (interpreter pin; recorded in python-version.txt — pip cannot parse this line as a constraint)
|
||||||
|
|
||||||
|
# Torch / ROCm ABI lock
|
||||||
|
torch == 2.5.1+rocm6.2
|
||||||
|
torchaudio == 2.5.1+rocm6.2
|
||||||
|
torchvision == 0.20.1+rocm6.2
|
||||||
|
pytorch-triton-rocm == 3.1.0
|
||||||
|
|
||||||
|
# NumPy / Numba compatibility
|
||||||
|
numpy < 1.24
|
||||||
|
numba == 0.56.4
|
||||||
|
llvmlite == 0.39.0
|
||||||
|
|
||||||
|
# RVC core
|
||||||
|
fairseq == 0.12.2
|
||||||
|
faiss-cpu == 1.7.3
|
||||||
|
pyworld < 0.4
|
||||||
|
|
||||||
|
# Gradio pin (RVC WebUI tested)
|
||||||
|
gradio == 3.48.0
|
||||||
1070
gui_v1.py.backup
Normal file
1070
gui_v1.py.backup
Normal file
File diff suppressed because it is too large
Load Diff
260
launch_soprano_rvc.sh
Executable file
260
launch_soprano_rvc.sh
Executable file
@@ -0,0 +1,260 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Soprano TTS to RVC Pipeline Launcher
|
||||||
|
# This script helps you set up and run the soprano->RVC pipeline
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
VENV_PATH="$SCRIPT_DIR/.venv"
|
||||||
|
RVC_DIR="$SCRIPT_DIR/Retrieval-based-Voice-Conversion-WebUI"
|
||||||
|
RVC_GUI="$RVC_DIR/gui_v1.py"
|
||||||
|
SOPRANO_SCRIPT="$SCRIPT_DIR/soprano_to_virtual_sink.py"
|
||||||
|
|
||||||
|
# Colors for output
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
BLUE='\033[0;34m'
|
||||||
|
NC='\033[0m' # No Color
|
||||||
|
|
||||||
|
# Print colored output
|
||||||
|
# Print an informational message ($1) behind a blue "ℹ" marker.
print_info() {
    printf '%b\n' "${BLUE}ℹ ${NC}$1"
}
|
||||||
|
|
||||||
|
# Print a success message ($1) behind a green "✓" marker.
print_success() {
    printf '%b\n' "${GREEN}✓${NC} $1"
}
|
||||||
|
|
||||||
|
# Print a warning message ($1) behind a yellow "⚠" marker.
print_warning() {
    printf '%b\n' "${YELLOW}⚠${NC} $1"
}
|
||||||
|
|
||||||
|
# Print an error message ($1) behind a red "✗" marker.
print_error() {
    printf '%b\n' "${RED}✗${NC} $1"
}
|
||||||
|
|
||||||
|
# Print a section title ($1) framed by two blue rules and blank lines.
print_header() {
    local rule="${BLUE}═══════════════════════════════════════════════════════════════════${NC}"

    printf '\n'
    printf '%b\n' "$rule" "${BLUE} $1${NC}" "$rule"
    printf '\n'
}
|
||||||
|
|
||||||
|
# Check prerequisites
|
||||||
|
# Verify that everything the pipeline needs is present; exit 1 on the
# first missing item, otherwise report each item as found.
check_prerequisites() {
    print_header "Checking Prerequisites"

    # Virtual environment directory
    if [[ -d "$VENV_PATH" ]]; then
        print_success "Virtual environment found"
    else
        print_error "Virtual environment not found at: $VENV_PATH"
        exit 1
    fi

    # RVC realtime GUI entry point
    if [[ -f "$RVC_GUI" ]]; then
        print_success "RVC GUI found"
    else
        print_error "RVC GUI not found at: $RVC_GUI"
        exit 1
    fi

    # Soprano streaming script
    if [[ -f "$SOPRANO_SCRIPT" ]]; then
        print_success "Soprano script found"
    else
        print_error "Soprano script not found at: $SOPRANO_SCRIPT"
        exit 1
    fi

    # pactl is the PulseAudio command-line client
    if command -v pactl &> /dev/null; then
        print_success "PulseAudio found"
    else
        print_error "pactl (PulseAudio) not found. Please install PulseAudio."
        exit 1
    fi
}
|
||||||
|
|
||||||
|
# Display usage instructions
|
||||||
|
# Print the pipeline overview and the list of command-line options.
show_usage() {
    print_header "Soprano TTS to RVC Pipeline"

    # Unquoted heredoc so that $0 expands to the invoked script name.
    cat << EOF
This script helps you run a text-to-speech pipeline where:
 1. You type text into the Soprano TTS script
 2. Soprano generates speech and outputs to a virtual sink
 3. RVC reads from that virtual sink and applies voice conversion
 4. RVC outputs the converted voice to your speakers/headphones

Usage:
 $0 [option]

Options:
 soprano - Start only the Soprano TTS virtual sink script
 rvc - Start only the RVC realtime GUI
 both - Start both in separate terminal windows (default)
 help - Show this help message

EOF
}
|
||||||
|
|
||||||
|
# Start soprano script
|
||||||
|
# Activate the virtual environment and run the Soprano streaming script
# in the foreground of the current terminal.
start_soprano() {
    print_header "Starting Soprano TTS Virtual Sink"

    print_info "Activating virtual environment..."
    # shellcheck disable=SC1091
    . "$VENV_PATH/bin/activate"

    print_info "Starting soprano_to_virtual_sink.py..."
    print_info "This will create a virtual sink: soprano_to_rvc"
    printf '\n'

    python "$SOPRANO_SCRIPT"
}
|
||||||
|
|
||||||
|
# Start RVC GUI
|
||||||
|
# Activate the virtual environment, switch to the RVC checkout and run the
# realtime GUI in the foreground of the current terminal.
start_rvc() {
    print_header "Starting RVC Realtime GUI"

    print_info "Activating virtual environment..."
    # shellcheck disable=SC1091
    . "$VENV_PATH/bin/activate"

    print_info "Changing to RVC directory..."
    cd "$RVC_DIR"

    print_info "Starting RVC GUI..."
    printf '\n'
    print_warning "IMPORTANT: In the RVC GUI, select 'soprano_to_rvc.monitor' as your INPUT device!"
    printf '\n'
    # Short pause so the warning above can be read before the GUI starts.
    sleep 2

    python "$RVC_GUI"
}
|
||||||
|
|
||||||
|
# Start both in separate terminals
|
||||||
|
# Open a new terminal window that runs "<this script> <component>" and then
# drops into an interactive shell so the window stays open.
#   $1 - terminal emulator to use (one of the names detected in start_both)
#   $2 - component to start: "soprano" or "rvc"
_launch_in_terminal() {
    local terminal="$1" component="$2"

    # Use the absolute script path: "$0" may be relative to the directory the
    # user started from, which no longer resolves after the cd below.
    local script_path
    script_path="$SCRIPT_DIR/$(basename -- "${BASH_SOURCE[0]}")"

    # %q shell-quotes each value so spaces or quotes in paths survive bash -c.
    local cmd
    printf -v cmd 'cd %q && bash %q %q; exec bash' "$SCRIPT_DIR" "$script_path" "$component"

    case "$terminal" in
        gnome-terminal)
            gnome-terminal -- bash -c "$cmd" &
            ;;
        konsole)
            konsole -e bash -c "$cmd" &
            ;;
        xfce4-terminal)
            # -x passes the remaining arguments through as the command,
            # avoiding a second level of string quoting.
            xfce4-terminal -x bash -c "$cmd" &
            ;;
        alacritty)
            alacritty -e bash -c "$cmd" &
            ;;
        kitty)
            kitty bash -c "$cmd" &
            ;;
        xterm)
            xterm -e bash -c "$cmd" &
            ;;
    esac
}

# Start both in separate terminals: Soprano first, then the RVC GUI, and
# print a short setup guide. Exits 1 when no known terminal emulator exists.
start_both() {
    print_header "Starting Both Components"

    print_info "This will open two terminal windows:"
    print_info " 1. Soprano TTS Virtual Sink (for text input)"
    print_info " 2. RVC Realtime GUI (for voice conversion)"
    echo ""

    # Detect terminal emulator (first match in preference order wins)
    TERMINAL=""
    local candidate
    for candidate in gnome-terminal konsole xfce4-terminal alacritty kitty xterm; do
        if command -v "$candidate" &> /dev/null; then
            TERMINAL="$candidate"
            break
        fi
    done

    if [ -z "$TERMINAL" ]; then
        print_error "No suitable terminal emulator found"
        print_info "Please start the components manually:"
        print_info " Terminal 1: $0 soprano"
        print_info " Terminal 2: $0 rvc"
        exit 1
    fi

    print_success "Using terminal: $TERMINAL"

    # Start soprano in new terminal
    print_info "Starting Soprano TTS in new terminal..."
    _launch_in_terminal "$TERMINAL" soprano

    # Give the Soprano window a head start before the GUI is launched.
    sleep 2

    # Start RVC in new terminal
    print_info "Starting RVC GUI in new terminal..."
    _launch_in_terminal "$TERMINAL" rvc

    echo ""
    print_success "Both components started in separate terminals"
    echo ""
    print_header "Quick Setup Guide"
    echo "1. In the RVC GUI window:"
    echo " - Select your RVC model (.pth file)"
    echo " - Select the corresponding index file"
    echo " - Choose 'soprano_to_rvc.monitor' as INPUT device"
    echo " - Choose your speakers/headphones as OUTPUT device"
    echo " - Click 'Start Voice Conversion'"
    echo ""
    echo "2. In the Soprano TTS window:"
    echo " - Type any text you want to convert"
    echo " - Press Enter to generate and stream"
    echo ""
    echo "3. Listen to the RVC-converted output!"
    echo ""
    print_info "Press Ctrl+C in each terminal to stop"
    echo ""
}
|
||||||
|
|
||||||
|
# Main script
|
||||||
|
# Entry point: dispatch on the first argument (defaults to "both").
main() {
    local mode="${1:-both}"

    case "$mode" in
        soprano|rvc|both)
            # All three run modes share the same prerequisite check and map
            # directly onto a start_<mode> function.
            check_prerequisites
            "start_${mode}"
            ;;
        help|--help|-h)
            show_usage
            ;;
        *)
            print_error "Unknown option: $1"
            show_usage
            exit 1
            ;;
    esac
}

main "$@"
|
||||||
1
python-version.txt
Normal file
1
python-version.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
3.10.19
|
||||||
159
requirements.lock.txt
Normal file
159
requirements.lock.txt
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
absl-py==2.3.1
|
||||||
|
accelerate==1.12.0
|
||||||
|
aiofiles==23.2.1
|
||||||
|
aiohappyeyeballs==2.6.1
|
||||||
|
aiohttp==3.13.3
|
||||||
|
aiosignal==1.4.0
|
||||||
|
altair==5.5.0
|
||||||
|
annotated-doc==0.0.4
|
||||||
|
annotated-types==0.7.0
|
||||||
|
antlr4-python3-runtime==4.8
|
||||||
|
anyio==4.12.1
|
||||||
|
async-timeout==5.0.1
|
||||||
|
attrs==25.4.0
|
||||||
|
audioread==3.1.0
|
||||||
|
av==16.1.0
|
||||||
|
bitarray==3.8.0
|
||||||
|
brotli==1.2.0
|
||||||
|
certifi==2026.1.4
|
||||||
|
cffi==2.0.0
|
||||||
|
charset-normalizer==3.4.4
|
||||||
|
click==8.3.1
|
||||||
|
colorama==0.4.6
|
||||||
|
coloredlogs==15.0.1
|
||||||
|
contourpy==1.3.2
|
||||||
|
cycler==0.12.1
|
||||||
|
Cython==3.2.4
|
||||||
|
decorator==5.2.1
|
||||||
|
einops==0.8.1
|
||||||
|
exceptiongroup==1.3.1
|
||||||
|
fairseq==0.12.2
|
||||||
|
faiss-cpu==1.7.3
|
||||||
|
fastapi==0.88.0
|
||||||
|
ffmpeg-python==0.2.0
|
||||||
|
ffmpy==0.3.1
|
||||||
|
filelock==3.20.0
|
||||||
|
flatbuffers==25.12.19
|
||||||
|
fonttools==4.61.1
|
||||||
|
frozenlist==1.8.0
|
||||||
|
fsspec==2025.12.0
|
||||||
|
future==1.0.0
|
||||||
|
gradio==3.48.0
|
||||||
|
gradio_client==0.6.1
|
||||||
|
groovy==0.1.2
|
||||||
|
grpcio==1.76.0
|
||||||
|
h11==0.16.0
|
||||||
|
hf-xet==1.2.0
|
||||||
|
httpcore==1.0.9
|
||||||
|
httpx==0.28.1
|
||||||
|
huggingface-hub==0.36.0
|
||||||
|
humanfriendly==10.0
|
||||||
|
hydra-core==1.0.7
|
||||||
|
hyper-connections==0.4.0
|
||||||
|
idna==3.11
|
||||||
|
importlib_resources==6.5.2
|
||||||
|
inflect==7.5.0
|
||||||
|
Jinja2==3.1.3
|
||||||
|
joblib==1.5.3
|
||||||
|
json5==0.13.0
|
||||||
|
jsonschema==4.26.0
|
||||||
|
jsonschema-specifications==2025.9.1
|
||||||
|
kiwisolver==1.4.9
|
||||||
|
lazy_loader==0.4
|
||||||
|
librosa==0.10.2
|
||||||
|
linkify-it-py==2.0.3
|
||||||
|
llvmlite==0.39.0
|
||||||
|
local-attention==1.11.2
|
||||||
|
lxml==6.0.2
|
||||||
|
Markdown==3.10
|
||||||
|
markdown-it-py==2.2.0
|
||||||
|
MarkupSafe==2.1.5
|
||||||
|
matplotlib==3.10.8
|
||||||
|
matplotlib-inline==0.2.1
|
||||||
|
mdit-py-plugins==0.3.3
|
||||||
|
mdurl==0.1.2
|
||||||
|
more-itertools==10.8.0
|
||||||
|
mpmath==1.3.0
|
||||||
|
msgpack==1.1.2
|
||||||
|
multidict==6.7.0
|
||||||
|
narwhals==2.15.0
|
||||||
|
networkx==3.4.2
|
||||||
|
numba==0.56.4
|
||||||
|
numpy==1.23.5
|
||||||
|
omegaconf==2.0.6
|
||||||
|
onnxruntime==1.23.2
|
||||||
|
onnxruntime-gpu==1.23.2
|
||||||
|
orjson==3.11.5
|
||||||
|
packaging==25.0
|
||||||
|
pandas==2.3.3
|
||||||
|
pillow==10.4.0
|
||||||
|
platformdirs==4.5.1
|
||||||
|
pooch==1.8.2
|
||||||
|
portalocker==3.2.0
|
||||||
|
praat-parselmouth==0.4.7
|
||||||
|
propcache==0.4.1
|
||||||
|
protobuf==6.33.3
|
||||||
|
psutil==7.2.1
|
||||||
|
pyasn1==0.6.1
|
||||||
|
pyasn1_modules==0.4.2
|
||||||
|
pycparser==2.23
|
||||||
|
pydantic==1.10.26
|
||||||
|
pydantic_core==2.41.5
|
||||||
|
pydub==0.25.1
|
||||||
|
Pygments==2.19.2
|
||||||
|
pyparsing==3.3.1
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
python-dotenv==1.2.1
|
||||||
|
python-multipart==0.0.21
|
||||||
|
pytorch-triton-rocm==3.1.0
|
||||||
|
pytz==2025.2
|
||||||
|
pyworld==0.3.2
|
||||||
|
PyYAML==6.0.3
|
||||||
|
referencing==0.37.0
|
||||||
|
regex==2025.11.3
|
||||||
|
requests==2.32.5
|
||||||
|
resampy==0.4.3
|
||||||
|
rich==14.2.0
|
||||||
|
rpds-py==0.30.0
|
||||||
|
sacrebleu==2.5.1
|
||||||
|
safehttpx==0.1.7
|
||||||
|
safetensors==0.7.0
|
||||||
|
scikit-learn==1.7.2
|
||||||
|
scipy==1.15.3
|
||||||
|
semantic-version==2.10.0
|
||||||
|
shellingham==1.5.4
|
||||||
|
six==1.17.0
|
||||||
|
-e git+https://github.com/ekwek1/soprano.git@5c759351f9e115aa364d5f4453ddaa7ee0d6f15e#egg=soprano_tts
|
||||||
|
sounddevice==0.5.3
|
||||||
|
soundfile==0.13.1
|
||||||
|
soxr==1.0.0
|
||||||
|
starlette==0.22.0
|
||||||
|
sympy==1.13.1
|
||||||
|
tabulate==0.9.0
|
||||||
|
tensorboard==2.20.0
|
||||||
|
tensorboard-data-server==0.7.2
|
||||||
|
tensorboardX==2.6.4
|
||||||
|
threadpoolctl==3.6.0
|
||||||
|
tokenizers==0.22.2
|
||||||
|
tomlkit==0.13.3
|
||||||
|
torch==2.5.1+rocm6.2
|
||||||
|
torchaudio==2.5.1+rocm6.2
|
||||||
|
torchcrepe==0.0.23
|
||||||
|
torchfcpe==0.0.4
|
||||||
|
torchvision==0.20.1+rocm6.2
|
||||||
|
tornado==6.5.4
|
||||||
|
tqdm==4.67.1
|
||||||
|
traitlets==5.14.3
|
||||||
|
transformers==4.57.3
|
||||||
|
typeguard==4.4.4
|
||||||
|
typer==0.21.1
|
||||||
|
typing-inspection==0.4.2
|
||||||
|
typing_extensions==4.15.0
|
||||||
|
tzdata==2025.3
|
||||||
|
uc-micro-py==1.0.3
|
||||||
|
Unidecode==1.4.0
|
||||||
|
urllib3==2.6.3
|
||||||
|
uvicorn==0.40.0
|
||||||
|
websockets==11.0.3
|
||||||
|
Werkzeug==3.1.5
|
||||||
|
yarl==1.22.0
|
||||||
86
requirements.txt
Normal file
86
requirements.txt
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
# ==========================================================
|
||||||
|
# Unified Soprano + RVC environment
|
||||||
|
# Python == 3.10.19
|
||||||
|
# ROCm == 6.2
|
||||||
|
# ==========================================================
|
||||||
|
|
||||||
|
# ----------------------
|
||||||
|
# Core ML / GPU stack
|
||||||
|
# ----------------------
|
||||||
|
torch==2.5.1+rocm6.2
|
||||||
|
torchaudio==2.5.1+rocm6.2
|
||||||
|
torchvision==0.20.1+rocm6.2
|
||||||
|
pytorch-triton-rocm==3.1.0
|
||||||
|
|
||||||
|
# ----------------------
|
||||||
|
# Numerical stack (RVC-safe)
|
||||||
|
# ----------------------
|
||||||
|
numpy==1.23.5
|
||||||
|
scipy==1.15.3
|
||||||
|
scikit-learn==1.7.2
|
||||||
|
|
||||||
|
# ----------------------
|
||||||
|
# Audio processing
|
||||||
|
# ----------------------
|
||||||
|
sounddevice==0.5.3
|
||||||
|
soundfile==0.13.1
|
||||||
|
pydub==0.25.1
|
||||||
|
librosa==0.10.2
|
||||||
|
soxr==1.0.0
|
||||||
|
resampy==0.4.3
|
||||||
|
praat-parselmouth==0.4.7
|
||||||
|
pyworld==0.3.2
|
||||||
|
av==16.1.0
|
||||||
|
|
||||||
|
# ----------------------
|
||||||
|
# RVC core
|
||||||
|
# ----------------------
|
||||||
|
fairseq==0.12.2
|
||||||
|
faiss-cpu==1.7.3
|
||||||
|
numba==0.56.4
|
||||||
|
llvmlite==0.39.0
|
||||||
|
torchcrepe==0.0.23
|
||||||
|
torchfcpe==0.0.4
|
||||||
|
einops==0.8.1
|
||||||
|
local-attention==1.11.2
|
||||||
|
omegaconf==2.0.6
|
||||||
|
hydra-core==1.0.7
|
||||||
|
|
||||||
|
# ----------------------
|
||||||
|
# Soprano TTS
|
||||||
|
# ----------------------
|
||||||
|
transformers==4.57.3
|
||||||
|
accelerate==1.12.0
|
||||||
|
tokenizers==0.22.2
|
||||||
|
safetensors==0.7.0
|
||||||
|
huggingface-hub==0.36.0
|
||||||
|
inflect==7.5.0
|
||||||
|
Unidecode==1.4.0
|
||||||
|
|
||||||
|
# ----------------------
|
||||||
|
# Web / UI
|
||||||
|
# ----------------------
|
||||||
|
fastapi==0.88.0
|
||||||
|
starlette==0.22.0
|
||||||
|
uvicorn==0.40.0
|
||||||
|
gradio==3.48.0
|
||||||
|
gradio_client==0.6.1
|
||||||
|
python-multipart==0.0.21
|
||||||
|
orjson==3.11.5
|
||||||
|
|
||||||
|
# ----------------------
|
||||||
|
# Utilities
|
||||||
|
# ----------------------
|
||||||
|
tqdm==4.67.1
|
||||||
|
rich==14.2.0
|
||||||
|
psutil==7.2.1
|
||||||
|
requests==2.32.5
|
||||||
|
regex==2025.11.3
|
||||||
|
filelock==3.20.0
|
||||||
|
packaging==25.0
|
||||||
|
PyYAML==6.0.3
|
||||||
|
|
||||||
|
# ----------------------
|
||||||
|
# Editable installs (local)
|
||||||
|
# ----------------------
|
||||||
|
-e git+https://github.com/ekwek1/soprano.git@5c759351f9e115aa364d5f4453ddaa7ee0d6f15e#egg=soprano_tts
|
||||||
52
setup_alsa_bridge.sh
Executable file
52
setup_alsa_bridge.sh
Executable file
@@ -0,0 +1,52 @@
|
|||||||
|
#!/bin/bash
# Setup script to make soprano_to_rvc available as an ALSA device for RVC.
#
# Appends a pcm.soprano_rvc / ctl.soprano_rvc pair (type pulse, pointing at
# the monitor of the virtual sink) to ~/.asoundrc. Running it again is a
# no-op once the pcm.soprano_rvc entry is present.

ASOUND_RC="$HOME/.asoundrc"
SINK_NAME="soprano_to_rvc"

echo "Setting up ALSA configuration for ${SINK_NAME}..."

# Check if our configuration already exists (-F: match the "." literally)
if grep -qF "pcm.soprano_rvc" "$ASOUND_RC" 2>/dev/null; then
    echo "✓ Configuration already exists in .asoundrc"
else
    # Back up the existing .asoundrc, but only when it is about to change
    if [ -f "$ASOUND_RC" ]; then
        if ! cp "$ASOUND_RC" "${ASOUND_RC}.backup.$(date +%s)"; then
            echo "✗ Could not back up existing .asoundrc; leaving it untouched" >&2
            exit 1
        fi
        echo "✓ Backed up existing .asoundrc"
    fi

    echo "Adding ALSA configuration..."

    # Unquoted heredoc so ${SINK_NAME} expands; the body contains no other
    # characters that the shell would interpret.
    cat >> "$ASOUND_RC" << EOF

# Soprano to RVC bridge
pcm.soprano_rvc {
    type pulse
    device ${SINK_NAME}.monitor
    hint {
        show on
        description "Soprano TTS to RVC Bridge"
    }
}

ctl.soprano_rvc {
    type pulse
    device ${SINK_NAME}.monitor
}
EOF

    echo "✓ Added ALSA configuration to .asoundrc"
fi

echo ""
# 70-character rule; printf repeats the "=" once per brace-expanded argument.
printf '=%.0s' {1..70}
echo ""
echo "Setup complete!"
echo ""
echo "The virtual device 'soprano_rvc' is now available as an ALSA device."
echo ""
echo "In RVC GUI:"
echo " 1. Set device type to 'ALSA'"
echo " 2. Select 'soprano_rvc' or 'Soprano TTS to RVC Bridge' as input"
echo " 3. Make sure the soprano_to_virtual_sink.py script is running"
echo ""
|
||||||
1
soprano
Submodule
1
soprano
Submodule
Submodule soprano added at 5c759351f9
299
soprano_to_virtual_sink.py
Executable file
299
soprano_to_virtual_sink.py
Executable file
@@ -0,0 +1,299 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Soprano TTS to Virtual Sink
|
||||||
|
This script takes text input and streams Soprano TTS output to a virtual PulseAudio sink
|
||||||
|
that can be used as input for RVC realtime voice conversion.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import signal
|
||||||
|
import sounddevice as sd
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from scipy import signal as scipy_signal
|
||||||
|
|
||||||
|
# Add soprano to path
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'soprano'))
|
||||||
|
from soprano import SopranoTTS
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
VIRTUAL_SINK_NAME = "soprano_to_rvc"
|
||||||
|
SAMPLE_RATE = 48000 # Use 48kHz for better compatibility with audio systems
|
||||||
|
SOPRANO_RATE = 32000 # Soprano outputs at 32kHz
|
||||||
|
CHANNELS = 2 # Use stereo to match RVC expectations
|
||||||
|
|
||||||
|
# Global flag for graceful shutdown
|
||||||
|
running = True
|
||||||
|
|
||||||
|
|
||||||
|
def signal_handler(sig, frame):
    """Request a graceful shutdown when a signal (Ctrl+C) arrives.

    Prints a notice and clears the module-level ``running`` flag; both
    arguments are the standard signal-handler parameters and are unused.
    """
    global running

    print("\n\nShutting down gracefully...")
    running = False  # polled by the loops that should stop
|
||||||
|
|
||||||
|
|
||||||
|
def create_virtual_sink():
    """Create the PulseAudio null sink used for audio output, if missing.

    Returns:
        bool: True when the sink already exists or was created successfully;
        False when ``pactl`` is not installed or ``load-module`` fails.
    """
    # Check if sink already exists
    try:
        result = subprocess.run(
            ["pactl", "list", "sinks", "short"],
            capture_output=True,
            text=True,
            check=True,
        )
    except FileNotFoundError:
        print("✗ Failed to create virtual sink: 'pactl' not found")
        return False
    except subprocess.CalledProcessError:
        # Listing failed; fall through and let load-module report the error.
        pass
    else:
        # The second column of each short-listing line is the sink name.
        # Compare whole names so a longer name that merely contains ours
        # is not mistaken for the sink.
        sink_names = {
            fields[1]
            for fields in (line.split() for line in result.stdout.splitlines())
            if len(fields) > 1
        }
        if VIRTUAL_SINK_NAME in sink_names:
            print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' already exists")
            print(f" Monitor source: {VIRTUAL_SINK_NAME}.monitor")
            return True

    print(f"Creating virtual sink: {VIRTUAL_SINK_NAME}")
    try:
        # Create a null sink (virtual audio device) at SAMPLE_RATE with the
        # same channel count the output stream is opened with.
        subprocess.run(
            [
                "pactl", "load-module", "module-null-sink",
                f"sink_name={VIRTUAL_SINK_NAME}",
                f"sink_properties=device.description={VIRTUAL_SINK_NAME}",
                f"rate={SAMPLE_RATE}",
                f"channels={CHANNELS}",
            ],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"✗ Failed to create virtual sink: {e.stderr}")
        return False

    print(f"✓ Virtual sink '{VIRTUAL_SINK_NAME}' created successfully")
    print(f" Monitor source: {VIRTUAL_SINK_NAME}.monitor")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def remove_virtual_sink():
    """Unload the null-sink module(s) that back the virtual sink.

    Best-effort cleanup: every failure is reported on stdout and never
    raised, so it is safe to call from ``finally`` blocks.
    """
    print(f"\nRemoving virtual sink: {VIRTUAL_SINK_NAME}")
    sink_arg = f"sink_name={VIRTUAL_SINK_NAME}"
    try:
        # Find the module ID(s)
        result = subprocess.run(
            ["pactl", "list", "modules", "short"],
            capture_output=True,
            text=True,
            check=True,
        )

        removed = False
        for line in result.stdout.splitlines():
            # Each line is "<id> <module name> <arguments...>".
            parts = line.split(None, 2)
            if len(parts) < 3 or parts[1] != "module-null-sink":
                # Skip other modules that merely mention the sink name
                # (e.g. something attached to its monitor source).
                continue
            if sink_arg not in parts[2].split():
                continue
            subprocess.run(["pactl", "unload-module", parts[0]], check=True)
            removed = True

        if removed:
            print("✓ Virtual sink removed")
        else:
            print(f"⚠️ No loaded module found for virtual sink '{VIRTUAL_SINK_NAME}'")
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"✗ Error removing virtual sink: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def get_virtual_sink_device_id():
    """Get the sounddevice ID for our virtual sink.

    Returns:
        int | None: Index of the first output-capable device whose name
        contains ``VIRTUAL_SINK_NAME``, or None when there is none.
    """
    # Force refresh device list. These are private sounddevice hooks;
    # NOTE(review): confirm they still exist when upgrading sounddevice.
    sd._terminate()
    sd._initialize()

    for index, device in enumerate(sd.query_devices()):
        if VIRTUAL_SINK_NAME not in device['name']:
            continue
        # Skip input-only entries (such as a monitor source carrying the
        # same name); they cannot be opened as an output stream.
        if device['max_output_channels'] > 0:
            return index
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def stream_to_virtual_sink(tts_model, text, chunk_size=1):
    """Stream Soprano TTS output for *text* to the virtual sink.

    Each chunk yielded by the model is flattened to mono, resampled from
    ``SOPRANO_RATE`` to ``SAMPLE_RATE``, clipped to [-1, 1], duplicated to
    stereo and written to the sink's output stream.

    Args:
        tts_model: Object providing ``infer_stream(text, chunk_size=...)``
            that yields audio chunks (torch tensors or array-likes).
        text: Text to synthesize.
        chunk_size: Forwarded unchanged to ``infer_stream``.

    Returns:
        bool: True when streaming finished or was stopped through the
        ``running`` flag; False when the sink could not be found or
        created, or when an error occurred while streaming.
    """
    device_id = get_virtual_sink_device_id()

    if device_id is None:
        print(f"✗ Could not find virtual sink device: {VIRTUAL_SINK_NAME}")
        print(f"⚠️ Attempting to recreate virtual sink...")
        if not create_virtual_sink():
            return False

        # Wait a moment for the device to appear
        import time
        time.sleep(1.0)
        device_id = get_virtual_sink_device_id()
        if device_id is None:
            print(f"✗ Still could not find virtual sink after recreation")
            print(f"\n📋 Available devices:")
            for i, dev in enumerate(sd.query_devices()):
                name = dev['name']
                if 'soprano' in name.lower() or 'rvc' in name.lower():
                    print(f" {i}: {name}")
            return False

    device_info = sd.query_devices(device_id)
    print(f"✓ Using output device: {device_info['name']}")

    # The stream is always opened at SAMPLE_RATE, whatever default rate the
    # device reports, so that is the rate shown here.
    print(f" Sample rate: {SAMPLE_RATE} Hz")
    print(f"\n🎤 Generating and streaming speech...")
    print(f"Text: \"{text}\"\n")

    try:
        # Generate streaming audio from soprano
        stream = tts_model.infer_stream(text, chunk_size=chunk_size)

        # Open output stream to virtual sink
        with sd.OutputStream(
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype='float32',
            device=device_id,
            blocksize=0,
        ) as out_stream:
            first_chunk = True
            for chunk in stream:
                if not running:
                    break

                if first_chunk:
                    print("✓ First audio chunk generated and streaming started")
                    first_chunk = False

                # Convert torch tensor to numpy if needed
                if isinstance(chunk, torch.Tensor):
                    chunk = chunk.detach().cpu().numpy()

                # Soprano audio is treated as mono: collapse any layout
                # ((N,), (1, N), (N, 1), ...) to one flat array.
                chunk_1d = np.asarray(chunk).reshape(-1)
                if chunk_1d.size == 0:
                    # Nothing to resample or play for an empty chunk.
                    continue

                # Check for invalid values before resampling
                if not np.all(np.isfinite(chunk_1d)):
                    print(f"⚠️ Warning: Invalid values in soprano output, cleaning...")
                    chunk_1d = np.nan_to_num(chunk_1d, nan=0.0, posinf=1.0, neginf=-1.0)

                # Resample from SOPRANO_RATE to SAMPLE_RATE if they differ
                if SOPRANO_RATE != SAMPLE_RATE:
                    num_samples = int(len(chunk_1d) * SAMPLE_RATE / SOPRANO_RATE)
                    chunk_resampled = scipy_signal.resample(chunk_1d, num_samples)
                else:
                    chunk_resampled = chunk_1d

                # Ensure no NaN or inf values after resampling
                if not np.all(np.isfinite(chunk_resampled)):
                    print(f"⚠️ Warning: Invalid values after resampling, cleaning...")
                    chunk_resampled = np.nan_to_num(chunk_resampled, nan=0.0, posinf=1.0, neginf=-1.0)
                # Resampling can overshoot slightly; keep samples in range.
                chunk_resampled = np.clip(chunk_resampled, -1.0, 1.0)

                # Reshape to (N, 2) for stereo output (duplicate mono to both channels)
                chunk_stereo = np.column_stack((chunk_resampled, chunk_resampled)).astype(np.float32)

                # Write to virtual sink
                out_stream.write(chunk_stereo)

        print("✓ Speech generation and streaming completed")
        return True

    except Exception as e:
        # Keep the interactive prompt alive: report the failure and return.
        print(f"✗ Error during streaming: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the interactive text-to-virtual-sink loop.

    Creates the virtual sink, loads the Soprano model, then repeatedly
    reads a line of text and streams the synthesized speech. The sink is
    removed again on exit.

    Returns:
        int: 0 on a normal exit, 1 when the sink or the model could not be
        set up.
    """
    # Set up signal handler for graceful shutdown
    signal.signal(signal.SIGINT, signal_handler)

    print("=" * 70)
    print("Soprano TTS to Virtual Sink for RVC")
    print("=" * 70)
    print()

    # Create virtual sink
    if not create_virtual_sink():
        print("\n⚠️ If sink already exists, removing and recreating...")
        remove_virtual_sink()
        if not create_virtual_sink():
            print("✗ Failed to create virtual sink. Exiting.")
            return 1

    print()
    print("=" * 70)
    print("Virtual sink setup complete!")
    print("=" * 70)
    print()
    print("📝 Next steps:")
    print(f" 1. Open RVC realtime GUI (gui_v1.py)")
    print(f" 2. Select '{VIRTUAL_SINK_NAME}.monitor' as the INPUT device")
    print(f" 3. Select your desired output device")
    print(f" 4. Load your RVC model and start conversion")
    print(f" 5. Return here and type text to convert")
    print()
    print("=" * 70)
    print()

    # Initialize Soprano TTS
    print("🔄 Loading Soprano TTS model...")
    try:
        tts = SopranoTTS(
            backend='auto',
            device='auto',
            cache_size_mb=100,
            decoder_batch_size=1,
        )
        print("✓ Soprano TTS model loaded successfully")
    except Exception as e:
        print(f"✗ Failed to load Soprano TTS: {e}")
        remove_virtual_sink()
        return 1

    print()
    print("=" * 70)
    print("Ready! Type text to generate speech (Ctrl+C to exit)")
    print("=" * 70)
    print()

    # Main loop - get text input and generate speech
    try:
        while running:
            # The flag-only handler cannot wake a blocked input(): the read
            # is simply retried after the handler returns. Switch to the
            # default handler while waiting so Ctrl+C raises
            # KeyboardInterrupt, then restore the graceful handler so
            # streaming can stop through the ``running`` flag.
            signal.signal(signal.SIGINT, signal.default_int_handler)
            try:
                text = input("\n🎙️ Enter text: ").strip()
            except EOFError:
                break
            finally:
                signal.signal(signal.SIGINT, signal_handler)

            if not text:
                print("⚠️ Please enter some text")
                continue

            if text.lower() in ['quit', 'exit', 'q']:
                break

            # Stream the text to the virtual sink
            stream_to_virtual_sink(tts, text, chunk_size=1)
            print()

    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by user")

    finally:
        # Clean up
        remove_virtual_sink()
        print("\n✓ Cleanup complete. Goodbye!")

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
Reference in New Issue
Block a user