Spaces:
Runtime error
Runtime error
updated dockerfile and added scripts
Browse files- Dockerfile +36 -47
- scripts/tortoise_tts.py +390 -0
- setup.py +40 -0
- tortoise_tts.ipynb +268 -0
Dockerfile
CHANGED
|
@@ -1,47 +1,36 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
ENV
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
echo "
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
conda install
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
# Make port 8501 available to the world outside this container
|
| 38 |
-
EXPOSE 8501
|
| 39 |
-
|
| 40 |
-
# Define environment variable
|
| 41 |
-
ENV NAME tortoise-tts
|
| 42 |
-
|
| 43 |
-
# List the contents of the /app directory
|
| 44 |
-
RUN ls -al /app
|
| 45 |
-
|
| 46 |
-
# Run the application
|
| 47 |
-
CMD ["streamlit", "run", "app.py"]
|
|
|
|
# Dockerfile for the tortoise-tts Streamlit app: CUDA base image + miniconda env.
FROM nvidia/cuda:12.2.0-base-ubuntu22.04

COPY . /app

RUN apt-get update && \
    apt-get install -y --allow-unauthenticated --no-install-recommends \
        wget \
        git \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/*

ENV HOME="/root"
ENV CONDA_DIR="${HOME}/miniconda"
ENV PATH="$CONDA_DIR/bin":$PATH
ENV CONDA_AUTO_UPDATE_CONDA=false
ENV PIP_DOWNLOAD_CACHE="$HOME/.pip/cache"
# BUG FIX: the previous "ENV TORTOISE_MODELS_DIR" had no value, which is invalid
# Dockerfile syntax and aborts the build. Default to tortoise's standard cache
# location (override at build/run time if custom checkpoints are used).
ENV TORTOISE_MODELS_DIR="${HOME}/.cache/tortoise/models"

RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda3.sh \
    && bash /tmp/miniconda3.sh -b -p "${CONDA_DIR}" -f -u \
    && "${CONDA_DIR}/bin/conda" init bash \
    && rm -f /tmp/miniconda3.sh \
    && echo ". '${CONDA_DIR}/etc/profile.d/conda.sh'" >> "${HOME}/.profile"

# --login option used to source bashrc (thus activating conda env) at every RUN statement
SHELL ["/bin/bash", "--login", "-c"]

RUN conda create --name tortoise python=3.9 numba inflect \
    && conda activate tortoise \
    && conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia \
    && conda install transformers=4.29.2 \
    && conda install streamlit \
    && cd /app \
    && python setup.py install

# Make port 8501 (Streamlit's default) available outside this container.
EXPOSE 8501

# BUG FIX: "streamlit run app.py" was previously chained onto the build RUN,
# which launches the server at image-build time and hangs/kills the build.
# Launch the app at container start instead, inside the conda env.
CMD ["/bin/bash", "--login", "-c", "conda activate tortoise && cd /app && streamlit run app.py"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/tortoise_tts.py
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env python3
# AGPL: a notification must be added stating that changes have been made to that file.
"""Command-line driver for TorToiSe text-to-speech synthesis.

Option groups are declared as simple_parsing dataclasses below. NOTE: the bare
string literals that follow each dataclass field are consumed by simple_parsing
as the ``--help`` text for that option, so they must remain string literals
(they are not dead code).
"""

import os
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Literal, Optional

import torch
import torchaudio
from simple_parsing import ArgumentParser, field

from tortoise.api import MODELS_DIR, TextToSpeech
from tortoise.utils.audio import load_audio
from tortoise.utils.diffusion import SAMPLERS
from tortoise.models.vocoder import VocConf


@dataclass
class General:
    """General options"""

    text: str = field(positional=True, nargs="*", metavar="text")
    """Text to speak. If omitted, text is read from stdin."""

    voice: str = field(default="random", alias=["-v"])
    """Selects the voice to use for generation. Use the & character to join two voices together.
    Use a comma to perform inference on multiple voices. Set to "all" to use all available voices.
    Note that multiple voices require the --output-dir option to be set."""

    voices_dir: Optional[str] = field(default=None, alias=["-V"])
    """Path to directory containing extra voices to be loaded. Use a comma to specify multiple directories."""

    preset: Literal["ultra_fast", "fast", "standard", "high_quality"] = field(
        default="fast", alias=["-p"]
    )
    """Which voice quality preset to use."""

    quiet: bool = field(default=False, alias=["-q"])
    """Suppress all output."""

    voicefixer: bool = field(default=True)
    """Enable/Disable voicefixer"""


@dataclass
class Output:
    """Output options"""

    list_voices: bool = field(default=False, alias=["-l"])
    """List available voices and exit."""

    play: bool = field(default=False, alias=["-P"])
    """Play the audio (requires pydub)."""

    output: Optional[Path] = field(default=None, alias=["-o"])
    """Save the audio to a file."""

    output_dir: Path = field(default=Path("results/"), alias=["-O"])
    """Save the audio to a directory as individual segments."""


@dataclass
class MultiOutput:
    """Multi-output options"""

    candidates: int = 1
    """How many output candidates to produce per-voice. Note that only the first candidate is used in the combined output."""

    regenerate: Optional[str] = None
    """Comma-separated list of clip numbers to re-generate."""

    skip_existing: bool = False
    """Set to skip re-generating existing clips."""


@dataclass
class Advanced:
    """Advanced options"""

    produce_debug_state: bool = False
    """Whether or not to produce debug_states in current directory, which can aid in reproducing problems."""

    seed: Optional[int] = None
    """Random seed which can be used to reproduce results."""

    models_dir: str = MODELS_DIR
    """Where to find pretrained model checkpoints. Tortoise automatically downloads these to
    ~/.cache/tortoise/.models, so this should only be specified if you have custom checkpoints."""

    text_split: Optional[str] = None
    """How big chunks to split the text into, in the format <desired_length>,<max_length>."""

    disable_redaction: bool = False
    """Normally text enclosed in brackets are automatically redacted from the spoken output
    (but are still rendered by the model), this can be used for prompt engineering.
    Set this to disable this behavior."""

    device: Optional[str] = None
    """Device to use for inference."""

    batch_size: Optional[int] = None
    """Batch size to use for inference. If omitted, the batch size is set based on available GPU memory."""

    vocoder: Literal["Univnet", "BigVGAN", "BigVGAN_Base"] = "BigVGAN_Base"
    """Pretrained vocoder to be used.
    Univnet - tortoise original
    BigVGAN - 112M model
    BigVGAN_Base - 14M model
    """

    ar_checkpoint: Optional[str] = None
    """Path to a checkpoint to use for the autoregressive model. If omitted, the default checkpoint is used."""

    clvp_checkpoint: Optional[str] = None
    """Path to a checkpoint to use for the CLVP model. If omitted, the default checkpoint is used."""

    diff_checkpoint: Optional[str] = None
    """Path to a checkpoint to use for the diffusion model. If omitted, the default checkpoint is used."""


@dataclass
class Tuning:
    """Tuning options (overrides preset settings)"""

    num_autoregressive_samples: Optional[int] = None
    """Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
    As TorToiSe is a probabilistic model, more samples means a higher probability of creating something "great"."""

    temperature: Optional[float] = None
    """The softmax temperature of the autoregressive model."""

    length_penalty: Optional[float] = None
    """A length penalty applied to the autoregressive decoder. Higher settings causes the model to produce more terse outputs."""

    repetition_penalty: Optional[float] = None
    """A penalty that prevents the autoregressive decoder from repeating itself during decoding.
    Can be used to reduce the incidence of long silences or "uhhhhhhs", etc."""

    top_p: Optional[float] = None
    """P value used in nucleus sampling. 0 to 1. Lower values mean the decoder produces more "likely" (aka boring) outputs."""

    max_mel_tokens: Optional[int] = None
    """Restricts the output length. 1 to 600. Each unit is 1/20 of a second."""

    cvvp_amount: Optional[float] = None
    """How much the CVVP model should influence the output.
    Increasing this can in some cases reduce the likelihood of multiple speakers."""

    diffusion_iterations: Optional[int] = None
    """Number of diffusion steps to perform. More steps means the network has more chances to iteratively
    refine the output, which should theoretically mean a higher quality output.
    Generally a value above 250 is not noticeably better, however."""

    cond_free: Optional[bool] = None
    """Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for
    each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output
    of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and
    dramatically improves realism."""

    cond_free_k: Optional[float] = None
    """Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
    As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
    Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k"""

    diffusion_temperature: Optional[float] = None
    """Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
    are the "mean" prediction of the diffusion network and will sound bland and smeared."""


@dataclass
class Speed:
    """New/speed options"""

    low_vram: bool = False
    """re-enable default offloading behaviour of tortoise"""

    half: bool = False
    """enable autocast to half precision for autoregressive model"""

    no_cache: bool = False
    """disable kv_cache usage. This should really only be used if you are very low on vram."""

    sampler: Optional[str] = field(default=None, choices=SAMPLERS)
    """override the sampler used for diffusion (default depends on --preset)"""

    original_tortoise: bool = False
    """ensure results are identical to original tortoise-tts repo"""


if __name__ == "__main__":
    parser = ArgumentParser(
        description="TorToiSe is a text-to-speech program that is capable of synthesizing speech "
        "in multiple voices with realistic prosody and intonation."
    )
    # bugs out for some reason
    # parser.add_argument(
    #     "--web",
    #     action="store_true",
    #     help="launch the webui (doesn't pass it the other arguments)",
    # )
    parser.add_arguments(General, "general")
    parser.add_arguments(Output, "output")
    parser.add_arguments(MultiOutput, "multi_output")
    parser.add_arguments(Advanced, "advanced")
    parser.add_arguments(Tuning, "tuning")
    parser.add_arguments(Speed, "speed")

    usage_examples = f"""
Examples:

    Read text using random voice and place it in a file:

        {parser.prog} -o hello.wav "Hello, how are you?"

    Read text from stdin and play it using the tom voice:

        echo "Say it like you mean it!" | {parser.prog} -P -v tom

    Read a text file using multiple voices and save the audio clips to a directory:

        {parser.prog} -O /tmp/tts-results -v tom,emma <textfile.txt
"""

    # show usage even when Ctrl+C is pressed early
    try:
        args = parser.parse_args()
    except SystemExit as e:
        if e.code == 0:
            print(usage_examples)
        sys.exit(e.code)
    # bugs out for some reason
    # if args.web:
    #     from importlib import import_module
    #     app = import_module("app")
    #     sys.exit(app.main())

    # Imported late so that --help / --list-voices stay fast.
    from tortoise.inference import (
        check_pydub,
        get_all_voices,
        get_seed,
        parse_multiarg_text,
        parse_voice_str,
        split_text,
        validate_output_dir,
        voice_loader,
        save_gen_with_voicefix
    )

    # get voices
    all_voices, extra_voice_dirs = get_all_voices(args.general.voices_dir)
    if args.output.list_voices:
        for v in all_voices:
            print(v)
        sys.exit(0)
    selected_voices = parse_voice_str(args.general.voice, all_voices)
    voice_generator = voice_loader(selected_voices, extra_voice_dirs)

    # parse text
    if not args.general.text:
        print("reading text from stdin!")
    text = parse_multiarg_text(args.general.text)
    texts = split_text(text, args.advanced.text_split)

    output_dir = validate_output_dir(
        args.output.output_dir, selected_voices, args.multi_output.candidates
    )

    # error out early if pydub isn't installed
    pydub = check_pydub(args.output.play)

    seed = get_seed(args.advanced.seed)
    verbose = not args.general.quiet

    vocoder = getattr(VocConf, args.advanced.vocoder)
    if verbose:
        print("Loading tts...")
    tts = TextToSpeech(
        models_dir=args.advanced.models_dir,
        enable_redaction=not args.advanced.disable_redaction,
        device=args.advanced.device,
        autoregressive_batch_size=args.advanced.batch_size,
        high_vram=not args.speed.low_vram,
        kv_cache=not args.speed.no_cache,
        ar_checkpoint=args.advanced.ar_checkpoint,
        clvp_checkpoint=args.advanced.clvp_checkpoint,
        diff_checkpoint=args.advanced.diff_checkpoint,
        vocoder=vocoder,
    )

    gen_settings = {
        "use_deterministic_seed": seed,
        "verbose": verbose,
        "k": args.multi_output.candidates,
        "preset": args.general.preset,
    }
    # Tuning flags override the preset only when the user actually set them.
    tuning_options = [
        "num_autoregressive_samples",
        "temperature",
        "length_penalty",
        "repetition_penalty",
        "top_p",
        "max_mel_tokens",
        "cvvp_amount",
        "diffusion_iterations",
        "cond_free",
        "cond_free_k",
        "diffusion_temperature",
    ]
    for option in tuning_options:
        if getattr(args.tuning, option) is not None:
            gen_settings[option] = getattr(args.tuning, option)

    # NOTE(review): "original_tortoise" and "half" default to False (not None),
    # so they are always forwarded; only "sampler" is truly optional here.
    speed_options = [
        "sampler",
        "original_tortoise",
        "half",
    ]
    for option in speed_options:
        if getattr(args.speed, option) is not None:
            gen_settings[option] = getattr(args.speed, option)

    total_clips = len(texts) * len(selected_voices)
    regenerate_clips = (
        [int(x) for x in args.multi_output.regenerate.split(",")]
        if args.multi_output.regenerate
        else None
    )
    for voice_idx, (voice, voice_samples, conditioning_latents) in enumerate(
        voice_generator
    ):
        audio_parts = []
        for text_idx, text in enumerate(texts):
            clip_name = f'{"-".join(voice)}_{text_idx:02d}'
            if args.output.output_dir:
                first_clip = os.path.join(args.output.output_dir, f"{clip_name}_00.wav")
                # Reuse an existing clip when skipping is requested, or when a
                # selective --regenerate list excludes this clip index.
                if (
                    args.multi_output.skip_existing
                    or (regenerate_clips and text_idx not in regenerate_clips)
                ) and os.path.exists(first_clip):
                    audio_parts.append(load_audio(first_clip, 24000))
                    if verbose:
                        print(f"Skipping {clip_name}")
                    continue
            if verbose:
                print(
                    f"Rendering {clip_name} ({(voice_idx * len(texts) + text_idx + 1)} of {total_clips})..."
                )
                print(" " + text)
            gen = tts.tts_with_preset(
                text,
                voice_samples=voice_samples,
                conditioning_latents=conditioning_latents,
                **gen_settings,
            )
            # With a single candidate, tts_with_preset returns a bare tensor.
            gen = gen if args.multi_output.candidates > 1 else [gen]
            for candidate_idx, audio in enumerate(gen):
                audio = audio.squeeze(0).cpu()
                if candidate_idx == 0:
                    # Only the first candidate contributes to the combined output.
                    audio_parts.append(audio)
                if args.output.output_dir:
                    filename = f"{clip_name}_{candidate_idx:02d}.wav"
                    save_gen_with_voicefix(audio, os.path.join(args.output.output_dir, filename), squeeze=False, voicefixer=args.general.voicefixer)

        audio = torch.cat(audio_parts, dim=-1)
        if args.output.output_dir:
            filename = f'{"-".join(voice)}_combined.wav'
            save_gen_with_voicefix(
                audio,
                os.path.join(args.output.output_dir, filename),
                squeeze=False,
                voicefixer=args.general.voicefixer,
            )
        elif args.output.output:
            # BUG FIX: previously "args.output.output or os.tmp" — os.tmp does
            # not exist; this branch only runs when --output was given, so use
            # the provided path directly.
            filename = args.output.output
            save_gen_with_voicefix(audio, filename, squeeze=False, voicefixer=args.general.voicefixer)
        elif args.output.play:
            print("WARNING: cannot use voicefixer with --play")
            f = tempfile.NamedTemporaryFile(suffix=".wav", delete=True)
            torchaudio.save(f.name, audio, 24000)
            pydub.playback.play(pydub.AudioSegment.from_wav(f.name))

        if args.advanced.produce_debug_state:
            os.makedirs("debug_states", exist_ok=True)
            dbg_state = (seed, texts, voice_samples, conditioning_latents, args)
            torch.save(
                dbg_state, os.path.join("debug_states", f'debug_{"-".join(voice)}.pth')
            )
|
setup.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import setuptools

# The long description shown on package indexes is taken verbatim from the README.
with open("README.md", "r", encoding="utf-8") as readme:
    long_description = readme.read()

setuptools.setup(
    # Package identity and metadata.
    name="TorToiSe",
    version="2.8.0",
    author="James Betker",
    author_email="[email protected]",
    description="A high quality multi-voice text-to-speech library",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/neonbjb/tortoise-tts",
    project_urls={},
    # Package contents: all discoverable packages plus the CLI entry script.
    packages=setuptools.find_packages(),
    include_package_data=True,
    scripts=[
        "scripts/tortoise_tts.py",
    ],
    # Runtime dependencies (transformers/deepspeed pinned for compatibility).
    install_requires=[
        "tqdm",
        "rotary_embedding_torch",
        "inflect",
        "progressbar",
        "einops",
        "unidecode",
        "scipy",
        "librosa",
        "transformers==4.29.2",
        "tokenizers",
        "deepspeed==0.8.3",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
)
|
tortoise_tts.ipynb
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"id": "_pIZ3ZXNp7cf"
|
| 7 |
+
},
|
| 8 |
+
"source": [
|
| 9 |
+
"Welcome to Tortoise! 🐢🐢🐢🐢\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"Before you begin, I **strongly** recommend you turn on a GPU runtime.\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"There's a reason this is called \"Tortoise\" - this model takes up to a minute to perform inference for a single sentence on a GPU. Expect waits on the order of hours on a CPU."
|
| 14 |
+
]
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"cell_type": "code",
|
| 18 |
+
"execution_count": null,
|
| 19 |
+
"metadata": {
|
| 20 |
+
"colab": {
|
| 21 |
+
"base_uri": "https://localhost:8080/",
|
| 22 |
+
"height": 1000
|
| 23 |
+
},
|
| 24 |
+
"id": "JrK20I32grP6",
|
| 25 |
+
"outputId": "9711e23e-3bfc-4cb0-c030-25a1cf460972"
|
| 26 |
+
},
|
| 27 |
+
"outputs": [],
|
| 28 |
+
"source": [
|
| 29 |
+
"!git clone https://github.com/DjKesu/tortoise-tts-fast-cloning.git\n",
|
| 30 |
+
"%cd tortoise-tts-fast-cloning\n",
|
| 31 |
+
"!pip3 install -r requirements.txt --no-deps\n",
|
| 32 |
+
"!pip3 install -e .\n",
|
| 33 |
+
"!pip3 install git+https://github.com/152334H/BigVGAN.git"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"cell_type": "code",
|
| 38 |
+
"execution_count": null,
|
| 39 |
+
"metadata": {},
|
| 40 |
+
"outputs": [],
|
| 41 |
+
"source": [
|
| 42 |
+
"!pip uninstall transformers\n",
|
| 43 |
+
"!pip install transformers==4.29.2"
|
| 44 |
+
]
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"cell_type": "markdown",
|
| 48 |
+
"metadata": {
|
| 49 |
+
"id": "zRW4p3ftjZ3Y"
|
| 50 |
+
},
|
| 51 |
+
"source": [
|
| 52 |
+
"## **Restart the runtime!**\n",
|
| 53 |
+
"## Ctrl+M for Colab"
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"cell_type": "code",
|
| 58 |
+
"execution_count": null,
|
| 59 |
+
"metadata": {
|
| 60 |
+
"id": "Gen09NM4hONQ"
|
| 61 |
+
},
|
| 62 |
+
"outputs": [],
|
| 63 |
+
"source": [
|
| 64 |
+
"#@title # Setup\n",
|
| 65 |
+
"# Imports used through the rest of the notebook.\n",
|
| 66 |
+
"import torch\n",
|
| 67 |
+
"import torchaudio\n",
|
| 68 |
+
"import torch.nn as nn\n",
|
| 69 |
+
"import torch.nn.functional as F\n",
|
| 70 |
+
"\n",
|
| 71 |
+
"import IPython\n",
|
| 72 |
+
"\n",
|
| 73 |
+
"from tortoise.api import TextToSpeech\n",
|
| 74 |
+
"from tortoise.utils.audio import load_audio, load_voice, load_voices\n",
|
| 75 |
+
"\n",
|
| 76 |
+
"# This will download all the models used by Tortoise from the HuggingFace hub.\n",
|
| 77 |
+
"tts = TextToSpeech()"
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"cell_type": "code",
|
| 82 |
+
"execution_count": null,
|
| 83 |
+
"metadata": {
|
| 84 |
+
"id": "bt_aoxONjfL2"
|
| 85 |
+
},
|
| 86 |
+
"outputs": [],
|
| 87 |
+
"source": [
|
| 88 |
+
"# This is the text that will be spoken.\n",
|
| 89 |
+
"text = \"Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?\" #@param {type:\"string\"}\n",
|
| 90 |
+
"#@markdown Show code for multiline text input\n",
|
| 91 |
+
"# Here's something for the poetically inclined.. (set text=)\n",
|
| 92 |
+
"\"\"\"\n",
|
| 93 |
+
"Then took the other, as just as fair,\n",
|
| 94 |
+
"And having perhaps the better claim,\n",
|
| 95 |
+
"Because it was grassy and wanted wear;\n",
|
| 96 |
+
"Though as for that the passing there\n",
|
| 97 |
+
"Had worn them really about the same,\"\"\"\n",
|
| 98 |
+
"\n",
|
| 99 |
+
"# Pick a \"preset mode\" to determine quality. Options: {\"ultra_fast\", \"fast\" (default), \"standard\", \"high_quality\"}. See docs in api.py\n",
|
| 100 |
+
"# added very_fast preset param option, since it involves resolution with dpm++2m, expected to give the best, fastest results\n",
|
| 101 |
+
"preset = \"ultra_fast\" #@param [\"ultra_fast\", \"fast\", \"standard\", \"high_quality\", \"very_fast\"]"
|
| 102 |
+
]
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"cell_type": "code",
|
| 106 |
+
"execution_count": null,
|
| 107 |
+
"metadata": {
|
| 108 |
+
"colab": {
|
| 109 |
+
"base_uri": "https://localhost:8080/",
|
| 110 |
+
"height": 211
|
| 111 |
+
},
|
| 112 |
+
"id": "SSleVnRAiEE2",
|
| 113 |
+
"outputId": "45b950c7-5c39-4075-bb34-0a76bf19e1bc"
|
| 114 |
+
},
|
| 115 |
+
"outputs": [],
|
| 116 |
+
"source": [
|
| 117 |
+
"#@markdown Tortoise will attempt to mimic voices you provide. It comes pre-packaged\n",
|
| 118 |
+
"#@markdown with some voices you might recognize.\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"#@markdown Let's list all the voices available. These are just some random clips I've gathered\n",
|
| 121 |
+
"#@markdown from the internet as well as a few voices from the training dataset.\n",
|
| 122 |
+
"#@markdown Feel free to add your own clips to the voices/ folder.\n",
|
| 123 |
+
"#@markdown Currently stored my voice clips under voices/krish/ and displaying the random rumblings of my voice.\n",
|
| 124 |
+
"#@markdown each cell is the samples used, skip unless you wanna listen to them\n",
|
| 125 |
+
"%cd tortoise-tts-fast-cloning\n",
|
| 126 |
+
"%ls tortoise/voices/krish\n",
|
| 127 |
+
"import IPython\n",
|
| 128 |
+
"IPython.display.Audio('tortoise/voices/krish/1.wav')"
|
| 129 |
+
]
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"cell_type": "code",
|
| 133 |
+
"execution_count": null,
|
| 134 |
+
"metadata": {},
|
| 135 |
+
"outputs": [],
|
| 136 |
+
"source": [
|
| 137 |
+
"%cd tortoise-tts-fast-cloning\n",
|
| 138 |
+
"%ls tortoise/voices/krish\n",
|
| 139 |
+
"import IPython\n",
|
| 140 |
+
"IPython.display.Audio('tortoise/voices/krish/2.wav')"
|
| 141 |
+
]
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"cell_type": "code",
|
| 145 |
+
"execution_count": null,
|
| 146 |
+
"metadata": {},
|
| 147 |
+
"outputs": [],
|
| 148 |
+
"source": [
|
| 149 |
+
"%cd tortoise-tts-fast-cloning\n",
|
| 150 |
+
"%ls tortoise/voices/krish\n",
|
| 151 |
+
"import IPython\n",
|
| 152 |
+
"IPython.display.Audio('tortoise/voices/krish/3.wav')"
|
| 153 |
+
]
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"cell_type": "code",
|
| 157 |
+
"execution_count": null,
|
| 158 |
+
"metadata": {},
|
| 159 |
+
"outputs": [],
|
| 160 |
+
"source": [
|
| 161 |
+
"%cd tortoise-tts-fast-cloning\n",
|
| 162 |
+
"%ls tortoise/voices/krish\n",
|
| 163 |
+
"import IPython\n",
|
| 164 |
+
"IPython.display.Audio('tortoise/voices/krish/4.wav')"
|
| 165 |
+
]
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"cell_type": "code",
|
| 169 |
+
"execution_count": null,
|
| 170 |
+
"metadata": {
|
| 171 |
+
"cellView": "form",
|
| 172 |
+
"colab": {
|
| 173 |
+
"base_uri": "https://localhost:8080/",
|
| 174 |
+
"height": 192
|
| 175 |
+
},
|
| 176 |
+
"id": "KEXOKjIvn6NW",
|
| 177 |
+
"outputId": "90c803f3-0b9b-4f24-ccbc-d3f3dcbde48c"
|
| 178 |
+
},
|
| 179 |
+
"outputs": [],
|
| 180 |
+
"source": [
|
| 181 |
+
"#@markdown Pick one of the voices from the output above\n",
|
| 182 |
+
"voice = 'krish' #@param {type:\"string\"}\n",
|
| 183 |
+
"\n",
|
| 184 |
+
"#@markdown Load it and send it through Tortoise.\n",
|
| 185 |
+
"voice_samples, conditioning_latents = load_voice(voice)\n",
|
| 186 |
+
"print(voice_samples)\n",
|
| 187 |
+
"# conditioning_latents = tts.get_conditioning_latents(\n",
|
| 188 |
+
"# voice_samples,\n",
|
| 189 |
+
"# return_mels=False, # Set to True if you want mel spectrograms to be returned\n",
|
| 190 |
+
"# latent_averaging_mode=1, # Choose the mode (0, 1, or 2) as needed\n",
|
| 191 |
+
"# original_tortoise=False, # Set to True or False as needed\n",
|
| 192 |
+
"# )\n",
|
| 193 |
+
"gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, \n",
|
| 194 |
+
" preset=preset)\n",
|
| 195 |
+
"torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)\n",
|
| 196 |
+
"IPython.display.Audio('generated.wav')"
|
| 197 |
+
]
|
| 198 |
+
},
|
| 199 |
+
{
|
| 200 |
+
"cell_type": "code",
|
| 201 |
+
"execution_count": null,
|
| 202 |
+
"metadata": {
|
| 203 |
+
"colab": {
|
| 204 |
+
"base_uri": "https://localhost:8080/",
|
| 205 |
+
"height": 41
|
| 206 |
+
},
|
| 207 |
+
"id": "VQgw3KeV8Yqb",
|
| 208 |
+
"outputId": "13db770e-3fcc-4b27-ab78-07a603a299d9"
|
| 209 |
+
},
|
| 210 |
+
"outputs": [],
|
| 211 |
+
"source": [
|
| 212 |
+
"#@markdown Optionally, use your own voice by running the next two cells. Change the voice name to the one you want, and before running make sure\n",
|
| 213 |
+
"#@markdown you upload at least 2 audio clips. They must be a WAV file, 6-10 seconds long.\n",
|
| 214 |
+
"CUSTOM_VOICE_NAME = \"custom\"\n",
|
| 215 |
+
"\n",
|
| 216 |
+
"import os\n",
|
| 217 |
+
"from google.colab import files\n",
|
| 218 |
+
"\n",
|
| 219 |
+
"custom_voice_folder = f\"tortoise/voices/{CUSTOM_VOICE_NAME}\"\n",
|
| 220 |
+
"os.makedirs(custom_voice_folder)\n",
|
| 221 |
+
"for i, file_data in enumerate(files.upload().values()):\n",
|
| 222 |
+
" with open(os.path.join(custom_voice_folder, f'{i}.wav'), 'wb') as f:\n",
|
| 223 |
+
" f.write(file_data)"
|
| 224 |
+
]
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"cell_type": "code",
|
| 228 |
+
"execution_count": null,
|
| 229 |
+
"metadata": {
|
| 230 |
+
"id": "jJnJwv3R9uWT"
|
| 231 |
+
},
|
| 232 |
+
"outputs": [],
|
| 233 |
+
"source": [
|
| 234 |
+
"# Generate speech with the custom voice.\n",
|
| 235 |
+
"voice_samples, conditioning_latents = load_voices(CUSTOM_VOICE_NAME)\n",
|
| 236 |
+
"gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, \n",
|
| 237 |
+
" preset=preset)\n",
|
| 238 |
+
"torchaudio.save(f'generated-{CUSTOM_VOICE_NAME}.wav', gen.squeeze(0).cpu(), 24000)\n",
|
| 239 |
+
"IPython.display.Audio(f'generated-{CUSTOM_VOICE_NAME}.wav')"
|
| 240 |
+
]
|
| 241 |
+
}
|
| 242 |
+
],
|
| 243 |
+
"metadata": {
|
| 244 |
+
"accelerator": "GPU",
|
| 245 |
+
"colab": {
|
| 246 |
+
"provenance": []
|
| 247 |
+
},
|
| 248 |
+
"kernelspec": {
|
| 249 |
+
"display_name": "Python 3 (ipykernel)",
|
| 250 |
+
"language": "python",
|
| 251 |
+
"name": "python3"
|
| 252 |
+
},
|
| 253 |
+
"language_info": {
|
| 254 |
+
"codemirror_mode": {
|
| 255 |
+
"name": "ipython",
|
| 256 |
+
"version": 3
|
| 257 |
+
},
|
| 258 |
+
"file_extension": ".py",
|
| 259 |
+
"mimetype": "text/x-python",
|
| 260 |
+
"name": "python",
|
| 261 |
+
"nbconvert_exporter": "python",
|
| 262 |
+
"pygments_lexer": "ipython3",
|
| 263 |
+
"version": "3.9.16"
|
| 264 |
+
}
|
| 265 |
+
},
|
| 266 |
+
"nbformat": 4,
|
| 267 |
+
"nbformat_minor": 4
|
| 268 |
+
}
|