Spaces:
Runtime error
Runtime error
| # AGPL: a notification must be added stating that changes have been made to that file. | |
| import os | |
| import shutil | |
| from pathlib import Path | |
| import streamlit as st | |
| from random import randint | |
| from tortoise.api import MODELS_DIR | |
| from tortoise.inference import ( | |
| infer_on_texts, | |
| run_and_save_tts, | |
| split_and_recombine_text, | |
| ) | |
| from tortoise.utils.diffusion import SAMPLERS | |
| from app_utils.filepicker import st_file_selector | |
| from app_utils.conf import TortoiseConfig | |
| from app_utils.funcs import ( | |
| timeit, | |
| load_model, | |
| list_voices, | |
| load_voice_conditionings, | |
| ) | |
# Labels offered in the "Latent averaging mode" radio group.
# Order is significant: main() passes LATENT_MODES.index(<selected label>)
# to the model, so each label's position in this list is the value sent.
LATENT_MODES = [
    "Tortoise original (bad)",
    "average per 4.27s (broken on small files)",
    "average per voice file (broken on small files)",
]
def _is_safe_voice_name(name: str) -> bool:
    """Return True when *name* is a single, non-empty path component.

    The name is joined onto the voices directory and that directory may be
    removed with ``shutil.rmtree``, so anything that could escape the voices
    directory (path separators, ``.`` or ``..``) is rejected.
    """
    if not name or name in {".", ".."}:
        return False
    return "/" not in name and "\\" not in name


def main():
    """Render the Streamlit page: voice creation, settings and generation.

    The page has three parts:

    * "Create New Voice" - stores uploaded WAV samples under
      ``tortoise/voices/<name>/`` and reruns the script.
    * Text / voice / preset selection plus an "Advanced" expander holding
      model, optimisation, text-splitting and debug options.
    * "Start" - runs text-to-speech for every selected voice and shows an
      audio player and download button for each generated file.

    Bare triple-quoted strings inside the body are intentional: Streamlit
    "magic" renders them as markdown.
    """
    conf = TortoiseConfig()

    with st.expander("Create New Voice", expanded=True):
        # The widget keys are random so that replacing them after a voice is
        # created makes Streamlit build fresh (empty) widgets on the rerun.
        if "file_uploader_key" not in st.session_state:
            st.session_state["file_uploader_key"] = str(randint(1000, 100000000))
            st.session_state["text_input_key"] = str(randint(1000, 100000000))

        uploaded_files = st.file_uploader(
            "Upload Audio Samples for a New Voice",
            accept_multiple_files=True,
            type=["wav"],
            key=st.session_state["file_uploader_key"],
        )
        voice_name = st.text_input(
            "New Voice Name",
            help="Enter a name for your new voice.",
            value="",
            key=st.session_state["text_input_key"],
        )
        create_voice_button = st.button(
            "Create Voice",
            disabled=not voice_name.strip() or not uploaded_files,
        )

        if create_voice_button:
            new_voice_name = voice_name.strip().replace(" ", "_")
            if not _is_safe_voice_name(new_voice_name):
                # The name becomes a directory that is deleted and recreated;
                # refuse anything that could point outside the voices folder.
                st.error(
                    "Voice name must be a plain folder name without path separators."
                )
            else:
                with st.spinner(f"Creating new voice: {voice_name}"):
                    voice_dir = Path("tortoise/voices") / new_voice_name
                    # An existing voice with the same name is replaced.
                    if voice_dir.exists():
                        shutil.rmtree(voice_dir)
                    os.makedirs(voice_dir)
                    for index, uploaded_file in enumerate(uploaded_files):
                        sample_path = voice_dir / f"voice_sample{index}.wav"
                        with open(sample_path, "wb") as wav_file:
                            wav_file.write(uploaded_file.read())
                # New keys -> both input widgets come back empty after rerun.
                st.session_state["text_input_key"] = str(randint(1000, 100000000))
                st.session_state["file_uploader_key"] = str(
                    randint(1000, 100000000)
                )
                st.experimental_rerun()

    text = st.text_area(
        "Text",
        help="Text to speak.",
        value="The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.",
    )

    voices = [v for v in os.listdir("tortoise/voices") if v != "cond_latent_example"]
    voice = st.selectbox(
        "Voice",
        voices,
        help="Selects the voice to use for generation. See options in voices/ directory (and add your own!) "
        "Use the & character to join two voices together. Use a comma to perform inference on multiple voices.",
        index=0,
    )
    preset = st.selectbox(
        "Preset",
        (
            "single_sample",
            "ultra_fast",
            "very_fast",
            "ultra_fast_old",
            "fast",
            "standard",
            "high_quality",
        ),
        help="Which voice preset to use.",
        index=1,
    )

    with st.expander("Advanced"):
        col1, col2 = st.columns(2)
        with col1:
            """#### Model parameters"""
            candidates = st.number_input(
                "Candidates",
                help="How many output candidates to produce per-voice.",
                value=1,
            )
            latent_averaging_mode = st.radio(
                "Latent averaging mode",
                LATENT_MODES,
                help="How voice samples should be averaged together.",
                index=0,
            )
            sampler = st.radio(
                "Sampler",
                # SAMPLERS,
                ["dpm++2m", "p", "ddim"],
                help="Diffusion sampler. Note that dpm++2m is experimental and typically requires more steps.",
                index=1,
            )
            steps = st.number_input(
                "Steps",
                help="Override the steps used for diffusion (default depends on preset)",
                value=10,
            )
            seed = st.number_input(
                "Seed",
                help="Random seed which can be used to reproduce results.",
                value=-1,
            )
            # -1 is the UI's "no fixed seed" value.
            if seed == -1:
                seed = None
            voice_fixer = st.checkbox(
                "Voice fixer",
                help="Use `voicefixer` to improve audio quality. This is a post-processing step which can be applied to any output.",
                value=True,
            )
            """#### Directories"""
            output_path = st.text_input(
                "Output Path", help="Where to store outputs.", value="results/"
            )

        with col2:
            """#### Optimizations"""
            high_vram = not st.checkbox(
                "Low VRAM",
                help="Re-enable default offloading behaviour of tortoise",
                value=True,
            )
            half = st.checkbox(
                "Half-Precision",
                help="Enable autocast to half precision for autoregressive model",
                value=False,
            )
            kv_cache = st.checkbox(
                "Key-Value Cache",
                help="Enable kv_cache usage, leading to drastic speedups but worse memory usage",
                value=True,
            )
            cond_free = st.checkbox(
                "Conditioning Free",
                help="Force conditioning free diffusion",
                value=True,
            )
            # NOTE(review): this checkbox is rendered but its value is never
            # passed to the model below - confirm whether that is intended.
            no_cond_free = st.checkbox(
                "Force Not Conditioning Free",
                help="Force disable conditioning free diffusion",
                value=False,
            )

            """#### Text Splitting"""
            min_chars_to_split = st.number_input(
                "Min Chars to Split",
                help="Minimum number of characters to split text on",
                min_value=50,
                value=200,
                step=1,
            )

            """#### Debug"""
            produce_debug_state = st.checkbox(
                "Produce Debug State",
                help="Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.",
                value=True,
            )

    # "." is what gets persisted in the config; the model itself is loaded
    # with None for both checkpoints further down.
    ar_checkpoint = "."
    diff_checkpoint = "."
    if st.button("Update Basic Settings"):
        # EXTRA_VOICES_DIR is deliberately not updated here: this page has no
        # widget that produces a value for it (the previous code referenced an
        # undefined variable and raised NameError).
        conf.update(
            LOW_VRAM=not high_vram,
            AR_CHECKPOINT=ar_checkpoint,
            DIFF_CHECKPOINT=diff_checkpoint,
        )

    ar_checkpoint = None
    diff_checkpoint = None
    tts = load_model(MODELS_DIR, high_vram, kv_cache, ar_checkpoint, diff_checkpoint)

    if st.button("Start"):
        # Report missing selections in the page instead of raising.
        if not (latent_averaging_mode and preset and voice):
            st.error("Select a latent averaging mode, a preset and a voice first.")
            return

        def show_generation(fp, filename: str):
            """Show an audio player and a download button for one output file."""
            st.audio(str(fp), format="audio/wav")
            st.download_button(
                "Download sample",
                # Send the file's bytes; passing the path string would make
                # the browser download a text file containing the path.
                Path(fp).read_bytes(),
                file_name=filename,
                mime="audio/wav",
            )

        with st.spinner(
            f"Generating {candidates} candidates for voice {voice} (seed={seed}). You can see progress in the terminal"
        ):
            os.makedirs(output_path, exist_ok=True)

            # "a,b" -> run once per voice; "a&b" -> blend voices in one run.
            for selected_voice in voice.split(","):
                voice_sel = selected_voice.split("&")
                voice_samples, conditioning_latents = load_voice_conditionings(
                    voice_sel, []
                )
                voice_path = Path(os.path.join(output_path, selected_voice))

                with timeit(
                    f"Generating {candidates} candidates for voice {selected_voice} (seed={seed})"
                ):
                    # Only forward overrides that actually hold a value.
                    nullable_kwargs = {
                        name: value
                        for name, value in (
                            ("sampler", sampler),
                            ("diffusion_iterations", steps),
                            ("cond_free", cond_free),
                        )
                        if value is not None
                    }

                    def call_tts(text: str):
                        """Run the model on *text* with the current settings."""
                        return tts.tts_with_preset(
                            text,
                            k=candidates,
                            voice_samples=voice_samples,
                            conditioning_latents=conditioning_latents,
                            preset=preset,
                            use_deterministic_seed=seed,
                            return_deterministic_state=True,
                            cvvp_amount=0.0,
                            half=half,
                            latent_averaging_mode=LATENT_MODES.index(
                                latent_averaging_mode
                            ),
                            **nullable_kwargs,
                        )

                    if len(text) < min_chars_to_split:
                        # Short input: synthesise in one go.
                        filepaths = run_and_save_tts(
                            call_tts,
                            text,
                            voice_path,
                            return_deterministic_state=True,
                            return_filepaths=True,
                            voicefixer=voice_fixer,
                        )
                    else:
                        # Long input: split into chunks and synthesise each.
                        desired_length = int(min_chars_to_split)
                        texts = split_and_recombine_text(
                            text, desired_length, desired_length + 100
                        )
                        filepaths = infer_on_texts(
                            call_tts,
                            texts,
                            voice_path,
                            return_deterministic_state=True,
                            return_filepaths=True,
                            lines_to_regen=set(range(len(texts))),
                            voicefixer=voice_fixer,
                        )

                    for i, fp in enumerate(filepaths):
                        show_generation(fp, f"{selected_voice}-text-{i}.wav")

        if produce_debug_state:
            """Debug states can be found in the output directory"""
# Run the app only when this file is executed as the main script
# (e.g. by `streamlit run`), not when it is imported.
if __name__ == "__main__":
    main()