import gradio as gr

import alias as _alias  # local module, apparently imported only for its side effects
import outetts
import json
import tempfile
import hashlib
import os
from typing import Optional

from outetts.models.info import MODEL_INFO
from outetts.utils import helpers
from huggingface_hub import hf_hub_download
import torch
from transformers import BitsAndBytesConfig
import spaces


# Map model-name strings to their outetts.Models enum members.
MODELS = {model.value: model for model in outetts.Models}
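# Illustration (hypothetical; exact entries depend on the installed outetts version):
#   MODELS == {"OuteTTS-0.3-500M": Models.VERSION_0_3_SIZE_500M,
#              "OuteTTS-1.0-0.6B": Models.VERSION_1_0_SIZE_0_6B, ...}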


# Preferred GGUF quantization per model for the CPU (llama.cpp) fallback path.
MODEL_QUANTIZATION = {
    outetts.Models.VERSION_0_1_SIZE_350M: outetts.LlamaCppQuantization.Q8_0,
    outetts.Models.VERSION_0_2_SIZE_500M: outetts.LlamaCppQuantization.Q8_0,
    outetts.Models.VERSION_0_3_SIZE_500M: outetts.LlamaCppQuantization.Q8_0,
}
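# Models missing from this table fall back to Q6_K in get_interface() below.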


# Process-level cache of speaker profiles, keyed by interface version + audio file hash.
speaker_cache = {}


def get_file_hash(file_path):
    """Calculate the MD5 hash of a file for caching purposes."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        # Hash in 4 KiB chunks so large audio files are never loaded into memory whole.
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
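# Usage sketch (hypothetical path, for illustration only):
#   get_file_hash("reference.wav")  # -> 32-char hex digest such as "d41d8cd9..."
# MD5 is acceptable here because it is only a cache key, not a security boundary.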


def try_ggml_model(model: outetts.Models, backend: outetts.Backend, quantization: outetts.LlamaCppQuantization):
    """Download a pre-quantized GGUF build of the model and wrap it in a ModelConfig."""
    model_config = MODEL_INFO[model]
    repo = f"OuteAI/{model.value}-GGUF"
    filename = f"{model.value}-{quantization.value}.gguf"
    model_path = hf_hub_download(
        repo_id=repo,
        filename=filename,
        local_dir=os.path.join(helpers.get_cache_dir(), "gguf"),
        local_files_only=False
    )
    return outetts.ModelConfig(
        model_path=model_path,
        tokenizer_path=f"OuteAI/{model.value}",
        backend=backend,
        n_gpu_layers=99,  # offload every layer to the GPU if llama.cpp finds one
        verbose=False,
        device=None,
        dtype=None,
        additional_model_config={},
        audio_codec_path=None,
        **model_config
    )
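# Example of the names this builds (assuming quantization.value == "Q6_K"):
#   repo     -> "OuteAI/OuteTTS-1.0-0.6B-GGUF"
#   filename -> "OuteTTS-1.0-0.6B-Q6_K.gguf"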


@spaces.GPU
def get_interface(model_name: str):
    """Build an interface for the model (not cached, to avoid CUDA memory issues)."""
    model = MODELS[model_name]

    has_cuda = torch.cuda.is_available()
    if has_cuda:
        # GPU path: load the HF checkpoint in 8-bit via bitsandbytes.
        model_config = MODEL_INFO[model]
        config = outetts.ModelConfig(
            model_path=f"OuteAI/{model_name}",
            tokenizer_path=f"OuteAI/{model_name}",
            backend=outetts.Backend.HF,
            additional_model_config={
                "device_map": "auto",
                "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
            },
            **model_config
        )
    else:
        # CPU path: fall back to a quantized GGUF build served through llama.cpp.
        quantization = MODEL_QUANTIZATION.get(model, outetts.LlamaCppQuantization.Q6_K)
        config = try_ggml_model(model, outetts.Backend.LLAMACPP, quantization)

    interface = outetts.Interface(config=config)
    return interface
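# Usage sketch (loads full model weights, so it is expensive to call):
#   interface = get_interface("OuteTTS-1.0-0.6B")  # name must be a key of MODELS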


def get_or_create_speaker(interface, audio_file):
    """Get a speaker profile from the cache, or create and cache one on a miss."""
    # Key on the interface version as well as the file hash, so a profile built for
    # one model generation is never reused with an incompatible one.
    file_hash = get_file_hash(audio_file)
    cache_key = f"{interface.config.interface_version}_{file_hash}"

    if cache_key in speaker_cache:
        print(f"✅ Using cached speaker profile for {os.path.basename(audio_file)}")
        return speaker_cache[cache_key]

    print(f"🔄 Creating new speaker profile for {os.path.basename(audio_file)}")
    speaker = interface.create_speaker(audio_file, whisper_model="large-v3-turbo")

    speaker_cache[cache_key] = speaker
    print(f"💾 Cached speaker profile ({len(speaker_cache)} total cached)")

    return speaker
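# Note: speaker_cache lives in process memory, so it empties whenever the app restarts.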


def create_speaker_and_generate(model_name, audio_file, test_text: Optional[str] = None, temperature: float = 0.4):
    """Create a speaker profile from audio and optionally generate test audio."""
    if audio_file is None:
        return "Please upload an audio file to create a speaker profile.", None

    interface = get_interface(model_name)
    speaker = get_or_create_speaker(interface, audio_file)
    speaker_json = json.dumps(speaker, indent=2, ensure_ascii=False)

    generated_audio = None
    if test_text and test_text.strip():
        output = interface.generate(
            config=outetts.GenerationConfig(
                text=test_text,
                speaker=speaker,
                sampler_config=outetts.SamplerConfig(
                    temperature=temperature
                ),
                max_length=MODEL_INFO[MODELS[model_name]]["max_seq_length"]
            )
        )

        # delete=False keeps the file on disk after the block so Gradio can serve it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output.save(f.name)
            generated_audio = f.name

    return speaker_json, generated_audio
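# Wired to the Gradio form below, but callable directly too (hypothetical path):
#   profile_json, wav_path = create_speaker_and_generate(
#       "OuteTTS-1.0-0.6B", "reference.wav", "Quick smoke test.", temperature=0.4)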


example_text = "Hello, this is a test of the OuteTTS speaker profile."


demo = gr.Interface(
    fn=create_speaker_and_generate,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[-1],  # default to the last (newest) model
            label="Select OuteTTS Model",
            info="Choose the model variant to use"
        ),
        gr.Audio(
            label="Upload Reference Audio",
            type="filepath",
            sources=["upload", "microphone"]
        ),
        gr.Textbox(
            label="Test Text (Optional)",
            placeholder="Enter text to generate speech (leave empty to only create a speaker profile)...",
            lines=3,
            value=None
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            step=0.1,
            value=0.4,
            label="Temperature",
            info="Controls randomness in generation"
        )
    ],
    outputs=[
        gr.Textbox(
            label="Speaker Profile (JSON)",
            lines=15,
            max_lines=20,
            show_copy_button=True
        ),
        gr.Audio(
            label="Generated Test Audio (if text provided)",
            type="filepath"
        )
    ],
    title="🎙️ OuteTTS Speaker Creator",
    description="Create and manage speaker profiles for OuteTTS text-to-speech synthesis. Upload audio to create a speaker profile, and optionally provide test text to generate sample audio.",
    theme=gr.themes.Soft(),
    examples=[
        ["OuteTTS-1.0-0.6B", None, example_text, 0.2],
        ["OuteTTS-0.3-500M", None, example_text, 0.2],
    ],
    cache_examples=False,  # examples carry no reference audio, so there is nothing to cache
    flagging_mode="never"
)


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (needed inside a container/Space)
        server_port=7860,
        share=False,
        show_api=True,
        show_error=True
    )