# Spaces: Running
import tempfile
import uuid

import gradio as gr
import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
# Checkpoint that supplies the TTS model and the prompt tokenizer.
model_name = "ai4bharat/indic-parler-tts"

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the weights first, then move the module onto the selected device.
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

# Tokenizer for the text that will be spoken (the "prompt").
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The voice description uses the tokenizer of the checkpoint's text encoder,
# whose location is recorded in the model config.
desc_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)
def synthesize(language, text, gender, emotion, speed, pitch, quality):
    """Synthesize ``text`` as speech and return the path of the written WAV file.

    The style selections are folded into one English description sentence that
    is passed to the model as ``input_ids``; ``text`` is tokenized separately
    and passed as ``prompt_input_ids``.

    Args:
        language: Language name, inserted verbatim into the description.
        text: Text to speak; must contain a non-whitespace character.
        gender: Speaker gender label (lower-cased in the description).
        emotion: Emotion label (lower-cased in the description).
        speed: Speaking-rate label (lower-cased in the description).
        pitch: Pitch label (lower-cased in the description).
        quality: Voice-quality label (lower-cased in the description).

    Returns:
        Path of a newly created ``.wav`` file in the system temp directory.

    Raises:
        gr.Error: If ``text`` is blank or any style selection is missing.
    """
    # Reject unusable input up front with a message the UI can display,
    # instead of failing later with an AttributeError on ``None.lower()``.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    selections = {
        "Language": language,
        "Speaker Gender": gender,
        "Emotion": emotion,
        "Speaking Rate": speed,
        "Pitch": pitch,
        "Voice Quality": quality,
    }
    missing = [label for label, choice in selections.items() if not choice]
    if missing:
        raise gr.Error(f"Please select a value for: {', '.join(missing)}.")

    desc = (
        f"A native {language} {gender.lower()} speaker with a {emotion.lower()} and expressive tone, "
        f"speaking at a {speed.lower()} rate with {pitch.lower()} pitch and {quality.lower()} voice quality."
    )
    desc_inputs = desc_tokenizer(desc, return_tensors="pt").to(device)
    text_inputs = tokenizer(text, return_tensors="pt").to(device)

    gen_audio = model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=text_inputs.input_ids,
        # Use the mask the tokenizer produced rather than a hand-built
        # all-ones tensor, so the mask always matches the tokenized prompt.
        prompt_attention_mask=text_inputs.attention_mask,
    )
    audio_np = gen_audio.cpu().numpy().squeeze()

    # Reserve a unique name in the system temp directory (instead of the
    # working directory) so repeated requests do not litter the app folder.
    # The handle is closed before soundfile reopens the path for writing.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        filename = tmp.name
    sf.write(filename, audio_np, model.config.sampling_rate)
    return filename
# Web UI: one text box plus six style selectors feeding synthesize(), with the
# generated file played back through an audio component.
# Every selector gets an explicit default (its first choice) so that pressing
# submit without touching a control never hands None to synthesize().
iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Dropdown(
            ["Malayalam", "English", "Hindi", "Tamil"],
            value="Malayalam",
            label="Language",
        ),
        gr.Textbox(label="Text to Synthesize", lines=4),
        gr.Radio(["Male", "Female"], value="Male", label="Speaker Gender"),
        gr.Dropdown(
            ["Neutral", "Happy", "Sad", "Angry"],
            value="Neutral",
            label="Emotion",
        ),
        gr.Dropdown(
            ["Slow", "Moderate", "Fast"],
            value="Slow",
            label="Speaking Rate",
        ),
        gr.Dropdown(["Low", "Normal", "High"], value="Low", label="Pitch"),
        gr.Dropdown(["Basic", "Refined"], value="Basic", label="Voice Quality"),
    ],
    # synthesize() returns a path on disk, so the component reads a file.
    outputs=gr.Audio(type="filepath", label="Synthesized Audio"),
    allow_flagging="never",
    title="Multilingual TTS using Indic Parler-TTS",
    description="Type text, choose a speaker style, and get synthesized speech for Malayalam, Hindi, Tamil, or English.",
)

# Start the web server; the script is meant to be run directly.
iface.launch()