Spaces:
Running
Running
import torch | |
import soundfile as sf | |
import uuid | |
import gradio as gr | |
import numpy as np | |
import re | |
from parler_tts import ParlerTTSForConditionalGeneration | |
from transformers import AutoTokenizer | |
# Checkpoint identifier and execution device shared by the rest of the script.
model_name = "ai4bharat/indic-parler-tts"
device = "cpu"

# Load the TTS model, move it to the target device and switch to eval mode.
print("Loading model...")
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)
model = model.eval()

# `tokenizer` comes from the TTS checkpoint itself; `desc_tokenizer` is loaded
# from the checkpoint that the model config names as its text encoder.
tokenizer = AutoTokenizer.from_pretrained(model_name)
text_encoder_name = model.config.text_encoder._name_or_path
desc_tokenizer = AutoTokenizer.from_pretrained(text_encoder_name)

# Dynamically quantize the model's nn.Linear modules to qint8.
print("Applying dynamic quantization...")
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8,
)
# Sentence splitter
def split_text(text, max_len=150):
    """Split *text* into chunks of at most *max_len* characters.

    The text is first split into sentences after ``.``, ``!``, ``?`` or the
    danda ``।`` when followed by whitespace (spaces or newlines). A sentence
    that fits within *max_len* becomes one chunk; a longer sentence is
    wrapped on word boundaries so that no chunk exceeds *max_len*.

    A single word longer than *max_len* is never cut; it is emitted as its
    own (over-long) chunk.

    Args:
        text: The input text. ``None`` or blank text yields an empty list.
        max_len: Maximum chunk length in characters.

    Returns:
        A list of non-empty chunk strings in their original order.
    """
    if not text:
        return []

    sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
    refined = []
    for sentence in sentences:
        if not sentence:
            continue
        if len(sentence) <= max_len:
            refined.append(sentence)
            continue

        # Wrap an over-long sentence on word boundaries. The buffer is
        # flushed *before* adding a word that would push it past max_len,
        # so emitted chunks stay within the limit.
        buffer = []
        buf_len = 0
        for word in sentence.split():
            needed = len(word) + (1 if buffer else 0)  # +1 for the joining space
            if buffer and buf_len + needed > max_len:
                refined.append(' '.join(buffer))
                buffer = []
                buf_len = 0
                needed = len(word)
            buffer.append(word)
            buf_len += needed
        if buffer:
            refined.append(' '.join(buffer))
    return refined
# Core TTS function
def synthesize(language, text, gender="Female", emotion=None, speed=None):
    """Synthesize *text* to speech and return the path of the written WAV file.

    The text is split into chunks with ``split_text``, audio is generated
    for each chunk with the quantized model, the pieces are concatenated,
    and the result is written to a uniquely named ``.wav`` file in the
    current working directory.

    Args:
        language: Language name used in the voice description (lower-cased).
        text: The text to synthesize.
        gender: Speaker gender used in the voice description. Defaults to
            ``"Female"``, which reproduces the previously hard-coded voice.
        emotion: Accepted for interface compatibility; currently unused.
        speed: Accepted for interface compatibility; currently unused.

    Returns:
        The filename of the generated WAV file.

    Raises:
        gr.Error: If no language is selected or the text is blank.
    """
    if not language:
        raise gr.Error("Please select a language.")

    # Drop blank chunks so the model is never asked to speak an empty prompt.
    text_chunks = [chunk for chunk in split_text(text or "") if chunk.strip()]
    if not text_chunks:
        raise gr.Error("Please enter some text to synthesize.")

    speaker_gender = (gender or "Female").lower()
    description = (
        f"A native {language.lower()} {speaker_gender} speaker with an expressive tone."
    )

    # The description is the same for every chunk, so tokenize it once.
    desc_input = desc_tokenizer(description, return_tensors="pt").to(device)

    audio_chunks = []
    with torch.no_grad():
        for chunk in text_chunks:
            prompt_input = tokenizer(chunk, return_tensors="pt").to(device)
            output = quantized_model.generate(
                input_ids=desc_input.input_ids,
                attention_mask=desc_input.attention_mask,
                prompt_input_ids=prompt_input.input_ids,
                prompt_attention_mask=torch.ones_like(prompt_input.input_ids).to(device),
            )
            audio_chunks.append(output.cpu().numpy().squeeze())

    full_audio = np.concatenate(audio_chunks)
    filename = f"{uuid.uuid4().hex}.wav"
    sf.write(filename, full_audio, quantized_model.config.sampling_rate)
    return filename
# Gradio UI
# Single source of truth for the selectable languages, so the dropdown and the
# page description cannot drift apart.
LANGUAGES = ["Malayalam", "Hindi", "Tamil", "English", "Kannada"]

iface = gr.Interface(
    fn=synthesize,
    inputs=[
        # An explicit default ensures the callback never receives language=None.
        gr.Dropdown(LANGUAGES, value=LANGUAGES[0], label="Language"),
        gr.Textbox(label="Text to Synthesize", lines=6, placeholder="Enter your sentence here..."),
        # Additional voice controls, currently disabled:
        # gr.Radio(["Male", "Female"], label="Speaker Gender"),
        # gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Emotion"),
        # gr.Dropdown(["Slow", "Moderate", "Fast"], label="Speaking Rate"),
        # gr.Dropdown(["Low", "Normal", "High"], label="Pitch"),
        # gr.Dropdown(["Basic", "Refined"], label="Voice Quality"),
    ],
    outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
    title="Multilingual Indic TTS (Quantized + Chunked)",
    description=(
        "CPU-based TTS with quantized Parler-TTS and chunked input for "
        f"{', '.join(LANGUAGES[:-1])}, and {LANGUAGES[-1]}."
    ),
)
iface.launch()