Spaces:
Running
Running
File size: 3,293 Bytes
66d0bf1 5e6c5bb 66d0bf1 5e6c5bb 66d0bf1 5e6c5bb 66d0bf1 5e6c5bb 18e4b07 5e6c5bb 18e4b07 5e6c5bb 18e4b07 5e6c5bb 18e4b07 5e6c5bb 18e4b07 5e6c5bb 18e4b07 5e6c5bb ef866b7 66d0bf1 18e4b07 5e6c5bb 18e4b07 5e6c5bb 18e4b07 5e6c5bb 18e4b07 5e6c5bb 18e4b07 5e6c5bb 18e4b07 5e6c5bb 18e4b07 66d0bf1 18e4b07 66d0bf1 ef866b7 5e6c5bb ef866b7 18e4b07 66d0bf1 5e6c5bb 18e4b07 66d0bf1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import torch
import soundfile as sf
import uuid
import gradio as gr
import numpy as np
import re
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
# --- Model setup -----------------------------------------------------------
# Checkpoint identifier and the device every tensor/model is moved to.
model_name = "ai4bharat/indic-parler-tts"
device = "cpu"

print("Loading model...")
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)
model.eval()  # inference only: switch off training-mode behaviour

# Two tokenizers are loaded: one from the checkpoint itself and one from the
# checkpoint's text-encoder sub-config.
tokenizer = AutoTokenizer.from_pretrained(model_name)
desc_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)

# Dynamic quantization targets the nn.Linear layers with int8 (qint8).
# The un-quantized `model` stays bound at module level.
print("Applying dynamic quantization...")
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8,
)
# Sentence splitter
def split_text(text, max_len=150):
    """Split *text* into chunks of at most *max_len* characters.

    The text is first cut at sentence boundaries: whitespace (spaces or
    newlines) that follows ``.``, ``!``, ``?`` or the Devanagari danda /
    double danda (U+0964 / U+0965).  A sentence that is still longer than
    *max_len* is then packed greedily word by word so that no emitted chunk
    exceeds the limit.

    Args:
        text: The input string.  ``None`` or an empty / whitespace-only
            string yields an empty list.
        max_len: Maximum number of characters per chunk.

    Returns:
        A list of non-empty, stripped chunks in their original order.  The
        only chunks that may exceed *max_len* are single words that are
        themselves longer than the limit; words are never cut in half.
    """
    refined = []
    if not text:
        return refined

    for sentence in re.split(r'(?<=[.!?\u0964\u0965])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            # Nothing speakable here (e.g. whitespace-only input).
            continue
        if len(sentence) <= max_len:
            refined.append(sentence)
            continue

        # Over-long sentence: pack whole words, flushing *before* the limit
        # would be exceeded rather than after.
        current = []
        current_len = 0
        for word in sentence.split():
            # +1 accounts for the joining space when the buffer is non-empty.
            added = len(word) + (1 if current else 0)
            if current and current_len + added > max_len:
                refined.append(' '.join(current))
                current = [word]
                current_len = len(word)
            else:
                current.append(word)
                current_len += added
        if current:
            refined.append(' '.join(current))

    return refined
# Core TTS function
def synthesize(language, text, gender="Female", emotion=None, speed=None):
    """Synthesize *text* to speech and return the path of the written WAV file.

    The text is split with ``split_text`` and each chunk is generated
    separately with the quantized model; the resulting waveforms are
    concatenated and written to a uniquely named ``.wav`` file in the current
    working directory.

    Args:
        language: Language name used in the voice description (lower-cased).
        text: The text to speak.
        gender: Speaker gender inserted into the voice description.  Defaults
            to ``"Female"``, which reproduces the previously hard-coded
            description.
        emotion: Accepted for interface compatibility; currently unused.
        speed: Accepted for interface compatibility; currently unused.

    Returns:
        The filename of the generated WAV file.

    Raises:
        gr.Error: If no language is selected or *text* contains nothing to
            synthesize.
    """
    # The last three parameters carry defaults so the function is callable
    # with just (language, text), which is all the UI currently supplies.
    if not language:
        raise gr.Error("Please select a language.")

    # Filter blank chunks here as well so an empty request never reaches the
    # model or np.concatenate (which rejects an empty list).
    text_chunks = [chunk for chunk in split_text(text or "") if chunk.strip()]
    if not text_chunks:
        raise gr.Error("Please enter some text to synthesize.")

    description = (
        f"A native {language.lower()} {(gender or 'Female').lower()} "
        "speaker with an expressive tone."
    )
    # The description is identical for every chunk, so tokenize it once.
    desc_input = desc_tokenizer(description, return_tensors="pt").to(device)

    audio_chunks = []
    for chunk in text_chunks:
        prompt_input = tokenizer(chunk, return_tensors="pt").to(device)
        with torch.no_grad():
            output = quantized_model.generate(
                input_ids=desc_input.input_ids,
                attention_mask=desc_input.attention_mask,
                prompt_input_ids=prompt_input.input_ids,
                # Single un-padded prompt: every position is attended to.
                prompt_attention_mask=torch.ones_like(prompt_input.input_ids),
            )
        # Flatten to 1-D; unlike squeeze() this never yields a 0-d array,
        # which np.concatenate would refuse.
        audio_chunks.append(output.cpu().numpy().reshape(-1))

    full_audio = np.concatenate(audio_chunks)
    filename = f"{uuid.uuid4().hex}.wav"
    sf.write(filename, full_audio, quantized_model.config.sampling_rate)
    return filename
# Gradio UI
# Only the language selector and the text box are wired up as inputs; the
# remaining voice controls below are disabled.
iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Dropdown(["Malayalam", "Hindi", "Tamil", "English", "Kannada"], label="Language"),
        gr.Textbox(label="Text to Synthesize", lines=6, placeholder="Enter your sentence here..."),
        # Disabled controls:
        # gr.Radio(["Male", "Female"], label="Speaker Gender"),
        # gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Emotion"),
        # gr.Dropdown(["Slow", "Moderate", "Fast"], label="Speaking Rate"),
        # gr.Dropdown(["Low", "Normal", "High"], label="Pitch"),
        # gr.Dropdown(["Basic", "Refined"], label="Voice Quality"),
    ],
    outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
    title="Multilingual Indic TTS (Quantized + Chunked)",
    # The description lists every language offered in the dropdown.
    description="CPU-based TTS with quantized Parler-TTS and chunked input for Malayalam, Hindi, Tamil, English, and Kannada.",
)
iface.launch()