File size: 3,293 Bytes
66d0bf1
 
 
5e6c5bb
 
 
 
 
66d0bf1
5e6c5bb
66d0bf1
5e6c5bb
 
 
 
66d0bf1
 
 
5e6c5bb
 
 
 
 
 
 
18e4b07
5e6c5bb
 
18e4b07
5e6c5bb
 
18e4b07
5e6c5bb
 
18e4b07
 
5e6c5bb
18e4b07
 
 
 
 
 
 
 
 
5e6c5bb
18e4b07
 
5e6c5bb
ef866b7
66d0bf1
 
18e4b07
 
5e6c5bb
18e4b07
 
 
5e6c5bb
18e4b07
5e6c5bb
18e4b07
 
 
5e6c5bb
 
 
 
18e4b07
 
5e6c5bb
18e4b07
5e6c5bb
18e4b07
66d0bf1
 
18e4b07
66d0bf1
 
 
ef866b7
5e6c5bb
ef866b7
 
 
18e4b07
 
66d0bf1
5e6c5bb
 
18e4b07
66d0bf1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import torch
import soundfile as sf
import uuid
import gradio as gr
import numpy as np
import re
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Load model and tokenizers
# NOTE: everything below runs at import time (this is a flat script);
# from_pretrained will download weights on first run.
model_name = "ai4bharat/indic-parler-tts"
device = "cpu"  # CPU-only deployment; quantization below targets CPU inference

print("Loading model...")
# eval() disables dropout etc.; inference-only usage throughout this file.
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device).eval()
# Prompt tokenizer (for the text to speak) comes from the TTS checkpoint itself.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Description tokenizer must match the *text encoder* inside the model,
# which may be a different checkpoint than the TTS model's own tokenizer.
desc_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)

print("Applying dynamic quantization...")
# Dynamic int8 quantization of all Linear layers to speed up CPU inference;
# weights are stored as int8 and dequantized on the fly at matmul time.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8
)

# Sentence splitter
def split_text(text, max_len=150):
    """Split *text* into chunks of at most ``max_len`` characters.

    The text is first split on sentence boundaries (``.``, ``!`` or ``?``
    followed by spaces); any sentence longer than ``max_len`` is further
    broken on word boundaries. Whitespace-only fragments are dropped, so
    empty input yields an empty list (the previous version returned
    ``['']`` and could overshoot ``max_len`` by one word because it
    flushed the buffer only *after* exceeding the limit).

    A single word longer than ``max_len`` is kept intact rather than
    split mid-word.
    """
    refined = []
    for sentence in re.split(r'(?<=[.!?]) +', text):
        sentence = sentence.strip()
        if not sentence:
            continue  # skip empties from blank input / stray whitespace
        if len(sentence) <= max_len:
            refined.append(sentence)
            continue
        # Over-long sentence: pack words greedily, flushing *before* a
        # word would push the buffer past max_len.
        buf = []
        buf_len = 0
        for word in sentence.split():
            # +1 accounts for the joining space when buf is non-empty
            extra = len(word) + (1 if buf else 0)
            if buf and buf_len + extra > max_len:
                refined.append(' '.join(buf))
                buf = [word]
                buf_len = len(word)
            else:
                buf.append(word)
                buf_len += extra
        if buf:
            refined.append(' '.join(buf))
    return refined

# Core TTS function
def synthesize(language, text, gender="Female", emotion="Neutral", speed="Moderate"):
    """Generate speech for *text* and return the path of the written WAV file.

    Parameters
    ----------
    language : str
        Language name interpolated into the speaker-description prompt.
    text : str
        Text to synthesize; long input is chunked via ``split_text`` and
        the per-chunk audio is concatenated.
    gender, emotion, speed : str, optional
        Accepted for UI compatibility. NOTE(review): currently NOT used —
        the description always requests an expressive female voice.
        Defaults were added because the active Gradio UI supplies only
        ``language`` and ``text``; without defaults every request raised
        a TypeError.

    Raises
    ------
    ValueError
        If *text* contains nothing to synthesize.
    """
    description = (
        f"A native {language.lower()} female speaker with an expressive tone."
    )

    # The description is identical for every chunk — tokenize it once,
    # outside the loop (it was previously re-tokenized per chunk).
    desc_input = desc_tokenizer(description, return_tensors="pt").to(device)

    audio_chunks = []
    for chunk in split_text(text):
        if not chunk.strip():
            continue  # guard against empty fragments reaching the model

        prompt_input = tokenizer(chunk, return_tensors="pt").to(device)

        with torch.no_grad():
            output = quantized_model.generate(
                input_ids=desc_input.input_ids,
                attention_mask=desc_input.attention_mask,
                prompt_input_ids=prompt_input.input_ids,
                prompt_attention_mask=torch.ones_like(prompt_input.input_ids).to(device)
            )

        audio_chunks.append(output.cpu().numpy().squeeze())

    if not audio_chunks:
        # np.concatenate([]) raises a cryptic error; fail clearly instead.
        raise ValueError("No synthesizable text was provided.")

    full_audio = np.concatenate(audio_chunks)
    # Random filename in the working directory; Gradio reads it back as a filepath.
    filename = f"{uuid.uuid4().hex}.wav"
    sf.write(filename, full_audio, quantized_model.config.sampling_rate)
    return filename

# Gradio UI
#
# BUG FIX: only two inputs are active, but `synthesize` requires five
# positional arguments — wiring it directly made every request fail with
# a TypeError. The lambda adapter supplies neutral values for the
# disabled controls. If the commented-out controls are re-enabled,
# switch `fn` back to `synthesize` directly.
iface = gr.Interface(
    fn=lambda language, text: synthesize(language, text, "Female", "Neutral", "Moderate"),
    inputs=[
        gr.Dropdown(["Malayalam", "Hindi", "Tamil", "English", "Kannada"], label="Language"),
        gr.Textbox(label="Text to Synthesize", lines=6, placeholder="Enter your sentence here..."),
        # Disabled controls, kept for future re-enablement:
        # gr.Radio(["Male", "Female"], label="Speaker Gender"),
        # gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Emotion"),
        # gr.Dropdown(["Slow", "Moderate", "Fast"], label="Speaking Rate"),
        # gr.Dropdown(["Low", "Normal", "High"], label="Pitch"),
        # gr.Dropdown(["Basic", "Refined"], label="Voice Quality"),
    ],
    outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
    title="Multilingual Indic TTS (Quantized + Chunked)",
    description="CPU-based TTS with quantized Parler-TTS and chunked input for Malayalam, Hindi, Tamil, and English.",
)

iface.launch()