Spaces:

leenag
/

Multilingual_TTS

Running

App Files Files Community

leenag committed on Apr 16

Commit

18e4b07

verified ·

1 Parent(s): 16df6a6

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -37

app.py CHANGED Viewed

@@ -23,64 +23,60 @@ quantized_model = torch.quantization.quantize_dynamic(
     dtype=torch.qint8
 )
-# Sentence splitter (splits by full stop, exclamation, or question mark)
 def split_text(text, max_len=150):
-    # First, try to split by sentence punctuation
     chunks = re.split(r'(?<=[.!?]) +', text)
-    # If any chunk is still too long, split further
-    refined_chunks = []
     for chunk in chunks:
         if len(chunk) <= max_len:
-            refined_chunks.append(chunk)
         else:
-            # Break on space while respecting max_len
             words = chunk.split()
-            buffer = []
-            length = 0
             for word in words:
-                buffer.append(word)
-                length += len(word) + 1
-                if length > max_len:
-                    refined_chunks.append(' '.join(buffer))
-                    buffer = []
-                    length = 0
-            if buffer:
-                refined_chunks.append(' '.join(buffer))
-    return refined_chunks
-# Main synthesis function
-def synthesize(language, text, gender, emotion, speed, pitch, quality):
     description = (
         f"A native {language.lower()} {gender.lower()} speaker with a {emotion.lower()} and expressive tone, "
         f"speaking at a {speed.lower()} rate."
     )
-    description_input = desc_tokenizer(description, return_tensors="pt").to(device)
-    chunks = split_text(text)
-    audio_pieces = []
-    for chunk in chunks:
         prompt_input = tokenizer(chunk, return_tensors="pt").to(device)
         with torch.no_grad():
-            generation = quantized_model.generate(
-                input_ids=description_input.input_ids,
-                attention_mask=description_input.attention_mask,
                 prompt_input_ids=prompt_input.input_ids,
                 prompt_attention_mask=torch.ones_like(prompt_input.input_ids).to(device)
             )
-        audio_chunk = generation.cpu().numpy().squeeze()
-        audio_pieces.append(audio_chunk)
-    # Concatenate all audio chunks
-    final_audio = np.concatenate(audio_pieces)
     filename = f"{uuid.uuid4().hex}.wav"
-    sf.write(filename, final_audio, quantized_model.config.sampling_rate)
     return filename
-# Gradio Interface
 iface = gr.Interface(
     fn=synthesize,
     inputs=[
@@ -89,12 +85,12 @@ iface = gr.Interface(
         gr.Radio(["Male", "Female"], label="Speaker Gender"),
         gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Emotion"),
         gr.Dropdown(["Slow", "Moderate", "Fast"], label="Speaking Rate"),
-        gr.Dropdown(["Low", "Normal", "High"], label="Pitch"),
-        gr.Dropdown(["Basic", "Refined"], label="Voice Quality"),
     ],
     outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
     title="Multilingual Indic TTS (Quantized + Chunked)",
-    description="Fast CPU-based TTS with quantized Parler-TTS and text chunking for Malayalam, Hindi, Tamil, and English.",
 )
 iface.launch()

     dtype=torch.qint8
 )
# Sentence splitter
def split_text(text, max_len=150):
    """Split *text* into chunks for piecewise synthesis.

    The text is first split after sentence-ending punctuation (``.``, ``!``
    or ``?``) that is followed by one or more spaces. Any sentence longer
    than ``max_len`` characters is then broken on whitespace into pieces
    that each fit within ``max_len``.

    Args:
        text: The input text to split.
        max_len: Maximum number of characters per chunk.

    Returns:
        A list of non-empty, whitespace-stripped chunks in their original
        order. Blank input yields an empty list, so callers should handle
        that case. A single word longer than ``max_len`` is kept intact as
        its own chunk rather than being cut mid-word.
    """
    refined = []
    for sentence in re.split(r'(?<=[.!?]) +', text):
        sentence = sentence.strip()
        if not sentence:
            # Input such as "Hi. " splits into ["Hi.", ""]; drop the blank
            # piece so no empty prompt is produced.
            continue
        if len(sentence) <= max_len:
            refined.append(sentence)
            continue

        words = []
        length = 0  # length of ' '.join(words)
        for word in sentence.split():
            # Length the buffer would have after adding this word,
            # including the joining space when the buffer is non-empty.
            projected = length + len(word) + (1 if words else 0)
            if words and projected > max_len:
                # Flush *before* the limit is exceeded so every emitted
                # chunk respects max_len.
                refined.append(' '.join(words))
                words = [word]
                length = len(word)
            else:
                words.append(word)
                length = projected
        if words:
            refined.append(' '.join(words))
    return refined
# Core TTS function
def synthesize(language, text, gender, emotion, speed):
    """Synthesize speech for *text* and return the path of the WAV file.

    A voice description is built from the selected language, gender,
    emotion and speaking rate. The text is split into chunks with
    ``split_text``, audio is generated for each chunk with the quantized
    model, and the pieces are concatenated and written to a uniquely named
    ``.wav`` file in the current working directory.

    Args:
        language: Language name selected in the UI.
        text: The text to speak.
        gender: Speaker gender selected in the UI.
        emotion: Emotion selected in the UI.
        speed: Speaking rate selected in the UI.

    Returns:
        The filename of the generated WAV file.

    Raises:
        gr.Error: If the text is empty/blank or a voice option is missing.
    """
    # Unselected Gradio components can arrive as None; fail with a clear
    # message instead of an AttributeError on `.lower()`.
    selections = {
        "language": language,
        "speaker gender": gender,
        "emotion": emotion,
        "speaking rate": speed,
    }
    missing = [label for label, value in selections.items() if not value]
    if missing:
        raise gr.Error(f"Please select: {', '.join(missing)}.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    description = (
        f"A native {language.lower()} {gender.lower()} speaker with a {emotion.lower()} and expressive tone, "
        f"speaking at a {speed.lower()} rate."
    )

    audio_chunks = []
    for chunk in split_text(text):
        if not chunk.strip():
            # The splitter may yield blank pieces (e.g. after trailing
            # punctuation); there is nothing to speak for those.
            continue
        # The description is re-tokenized for every chunk, as in the
        # original code. NOTE(review): presumably this avoids reusing
        # tensors across generate() calls; confirm before hoisting.
        desc_input = desc_tokenizer(description, return_tensors="pt").to(device)
        prompt_input = tokenizer(chunk, return_tensors="pt").to(device)
        with torch.no_grad():
            output = quantized_model.generate(
                input_ids=desc_input.input_ids,
                attention_mask=desc_input.attention_mask,
                prompt_input_ids=prompt_input.input_ids,
                prompt_attention_mask=torch.ones_like(prompt_input.input_ids).to(device)
            )
        # squeeze() can collapse a one-sample result to a 0-d array, which
        # np.concatenate rejects; atleast_1d keeps it concatenable.
        audio = np.atleast_1d(output.cpu().numpy().squeeze())
        audio_chunks.append(audio)

    full_audio = np.concatenate(audio_chunks)
    filename = f"{uuid.uuid4().hex}.wav"
    sf.write(filename, full_audio, quantized_model.config.sampling_rate)
    return filename
+# Gradio UI
 iface = gr.Interface(
     fn=synthesize,
     inputs=[
         gr.Radio(["Male", "Female"], label="Speaker Gender"),
         gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Emotion"),
         gr.Dropdown(["Slow", "Moderate", "Fast"], label="Speaking Rate"),
+        #gr.Dropdown(["Low", "Normal", "High"], label="Pitch"),
+        #gr.Dropdown(["Basic", "Refined"], label="Voice Quality"),
     ],
     outputs=gr.Audio(type="filepath", label="Synthesized Speech"),
     title="Multilingual Indic TTS (Quantized + Chunked)",
+    description="CPU-based TTS with quantized Parler-TTS and chunked input for Malayalam, Hindi, Tamil, and English.",
 )
 iface.launch()