Nimesh Naik committed
Commit 03a1488 · 1 Parent(s): 6a42d0a

New code added

Files changed (2)
  1. app.py +73 -41
  2. requirement.txt +6 -4
app.py CHANGED
@@ -1,46 +1,78 @@
  import gradio as gr
  import torch
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+ from parler_tts import ParlerTTSForConditionalGeneration
+ from transformers import AutoTokenizer
  import soundfile as sf
- import tempfile
+ import numpy as np
  import os

- # Load processor and model
- processor = AutoProcessor.from_pretrained("ai4bharat/indic-parler-tts")
- model = AutoModelForSpeechSeq2Seq.from_pretrained("ai4bharat/indic-parler-tts")
- model.eval()
-
- LANGUAGE_OPTIONS = {
-     "Hindi": "hi", "Tamil": "ta", "Telugu": "te", "Malayalam": "ml", "Kannada": "kn",
-     "Bengali": "bn", "Marathi": "mr", "Gujarati": "gu", "Punjabi": "pa",
-     "Odia": "or", "Assamese": "as", "Urdu": "ur", "English (Indian)": "en"
- }
-
- def tts_generate(text, language_name):
-     lang = LANGUAGE_OPTIONS[language_name]
-
-     inputs = processor(text=[text], return_tensors="pt", lang=lang)
-     with torch.no_grad():
-         output = model.generate(**inputs)
-
-     audio_arr = processor.decode(output[0], skip_special_tokens=True)
-
-     # Save audio as temporary .wav file
-     temp_path = tempfile.mktemp(suffix=".wav")
-     sf.write(temp_path, audio_arr, 16000)
-
-     return temp_path
-
- # Gradio Interface
- interface = gr.Interface(
-     fn=tts_generate,
-     inputs=[
-         gr.Textbox(label="Enter Text"),
-         gr.Dropdown(choices=list(LANGUAGE_OPTIONS.keys()), label="Select Language")
-     ],
-     outputs=gr.Audio(label="Generated Audio", type="filepath"),
-     title="Indic Parler TTS - AI4Bharat",
-     description="Enter text and choose a language to generate and download speech audio."
- )
-
- interface.launch()
+ # Set device (GPU if available, else CPU)
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ # Load Indic Parler-TTS model and tokenizer
+ model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts-mini").to(device)
+ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts-mini")
+
+ # Supported languages (Indic Parler-TTS officially supports these)
+ languages = [
+     "Assamese", "Bengali", "Bodo", "Dogri", "English", "Gujarati", "Hindi",
+     "Kannada", "Konkani", "Maithili", "Malayalam", "Manipuri", "Marathi",
+     "Nepali", "Odia", "Sanskrit", "Santali", "Sindhi", "Tamil", "Telugu", "Urdu"
+ ]
+
+ def generate_speech(text, language, voice_description):
+     """
+     Generate speech from text, language, and voice description.
+     Returns the path to the generated audio file.
+     """
+     if not text.strip():
+         return None, "Error: Text input cannot be empty."
+     if language not in languages:
+         return None, f"Error: Language '{language}' is not supported. Choose from: {', '.join(languages)}"
+
+     # Combine voice description with language context (optional, for better control)
+     description = f"A speaker delivering speech in {language}. {voice_description}"
+
+     # Tokenize inputs
+     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+     prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+     # Generate audio
+     try:
+         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+         audio_arr = generation.cpu().numpy().squeeze()
+
+         # Save audio to a temporary file
+         output_file = "output.wav"
+         sf.write(output_file, audio_arr, model.config.sampling_rate)
+         return output_file, None
+     except Exception as e:
+         return None, f"Error generating audio: {str(e)}"
+
+ # Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Indic Parler-TTS: Text-to-Speech")
+     gr.Markdown("Enter text, select a language, and describe the voice to generate audio. Download the audio output.")
+
+     with gr.Row():
+         text_input = gr.Textbox(label="Input Text", placeholder="Enter text to convert to speech...")
+         language_input = gr.Dropdown(label="Language", choices=languages, value="English")
+         voice_description = gr.Textbox(
+             label="Voice Description",
+             placeholder="E.g., A female speaker with a clear, cheerful tone and moderate pace.",
+             value="A neutral speaker with clear audio quality."
+         )
+
+     generate_btn = gr.Button("Generate Audio")
+     audio_output = gr.Audio(label="Generated Audio", type="filepath", interactive=False)
+     error_output = gr.Textbox(label="Status/Error", visible=True, interactive=False)
+
+     # Connect button to function
+     generate_btn.click(
+         fn=generate_speech,
+         inputs=[text_input, language_input, voice_description],
+         outputs=[audio_output, error_output]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
requirement.txt CHANGED
@@ -1,4 +1,6 @@
- transformers
- torch
- soundfile
- gradio
+ torch
+ transformers
+ parler-tts
+ gradio
+ soundfile
+ numpy
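
For quick verification outside Gradio, below is a minimal sketch (not part of the commit) that exercises the same model and tokenizer calls the new generate_speech() makes, assuming the ai4bharat/indic-parler-tts-mini checkpoint and the packages listed in requirement.txt are installed. The example prompt, description text, and the sample.wav filename are illustrative only.

# Standalone sketch mirroring the pipeline added in this commit (assumptions noted above).
import torch
import soundfile as sf
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts-mini").to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts-mini")

# Description and text prompt, formatted the same way generate_speech() builds them.
description = "A speaker delivering speech in English. A neutral speaker with clear audio quality."
text = "Hello, welcome to the Indic Parler-TTS demo."

input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

# Generate the waveform and write it to disk at the model's sampling rate.
with torch.no_grad():
    audio = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
sf.write("sample.wav", audio.cpu().numpy().squeeze(), model.config.sampling_rate)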