Nimesh Naik committed
Commit 03a1488 · 1 Parent(s): 6a42d0a

New code added

Files changed (2)
  1. app.py +73 -41
  2. requirement.txt +6 -4
app.py CHANGED
@@ -1,46 +1,78 @@
  import gradio as gr
  import torch
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+ from parler_tts import ParlerTTSForConditionalGeneration
+ from transformers import AutoTokenizer
  import soundfile as sf
- import tempfile
+ import numpy as np
  import os

- # Load processor and model
- processor = AutoProcessor.from_pretrained("ai4bharat/indic-parler-tts")
- model = AutoModelForSpeechSeq2Seq.from_pretrained("ai4bharat/indic-parler-tts")
- model.eval()
-
- LANGUAGE_OPTIONS = {
-     "Hindi": "hi", "Tamil": "ta", "Telugu": "te", "Malayalam": "ml", "Kannada": "kn",
-     "Bengali": "bn", "Marathi": "mr", "Gujarati": "gu", "Punjabi": "pa",
-     "Odia": "or", "Assamese": "as", "Urdu": "ur", "English (Indian)": "en"
- }
-
- def tts_generate(text, language_name):
-     lang = LANGUAGE_OPTIONS[language_name]
-
-     inputs = processor(text=[text], return_tensors="pt", lang=lang)
-     with torch.no_grad():
-         output = model.generate(**inputs)
-
-     audio_arr = processor.decode(output[0], skip_special_tokens=True)
-
-     # Save audio as temporary .wav file
-     temp_path = tempfile.mktemp(suffix=".wav")
-     sf.write(temp_path, audio_arr, 16000)
-
-     return temp_path
-
- # Gradio Interface
- interface = gr.Interface(
-     fn=tts_generate,
-     inputs=[
-         gr.Textbox(label="Enter Text"),
-         gr.Dropdown(choices=list(LANGUAGE_OPTIONS.keys()), label="Select Language")
-     ],
-     outputs=gr.Audio(label="Generated Audio", type="filepath"),
-     title="Indic Parler TTS - AI4Bharat",
-     description="Enter text and choose a language to generate and download speech audio."
- )
-
- interface.launch()
+ # Set device (GPU if available, else CPU)
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ # Load Indic Parler-TTS model and tokenizer
+ model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts-mini").to(device)
+ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts-mini")
+
+ # Supported languages (Indic Parler-TTS officially supports these)
+ languages = [
+     "Assamese", "Bengali", "Bodo", "Dogri", "English", "Gujarati", "Hindi",
+     "Kannada", "Konkani", "Maithili", "Malayalam", "Manipuri", "Marathi",
+     "Nepali", "Odia", "Sanskrit", "Santali", "Sindhi", "Tamil", "Telugu", "Urdu"
+ ]
+
+ def generate_speech(text, language, voice_description):
+     """
+     Generate speech from text, language, and voice description.
+     Returns the path to the generated audio file.
+     """
+     if not text.strip():
+         return None, "Error: Text input cannot be empty."
+     if language not in languages:
+         return None, f"Error: Language '{language}' is not supported. Choose from: {', '.join(languages)}"
+
+     # Combine voice description with language context (optional, for better control)
+     description = f"A speaker delivering speech in {language}. {voice_description}"
+
+     # Tokenize inputs
+     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+     prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+     # Generate audio
+     try:
+         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+         audio_arr = generation.cpu().numpy().squeeze()
+
+         # Save audio to a temporary file
+         output_file = "output.wav"
+         sf.write(output_file, audio_arr, model.config.sampling_rate)
+         return output_file, None
+     except Exception as e:
+         return None, f"Error generating audio: {str(e)}"
+
+ # Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Indic Parler-TTS: Text-to-Speech")
+     gr.Markdown("Enter text, select a language, and describe the voice to generate audio. Download the audio output.")
+
+     with gr.Row():
+         text_input = gr.Textbox(label="Input Text", placeholder="Enter text to convert to speech...")
+         language_input = gr.Dropdown(label="Language", choices=languages, value="English")
+         voice_description = gr.Textbox(
+             label="Voice Description",
+             placeholder="E.g., A female speaker with a clear, cheerful tone and moderate pace.",
+             value="A neutral speaker with clear audio quality."
+         )
+
+     generate_btn = gr.Button("Generate Audio")
+     audio_output = gr.Audio(label="Generated Audio", type="filepath", interactive=False)
+     error_output = gr.Textbox(label="Status/Error", visible=True, interactive=False)
+
+     # Connect button to function
+     generate_btn.click(
+         fn=generate_speech,
+         inputs=[text_input, language_input, voice_description],
+         outputs=[audio_output, error_output]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
requirement.txt CHANGED
@@ -1,4 +1,6 @@
- transformers
- torch
- soundfile
- gradio
+ torch
+ transformers
+ parler-tts
+ gradio
+ soundfile
+ numpy
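
For quick verification outside Gradio, below is a minimal sketch (not part of the commit) that exercises the same model and tokenizer calls the new generate_speech() makes, assuming the ai4bharat/indic-parler-tts-mini checkpoint and the packages listed in requirement.txt are installed. The example prompt, description text, and the sample.wav filename are illustrative only.

# Standalone sketch mirroring the pipeline added in this commit (assumptions noted above).
import torch
import soundfile as sf
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts-mini").to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts-mini")

# Description and text prompt, formatted the same way generate_speech() builds them.
description = "A speaker delivering speech in English. A neutral speaker with clear audio quality."
text = "Hello, welcome to the Indic Parler-TTS demo."

input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

# Generate the waveform and write it to disk at the model's sampling rate.
with torch.no_grad():
    audio = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
sf.write("sample.wav", audio.cpu().numpy().squeeze(), model.config.sampling_rate)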