Spaces:
Running
Running
Nimesh Naik
committed on
Commit
·
03a1488
1
Parent(s):
6a42d0a
New code added
Browse files- app.py +73 -41
- requirement.txt +6 -4
app.py
CHANGED
@@ -1,46 +1,78 @@
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
-
from
|
|
|
4 |
import soundfile as sf
|
5 |
-
import
|
6 |
import os
|
7 |
|
8 |
-
#
|
9 |
-
|
10 |
-
|
11 |
-
model
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
+
from parler_tts import ParlerTTSForConditionalGeneration
|
4 |
+
from transformers import AutoTokenizer
|
5 |
import soundfile as sf
|
6 |
+
import numpy as np
|
7 |
import os
|
8 |
|
9 |
+
# Run on the first CUDA device when one is visible; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# The model weights and the tokenizer are loaded from the same checkpoint id.
_CHECKPOINT = "ai4bharat/indic-parler-tts-mini"
model = ParlerTTSForConditionalGeneration.from_pretrained(_CHECKPOINT)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

# Language names offered in the UI dropdown and accepted by generate_speech().
languages = [
    "Assamese",
    "Bengali",
    "Bodo",
    "Dogri",
    "English",
    "Gujarati",
    "Hindi",
    "Kannada",
    "Konkani",
    "Maithili",
    "Malayalam",
    "Manipuri",
    "Marathi",
    "Nepali",
    "Odia",
    "Sanskrit",
    "Santali",
    "Sindhi",
    "Tamil",
    "Telugu",
    "Urdu",
]
|
22 |
+
|
23 |
+
# Lazily-created tokenizer for the description (text-encoder) input.
_description_tokenizer = None


def _get_description_tokenizer():
    """Return the tokenizer of the model's text encoder, loading it on first use."""
    global _description_tokenizer
    if _description_tokenizer is None:
        # The Indic Parler-TTS usage example tokenizes the voice description
        # with the text encoder's own tokenizer, not the prompt tokenizer.
        _description_tokenizer = AutoTokenizer.from_pretrained(
            model.config.text_encoder._name_or_path
        )
    return _description_tokenizer


def generate_speech(text, language, voice_description):
    """
    Generate speech audio for ``text`` and save it as a WAV file.

    Args:
        text: The sentence(s) to speak. ``None``, empty or whitespace-only
            input is rejected.
        language: Must be one of the names in the module-level ``languages`` list.
        voice_description: Free-form description of the desired voice; may be
            empty or ``None``.

    Returns:
        A ``(audio_path, error_message)`` tuple. On success ``audio_path`` is
        the path of a newly created WAV file and ``error_message`` is ``None``;
        on failure ``audio_path`` is ``None`` and ``error_message`` explains why.
    """
    import tempfile

    if not text or not text.strip():
        return None, "Error: Text input cannot be empty."
    if language not in languages:
        return None, f"Error: Language '{language}' is not supported. Choose from: {', '.join(languages)}"

    # A cleared textbox may deliver None/""; avoid leaking "None" into the prompt.
    voice_description = (voice_description or "").strip()
    description = f"A speaker delivering speech in {language}. {voice_description}".strip()

    try:
        # Tokenization happens inside the try so tokenizer failures are
        # reported through the same error channel as generation failures.
        description_inputs = _get_description_tokenizer()(description, return_tensors="pt").to(device)
        prompt_inputs = tokenizer(text, return_tensors="pt").to(device)

        generation = model.generate(
            input_ids=description_inputs.input_ids,
            attention_mask=description_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
        )
        audio_arr = generation.cpu().numpy().squeeze()

        # Use a unique file per request: a fixed "output.wav" would be
        # overwritten when two requests are served at the same time.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_file = tmp.name
        sf.write(output_file, audio_arr, model.config.sampling_rate)
        return output_file, None
    except Exception as e:
        return None, f"Error generating audio: {str(e)}"
|
51 |
+
|
52 |
+
# Gradio interface: two Markdown headers, a row of three inputs, a trigger
# button, an audio player and a read-only status box.
# NOTE(review): nesting below is reconstructed from a diff view that lost
# indentation; the Row is assumed to hold only the three input widgets — confirm.
with gr.Blocks() as demo:
    gr.Markdown("# Indic Parler-TTS: Text-to-Speech")
    gr.Markdown("Enter text, select a language, and describe the voice to generate audio. Download the audio output.")

    with gr.Row():
        text_input = gr.Textbox(label="Input Text", placeholder="Enter text to convert to speech...")
        # Dropdown choices come from the module-level `languages` list.
        language_input = gr.Dropdown(label="Language", choices=languages, value="English")
        voice_description = gr.Textbox(
            label="Voice Description",
            placeholder="E.g., A female speaker with a clear, cheerful tone and moderate pace.",
            value="A neutral speaker with clear audio quality."
        )

    generate_btn = gr.Button("Generate Audio")
    # type="filepath" matches generate_speech, which returns a path to a WAV file.
    audio_output = gr.Audio(label="Generated Audio", type="filepath", interactive=False)
    # Receives the second element of generate_speech's return tuple.
    error_output = gr.Textbox(label="Status/Error", visible=True, interactive=False)

    # Connect button to function: the three inputs map positionally to
    # generate_speech(text, language, voice_description), and its two return
    # values map to the audio player and the status box.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, language_input, voice_description],
        outputs=[audio_output, error_output]
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
requirement.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
gradio
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
transformers
|
3 |
+
parler-tts
|
4 |
+
gradio
|
5 |
+
soundfile
|
6 |
+
numpy
|