# Hugging Face Space demo: Indic Parler-TTS text-to-speech (Gradio app).
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
# --- Runtime setup ---------------------------------------------------------
# Prefer the first CUDA device when available; otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the Indic Parler-TTS checkpoint and the tokenizer used for both the
# voice description and the prompt text.
model = ParlerTTSForConditionalGeneration.from_pretrained(
    "ai4bharat/indic-parler-tts-mini"
).to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts-mini")

# Languages officially supported by Indic Parler-TTS.
languages = [
    "Assamese", "Bengali", "Bodo", "Dogri", "English", "Gujarati", "Hindi",
    "Kannada", "Konkani", "Maithili", "Malayalam", "Manipuri", "Marathi",
    "Nepali", "Odia", "Sanskrit", "Santali", "Sindhi", "Tamil", "Telugu", "Urdu",
]
def generate_speech(text, language, voice_description):
    """Generate speech audio from text using Indic Parler-TTS.

    Parameters
    ----------
    text : str
        The text to synthesize.
    language : str
        Target language; must be one of the entries in ``languages``.
    voice_description : str
        Free-form description of the desired speaker/voice.

    Returns
    -------
    tuple[str | None, str | None]
        ``(audio_path, None)`` on success, ``(None, error_message)`` on failure.
    """
    if not text.strip():
        return None, "Error: Text input cannot be empty."
    if language not in languages:
        return None, f"Error: Language '{language}' is not supported. Choose from: {', '.join(languages)}"
    # Prepend the language so the description steers the model toward it.
    description = f"A speaker delivering speech in {language}. {voice_description}"
    # Tokenize both inputs and keep the attention masks: the original code
    # dropped them, passing only input_ids to generate().
    desc_inputs = tokenizer(description, return_tensors="pt").to(device)
    prompt_inputs = tokenizer(text, return_tensors="pt").to(device)
    try:
        generation = model.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
        )
        audio_arr = generation.cpu().numpy().squeeze()
        # Write to a unique temp file: the original fixed "output.wav" path
        # raced/clobbered when Gradio served concurrent requests.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_file = tmp.name
        sf.write(output_file, audio_arr, model.config.sampling_rate)
        return output_file, None
    except Exception as e:
        return None, f"Error generating audio: {str(e)}"
# ---------------------------------------------------------------------------
# Gradio UI: text + language + voice description in, downloadable audio out.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    # Header and usage instructions.
    gr.Markdown("# Indic Parler-TTS: Text-to-Speech")
    gr.Markdown("Enter text, select a language, and describe the voice to generate audio. Download the audio output.")

    # Prompt text and language picker side by side.
    with gr.Row():
        prompt_box = gr.Textbox(
            label="Input Text",
            placeholder="Enter text to convert to speech...",
        )
        language_picker = gr.Dropdown(
            label="Language",
            choices=languages,
            value="English",
        )

    # Free-form description of the desired voice/speaker.
    # NOTE(review): source indentation was lost; assuming this box sits
    # below the Row rather than inside it — confirm against the original.
    voice_box = gr.Textbox(
        label="Voice Description",
        placeholder="E.g., A female speaker with a clear, cheerful tone and moderate pace.",
        value="A neutral speaker with clear audio quality.",
    )

    synth_button = gr.Button("Generate Audio")
    audio_player = gr.Audio(label="Generated Audio", type="filepath", interactive=False)
    status_box = gr.Textbox(label="Status/Error", visible=True, interactive=False)

    # Wire the button: (text, language, description) -> (audio path, status).
    synth_button.click(
        fn=generate_speech,
        inputs=[prompt_box, language_picker, voice_box],
        outputs=[audio_player, status_box],
    )

if __name__ == "__main__":
    demo.launch()