File size: 3,182 Bytes
e1e20b7
1061733
03a1488
 
1061733
03a1488
1061733
e1e20b7
03a1488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Set device (GPU if available, else CPU)
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load Indic Parler-TTS model and tokenizer
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts-mini").to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts-mini")

# Supported languages (Indic Parler-TTS officially supports these)
languages = [
    "Assamese", "Bengali", "Bodo", "Dogri", "English", "Gujarati", "Hindi", 
    "Kannada", "Konkani", "Maithili", "Malayalam", "Manipuri", "Marathi", 
    "Nepali", "Odia", "Sanskrit", "Santali", "Sindhi", "Tamil", "Telugu", "Urdu"
]

def generate_speech(text, language, voice_description):
    """
    Generate speech from text, language, and voice description.
    Returns the path to the generated audio file.
    """
    if not text.strip():
        return None, "Error: Text input cannot be empty."
    if language not in languages:
        return None, f"Error: Language '{language}' is not supported. Choose from: {', '.join(languages)}"

    # Combine voice description with language context (optional, for better control)
    description = f"A speaker delivering speech in {language}. {voice_description}"

    # Tokenize inputs
    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

    # Generate audio
    try:
        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
        audio_arr = generation.cpu().numpy().squeeze()

        # Save audio to a temporary file
        output_file = "output.wav"
        sf.write(output_file, audio_arr, model.config.sampling_rate)
        return output_file, None
    except Exception as e:
        return None, f"Error generating audio: {str(e)}"

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Indic Parler-TTS: Text-to-Speech")
    gr.Markdown("Enter text, select a language, and describe the voice to generate audio. Download the audio output.")

    with gr.Row():
        text_input = gr.Textbox(label="Input Text", placeholder="Enter text to convert to speech...")
        language_input = gr.Dropdown(label="Language", choices=languages, value="English")
        voice_description = gr.Textbox(
            label="Voice Description",
            placeholder="E.g., A female speaker with a clear, cheerful tone and moderate pace.",
            value="A neutral speaker with clear audio quality."
        )

    generate_btn = gr.Button("Generate Audio")
    audio_output = gr.Audio(label="Generated Audio", type="filepath", interactive=False)
    error_output = gr.Textbox(label="Status/Error", visible=True, interactive=False)

    # Connect button to function
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, language_input, voice_description],
        outputs=[audio_output, error_output]
    )

if __name__ == "__main__":
    demo.launch()