# Spaces listing metadata captured together with this file:
#   status: "Runtime error"; file size: 1,728 bytes;
#   revisions: 78dc3af, 9aa9353, 78dc3af.
import gradio as gr
from transformers import AutoModel
import numpy as np
import soundfile as sf
import tempfile
import whisper
# Both checkpoints are loaded once, at import time, and are shared by every
# call made through the UI.
_TTS_CHECKPOINT = "ai4bharat/IndicF5"
_ASR_CHECKPOINT = "medium"

# Text-to-speech model (IndicF5). trust_remote_code=True allows transformers
# to execute the model code that ships with the checkpoint repository.
tts_model = AutoModel.from_pretrained(_TTS_CHECKPOINT, trust_remote_code=True)

# Speech-recognition model (Whisper).
asr_model = whisper.load_model(_ASR_CHECKPOINT)
# Sample rate (Hz) used when writing the synthesized waveform to disk.
_TTS_SAMPLE_RATE = 24000


def generate_tts_and_transcribe(text, ref_audio, ref_text):
    """Synthesize speech with the TTS model, then transcribe it with Whisper.

    Args:
        text: Text to synthesize.
        ref_audio: Reference voice prompt. Either a filesystem path (what
            ``gr.Audio(type="filepath")`` passes in) or a binary file-like
            object exposing ``read()``.
        ref_text: Transcript of the reference audio.

    Returns:
        A ``(tts_path, transcript)`` tuple: the path of the generated WAV
        file and the text Whisper recognized in that file.

    Raises:
        gr.Error: If ``text`` is empty or no reference audio was supplied.
    """
    import os  # local import: only needed for path handling and cleanup

    # Fail early with a message the UI can display instead of an opaque
    # error raised from inside the model.
    if not text or not text.strip():
        raise gr.Error("Please enter the text to synthesize.")
    if ref_audio is None:
        raise gr.Error("Please provide a reference audio clip.")

    # The TTS model is called with a path. A path is used as-is; a file-like
    # object is copied into a temporary WAV file that is removed afterwards.
    ref_copy_path = None
    if hasattr(ref_audio, "read"):
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp.write(ref_audio.read())
            ref_copy_path = tmp.name
        ref_audio_path = ref_copy_path
    else:
        ref_audio_path = os.fspath(ref_audio)

    try:
        # Generate speech using IndicF5.
        audio = tts_model(text, ref_audio_path=ref_audio_path, ref_text=ref_text)
    finally:
        if ref_copy_path is not None:
            try:
                os.remove(ref_copy_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file is not fatal.
                pass

    # Coerce to an array first so the dtype check works for any array-like
    # result, then scale 16-bit PCM into the [-1.0, 1.0) float range.
    audio = np.asarray(audio)
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0
    else:
        audio = audio.astype(np.float32)

    # Reserve an output path and close the handle before writing, so no open
    # file descriptor is leaked. The file is kept because its path is returned.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        tts_path = out_file.name
    sf.write(tts_path, audio, samplerate=_TTS_SAMPLE_RATE)

    # Transcribe the synthesized audio using Whisper ("ta" = Tamil).
    asr_result = asr_model.transcribe(tts_path, language="ta")
    transcript = asr_result["text"]
    return tts_path, transcript


# Gradio Interface
demo = gr.Interface(
    fn=generate_tts_and_transcribe,
    inputs=[
        gr.Textbox(label="Text to Synthesize (Tamil)"),
        # "filepath" hands the callback a path to the uploaded/recorded clip.
        # The previous value, "file", is not a documented type for gr.Audio.
        gr.Audio(label="Reference Audio (.wav)", type="filepath"),
        gr.Textbox(label="Reference Text (Tamil)"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Textbox(label="ASR Transcription (Whisper)"),
    ],
    title="IndicF5 Tamil TTS + Whisper ASR",
    description="Give a reference audio and text, synthesize Tamil speech using IndicF5, and transcribe it with Whisper.",
)
# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()
|