"""Gradio demo: synthesize Tamil speech with IndicF5, then transcribe it with Whisper."""

import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import whisper
from transformers import AutoModel

# Load TTS model (IndicF5). trust_remote_code=True executes custom model code
# from the hub repo — acceptable here only because the repo is trusted.
tts_model = AutoModel.from_pretrained("ai4bharat/IndicF5", trust_remote_code=True)

# Load ASR model (Whisper).
asr_model = whisper.load_model("medium")

# Output sample rate used when writing the synthesized wav.
# NOTE(review): assumes IndicF5 emits 24 kHz audio — confirm against the model card.
SAMPLE_RATE = 24000


def generate_tts_and_transcribe(text, ref_audio, ref_text):
    """Synthesize Tamil speech conditioned on a reference clip, then transcribe it.

    Args:
        text: Tamil text to synthesize.
        ref_audio: Filesystem path to the reference .wav file. The Audio input
            uses ``type="filepath"``, so Gradio passes a plain path string —
            no manual temp-file copy is needed.
        ref_text: Transcript of the reference audio.

    Returns:
        Tuple ``(tts_path, transcript)``: path to the generated wav file and
        Whisper's transcription of it.
    """
    # Generate speech using IndicF5, conditioned on the reference audio/text.
    audio = tts_model(text, ref_audio_path=ref_audio, ref_text=ref_text)

    # Normalize 16-bit PCM to float32 in [-1.0, 1.0) for soundfile.
    audio = np.asarray(audio)
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    # Reserve a temp path and close the handle before sf.write re-opens it
    # (keeping it open breaks on platforms that lock open files, e.g. Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tts_path = tmp.name
    sf.write(tts_path, audio.astype(np.float32), SAMPLE_RATE)

    # Transcribe the generated speech with Whisper, forcing Tamil decoding.
    asr_result = asr_model.transcribe(tts_path, language="ta")
    return tts_path, asr_result["text"]


# Gradio interface: reference audio + reference text + target text in,
# synthesized audio + its transcription out.
demo = gr.Interface(
    fn=generate_tts_and_transcribe,
    inputs=[
        gr.Textbox(label="Text to Synthesize (Tamil)"),
        # type="filepath" (type="file" is not a valid value in current Gradio)
        # delivers the upload as a path string, matching the function above.
        gr.Audio(label="Reference Audio (.wav)", type="filepath"),
        gr.Textbox(label="Reference Text (Tamil)"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Textbox(label="ASR Transcription (Whisper)"),
    ],
    title="IndicF5 Tamil TTS + Whisper ASR",
    description=(
        "Give a reference audio and text, synthesize Tamil speech using "
        "IndicF5, and transcribe it with Whisper."
    ),
)

if __name__ == "__main__":
    demo.launch()