import os
import tempfile

import gradio as gr
import numpy as np
import edge_tts
from transformers import pipeline

# Load the STT and chatbot pipelines once at startup
stt = pipeline("automatic-speech-recognition", model="openai/whisper-small")
chatbot = pipeline("text-generation", model="HooshvareLab/gpt2-fa")


async def tts(text: str, voice: str = "fa-IR-FaridNeural") -> str:
    """Synthesize Farsi speech with edge-tts and return the path to an MP3 file."""
    communicate = edge_tts.Communicate(text, voice)
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]
    # edge-tts streams MP3-encoded bytes, so write them to a file instead of
    # reinterpreting them as raw int16 PCM samples.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(audio_data)
    return f.name


async def audio_to_audio(audio_input):
    if audio_input is None:
        raise gr.Error("No audio input received.")
    sample_rate_in, data_in = audio_input

    # Gradio delivers int16 samples (possibly stereo); Whisper expects mono float32.
    if data_in.ndim > 1:
        data_in = data_in.mean(axis=1)
    data_in = data_in.astype(np.float32) / 32768.0
    audio = {"array": data_in, "sampling_rate": sample_rate_in}

    # 1. ASR → text
    text = stt(audio)["text"]

    # 2. Generate a response
    response = chatbot(text, max_length=50, num_return_sequences=1)[0]["generated_text"]

    # 3. TTS → path to an MP3 file
    return await tts(response)


# Gradio interface
demo = gr.Interface(
    fn=audio_to_audio,
    inputs=gr.Audio(
        sources=["microphone"],  # 'sources' replaces the deprecated 'source' argument
        type="numpy",
        label="Speak in Farsi",
    ),
    outputs=gr.Audio(type="filepath", label="Response in Farsi"),
    title="Farsi Audio Chatbot",
    description="Speak in Farsi, and the app will respond in Farsi audio.",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
    )
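
# Optional: exercise the running app from another process with gradio_client
# (a minimal sketch, not part of the app above; it assumes the server is already
# listening on http://127.0.0.1:7860, that a recent gradio_client is installed,
# and that a local recording named "question.wav" exists -- all illustrative
# assumptions). The result is a path to the synthesized Farsi reply.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   reply_path = client.predict(handle_file("question.wav"), api_name="/predict")
#   print("Synthesized reply saved at:", reply_path)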