"""Farsi audio chatbot: speech in -> Whisper STT -> GPT-2 reply -> edge-tts speech out."""

import asyncio

import gradio as gr
import numpy as np
import edge_tts
from transformers import pipeline

# Speech-to-text model (Whisper small; multilingual, covers Farsi).
stt = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Chatbot model (GPT-2 fine-tuned for Farsi).
chatbot = pipeline("text-generation", model="HooshvareLab/gpt2-fa")


async def tts(text, voice="fa-IR-FaridNeural"):
    """Synthesize *text* with edge-tts and return (sample_rate, int16 array).

    Parameters
    ----------
    text : str
        Text to speak.
    voice : str
        edge-tts voice name (default: Farsi neural voice).

    NOTE(review): edge-tts streams MP3-encoded bytes, not raw PCM, so
    interpreting them via np.frombuffer(int16) will not produce valid
    samples. A correct fix needs an MP3 decoder (e.g. ffmpeg/pydub) —
    confirm the stream format and decode before returning.
    """
    communicate = edge_tts.Communicate(text, voice)
    chunks = []
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            chunks.append(chunk["data"])
    # Join once instead of repeated bytes += (quadratic in total size).
    audio_data = b"".join(chunks)
    audio_array = np.frombuffer(audio_data, dtype=np.int16)
    sample_rate = 24000  # edge-tts neural voices emit 24 kHz audio
    return sample_rate, audio_array


def audio_to_audio(audio_input):
    """Full pipeline: Farsi speech in -> chatbot reply as speech.

    Parameters
    ----------
    audio_input : tuple[int, np.ndarray]
        (sample_rate, samples) as produced by gr.Audio(type="numpy").

    Returns
    -------
    tuple[int, np.ndarray]
        (sample_rate, samples) of the synthesized reply.
    """
    sample_rate_in, data_in = audio_input

    # Gradio delivers int16 (possibly stereo); the ASR pipeline expects a
    # mono float waveform, so down-mix and normalize to [-1, 1].
    samples = np.asarray(data_in)
    if samples.ndim == 2:
        samples = samples.mean(axis=1)
    if samples.dtype != np.float32:
        samples = samples.astype(np.float32) / 32768.0
    audio = {"array": samples, "sampling_rate": sample_rate_in}

    # Step 1: speech -> text.
    text = stt(audio)["text"]

    # Step 2: generate the chatbot's reply.
    response = chatbot(text, max_length=50, num_return_sequences=1)[0]["generated_text"]

    # Step 3: text -> speech. BUG FIX: tts() is a coroutine function; the
    # original called it without awaiting, yielding an un-unpackable
    # coroutine object. Drive it to completion with asyncio.run().
    sample_rate_out, data_out = asyncio.run(tts(response))
    return (sample_rate_out, data_out)


# Gradio interface.
# NOTE(review): `source=` was renamed to `sources=[...]` in Gradio 4.x;
# `source="microphone"` only works on Gradio 3.x — verify the pinned version.
demo = gr.Interface(
    fn=audio_to_audio,
    inputs=gr.Audio(source="microphone", type="numpy"),
    outputs=gr.Audio(type="numpy"),
    title="Farsi Audio Chatbot",
    description="Speak in Farsi, and the app will respond in Farsi.",
)

# Guard the launch so importing this module does not start the server.
if __name__ == "__main__":
    demo.launch()