import gradio as gr
from transformers import pipeline
import edge_tts
import numpy as np
import os
import tempfile
# Load STT and chatbot pipelines
stt = pipeline("automatic-speech-recognition", model="openai/whisper-small")
chatbot = pipeline("text-generation", model="HooshvareLab/gpt2-fa")
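# The first call downloads both checkpoints from the Hugging Face Hub;
# later runs load them from the local cache.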
async def tts(text: str, voice: str = "fa-IR-FaridNeural") -> str:
    """Synthesize `text` with edge-tts and return the path to an MP3 file."""
    communicate = edge_tts.Communicate(text, voice)
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]
    # edge-tts streams MP3 frames, not raw PCM, so reinterpreting the bytes
    # with np.frombuffer(..., dtype=np.int16) would produce noise. Write the
    # MP3 to a temp file and let Gradio decode it (the output component below
    # uses type="filepath").
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(audio_data)
        return f.name
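# Quick standalone check of the TTS path (hypothetical snippet, not used by the app):
#   import asyncio
#   print(asyncio.run(tts("سلام دنیا")))  # prints a path like /tmp/tmpXXXXXXXX.mp3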
async def audio_to_audio(audio_input):
    if audio_input is None:
        return None
    sample_rate_in, data_in = audio_input
    # Gradio delivers int16 samples; convert to mono float32 in [-1, 1],
    # which is what the Whisper pipeline expects for array inputs.
    if data_in.ndim > 1:
        data_in = data_in.mean(axis=1)
    data_in = data_in.astype(np.float32) / 32768.0
    audio = {"array": data_in, "sampling_rate": sample_rate_in}
    # 1. ASR → text
    text = stt(audio)["text"]
    # 2. Generate a response; generated_text echoes the prompt, so strip it
    generated = chatbot(text, max_length=50, num_return_sequences=1)[0]["generated_text"]
    response = generated[len(text):].strip() or generated
    # 3. TTS → path to an MP3 file
    return await tts(response)
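# Gradio runs async endpoint functions on its own event loop, so
# audio_to_audio can be passed to gr.Interface directly, without asyncio.run().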
# Gradio interface
demo = gr.Interface(
    fn=audio_to_audio,
    inputs=gr.Audio(
        sources=["microphone"],  # 'sources' replaced the deprecated 'source' argument
        type="numpy",
        label="Speak in Farsi",
    ),
    outputs=gr.Audio(type="filepath", label="Response in Farsi"),
    title="Farsi Audio Chatbot",
    description="Speak in Farsi, and the app will respond in Farsi audio.",
    allow_flagging="never",
)
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
    )
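# Local run sketch (assumes these packages are installed; torch backs the
# transformers pipelines):
#   pip install gradio transformers torch edge-tts numpy
#   python app.py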