import asyncio
import tempfile

import gradio as gr
import edge_tts
import numpy as np
from transformers import pipeline

# Load speech-to-text model (multilingual Whisper small, which covers Farsi)
stt = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Load chatbot model (GPT-2 fine-tuned for Farsi)
chatbot = pipeline("text-generation", model="HooshvareLab/gpt2-fa")
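
# Note: Whisper auto-detects the spoken language; to pin transcription to
# Farsi, the stt(...) call below could pass
# generate_kwargs={"language": "persian", "task": "transcribe"}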

# Function to convert text to speech using edge-tts
async def tts(text, voice="fa-IR-FaridNeural"):
    communicate = edge_tts.Communicate(text, voice)
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]
    # The stream is MP3-encoded (24 kHz mono by default), not raw PCM, so
    # np.frombuffer(..., dtype=np.int16) would yield noise; write the MP3
    # bytes to a temporary file and let Gradio decode it instead
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(audio_data)
    return f.name

# Main function: audio-to-audio pipeline
def audio_to_audio(audio_input):
    sample_rate_in, data_in = audio_input
    # Gradio delivers int16 samples; the Whisper pipeline expects mono
    # float32 in [-1, 1], so downmix and normalize first
    if data_in.ndim > 1:
        data_in = data_in.mean(axis=1)
    data_in = data_in.astype(np.float32) / 32768.0
    audio = {"array": data_in, "sampling_rate": sample_rate_in}
    # Step 1: Convert speech to text
    text = stt(audio)["text"]
    # Step 2: Generate chatbot response
    response = chatbot(text, max_length=50, num_return_sequences=1)[0]["generated_text"]
    # Step 3: Convert text to speech; tts() is a coroutine, so it must be
    # driven by an event loop rather than called like a regular function
    return asyncio.run(tts(response))
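
# Note: text-generation pipelines return the prompt plus the continuation by
# default, so the spoken reply echoes the user's words; passing
# return_full_text=False to chatbot(...) would keep only the new text.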
# Gradio interface
demo = gr.Interface(
fn=audio_to_audio,
inputs=gr.Audio(source="microphone", type="numpy"),
outputs=gr.Audio(type="numpy"),
title="Farsi Audio Chatbot",
description="Speak in Farsi, and the app will respond in Farsi."
)
# Launch the app
demo.launch()
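
# To run locally (dependencies inferred from the imports above; PyTorch is
# assumed as the transformers backend):
#   pip install gradio transformers torch edge-tts numpy
#   python app.py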