# NOTE: removed non-code artifacts left over from a web-page copy/paste
# (a "File size" header, git commit hashes, and a copied line-number
# gutter) — they were not valid Python and broke the module.
import gradio as gr
from transformers import pipeline
import edge_tts
import numpy as np
import asyncio

# Print Gradio version for debugging (the Audio component API changed
# between Gradio 3.x and 4.x, so the running version matters).
print(f"Gradio version: {gr.__version__}")

# Speech-to-text pipeline. Whisper small is multilingual and can
# transcribe Farsi; the model weights are downloaded on first run.
stt = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Text-generation pipeline used as the chatbot (GPT-2 trained on Farsi).
chatbot = pipeline("text-generation", model="HooshvareLab/gpt2-fa")

# Function to convert text to speech using edge-tts
async def tts(text, voice="fa-IR-FaridNeural"):
    """Synthesize `text` with Microsoft Edge TTS.

    Args:
        text: Text to speak.
        voice: Edge TTS voice name (default: a Farsi neural voice).

    Returns:
        (sample_rate, audio_array) — an int sample rate and an int16
        numpy array, the shape Gradio's numpy audio output expects.
    """
    communicate = edge_tts.Communicate(text, voice)
    # Collect chunks and join once; repeated `bytes +=` is quadratic.
    chunks = []
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            chunks.append(chunk["data"])
    audio_data = b"".join(chunks)
    # NOTE(review): edge-tts streams MP3-encoded audio by default, so
    # reinterpreting the raw bytes as int16 PCM is likely incorrect (the
    # result may play as noise). Consider saving the bytes to an .mp3
    # file and using a filepath-typed output instead — TODO confirm.
    # Drop a trailing odd byte so np.frombuffer(int16) cannot raise
    # "buffer size must be a multiple of element size".
    if len(audio_data) % 2:
        audio_data = audio_data[:-1]
    audio_array = np.frombuffer(audio_data, dtype=np.int16)
    sample_rate = 24000  # edge-tts neural voices emit 24 kHz audio
    return sample_rate, audio_array

# Main function: Audio-to-audio pipeline (speech -> text -> reply -> speech)
async def audio_to_audio(audio_input):
    """Transcribe Farsi speech, generate a reply, and speak it back.

    Args:
        audio_input: Gradio numpy audio input — a (sample_rate, data)
            tuple, or None when no recording was made.

    Returns:
        (sample_rate, samples) for the audio output, or None when there
        is no input.
    """
    if audio_input is None:
        # Bug fix: the interface has a single audio output, so returning
        # a (None, "message") tuple here would be misread by Gradio as
        # (sample_rate, data). Return None to clear the output instead.
        return None
    sample_rate_in, data_in = audio_input

    # Gradio's numpy microphone input is int16; the Whisper pipeline
    # expects float32 samples in [-1.0, 1.0], so normalize first.
    data = np.asarray(data_in)
    if data.dtype != np.float32:
        data = data.astype(np.float32) / 32768.0
    if data.ndim > 1:
        # Downmix stereo to mono; Whisper works on a single channel.
        data = data.mean(axis=1)
    audio = {"array": data, "sampling_rate": sample_rate_in}

    # Step 1: Convert speech to text
    text = stt(audio)["text"]

    # Step 2: Generate chatbot response
    response = chatbot(text, max_length=50, num_return_sequences=1)[0]["generated_text"]

    # Step 3: Convert text to speech
    sample_rate_out, data_out = await tts(response)

    return (sample_rate_out, data_out)

# Gradio interface wiring the async pipeline to a mic-in / audio-out UI.
demo = gr.Interface(
    fn=audio_to_audio,
    # Bug fix: Gradio 4.x replaced the `source=` keyword with
    # `sources=[...]`; the old spelling raises a TypeError there (which
    # is presumably why the version is printed at startup).
    inputs=gr.Audio(sources=["microphone"], type="numpy", label="Speak in Farsi"),
    outputs=gr.Audio(type="numpy", label="Response in Farsi"),
    title="Farsi Audio Chatbot",
    description="Speak in Farsi, and the app will respond in Farsi audio.",
)

# Launch the app only when run as a script, so importing this module
# (e.g. from a test or a hosting framework) does not start a server.
if __name__ == "__main__":
    demo.launch()