EmRa228 committed on
Commit cf014fb · verified · 1 Parent(s): 6e7eeda

Update app.py

Files changed (1):
  1. app.py +20 -23
app.py CHANGED
@@ -3,53 +3,50 @@ from transformers import pipeline
 import edge_tts
 import numpy as np
 import asyncio
+import os
 
-# Print Gradio version for debugging
-print(f"Gradio version: {gr.__version__}")
-
-# Load speech-to-text model (Whisper small for Farsi)
+# Load STT and chatbot pipelines
 stt = pipeline("automatic-speech-recognition", model="openai/whisper-small")
-
-# Load chatbot model (GPT2 fine-tuned for Farsi)
 chatbot = pipeline("text-generation", model="HooshvareLab/gpt2-fa")
 
-# Function to convert text to speech using edge-tts
-async def tts(text, voice="fa-IR-FaridNeural"):
+async def tts(text: str, voice: str = "fa-IR-FaridNeural"):
     communicate = edge_tts.Communicate(text, voice)
     audio_data = b""
     async for chunk in communicate.stream():
         if chunk["type"] == "audio":
             audio_data += chunk["data"]
     audio_array = np.frombuffer(audio_data, dtype=np.int16)
-    sample_rate = 24000  # As per edge-tts documentation
+    sample_rate = 24000
     return sample_rate, audio_array
 
-# Main function: Audio-to-audio pipeline
 async def audio_to_audio(audio_input):
     if audio_input is None:
         return None, "No audio input received."
     sample_rate_in, data_in = audio_input
     audio = {"array": data_in, "sampling_rate": sample_rate_in}
-
-    # Step 1: Convert speech to text
+    # 1. ASR → text
     text = stt(audio)["text"]
-
-    # Step 2: Generate chatbot response
+    # 2. Generate response
     response = chatbot(text, max_length=50, num_return_sequences=1)[0]["generated_text"]
-
-    # Step 3: Convert text to speech
-    sample_rate_out, data_out = await tts(response)
-
-    return (sample_rate_out, data_out)
+    # 3. TTS
+    return await tts(response)
 
 # Gradio interface
 demo = gr.Interface(
     fn=audio_to_audio,
-    inputs=gr.Audio(source="microphone", type="numpy", label="Speak in Farsi"),
+    inputs=gr.Audio(
+        sources=["microphone"],  # Use 'sources' instead of deprecated 'source'
+        type="numpy",
+        label="Speak in Farsi"
+    ),
     outputs=gr.Audio(type="numpy", label="Response in Farsi"),
     title="Farsi Audio Chatbot",
-    description="Speak in Farsi, and the app will respond in Farsi audio."
+    description="Speak in Farsi, and the app will respond in Farsi audio.",
+    allow_flagging="never"
 )
 
-# Launch the app
-demo.launch()
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=int(os.environ.get("PORT", 7860))
+    )
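One bug this commit leaves untouched: tts concatenates the streamed chunks and reinterprets them with np.frombuffer(audio_data, dtype=np.int16), but edge-tts streams compressed MP3 by default, not raw PCM, so the hard-coded 24000 Hz rate and the int16 cast produce garbage samples. A minimal sketch of a decoding fix, assuming pydub is installed and an ffmpeg binary is on the PATH (neither is confirmed by this repo's requirements):

import io

import edge_tts
import numpy as np
from pydub import AudioSegment  # assumption: pydub + ffmpeg available


async def tts(text: str, voice: str = "fa-IR-FaridNeural"):
    # Hypothetical revision, not part of this commit.
    communicate = edge_tts.Communicate(text, voice)
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]
    # edge-tts streams MP3 by default, so decode it rather than
    # reinterpreting the compressed bytes as int16 PCM samples.
    segment = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
    samples = np.array(segment.get_array_of_samples(), dtype=np.int16)
    return segment.frame_rate, samples

Gradio's numpy audio output accepts the (sample_rate, int16 array) tuple unchanged, so callers need no edits.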
 
 
 
 
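The error path is also worth revisiting: audio_to_audio returns (None, "No audio input received.") to an Interface with a single audio output, and the raw int16 microphone array goes straight into Whisper, which expects mono float32. A hedged sketch of both fixes, reusing the app's stt, chatbot, and tts globals; pinning Whisper to Farsi via generate_kwargs is the documented transformers pattern, but treat the exact keys as an assumption:

import gradio as gr
import numpy as np


async def audio_to_audio(audio_input):
    # Hypothetical revision, not part of this commit; reuses the app's
    # stt, chatbot, and tts globals.
    if audio_input is None:
        # With a single audio output, raising gr.Error surfaces the
        # message in the UI instead of feeding it a (None, str) tuple.
        raise gr.Error("No audio input received.")
    sample_rate_in, data_in = audio_input
    # Whisper expects mono float32 in [-1, 1]; the microphone gives int16.
    data = data_in.astype(np.float32) / 32768.0
    if data.ndim > 1:
        data = data.mean(axis=1)
    text = stt(
        {"array": data, "sampling_rate": sample_rate_in},
        generate_kwargs={"language": "persian", "task": "transcribe"},
    )["text"]
    response = chatbot(text, max_length=50, num_return_sequences=1)[0]["generated_text"]
    return await tts(response)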