dschandra committed
Commit b10a4cb · verified · 1 Parent(s): 628f952

Update app.py

Files changed (1):
  1. app.py +42 -27
app.py CHANGED
@@ -1,46 +1,61 @@
  import gradio as gr
- from transformers import pipeline
- from vosk import Model, KaldiRecognizer
- import os
- import wave
  from gtts import gTTS

- # Load the Hugging Face conversational model
- conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")

- # Initialize the Vosk ASR model
- model = Model("model")  # Download the Vosk model beforehand
- recognizer = KaldiRecognizer(model, 16000)

  def process_audio(audio_file):
-     # Convert the audio file to text using Vosk
-     audio = audio_file.getarray()  # Get audio data as array
-     if recognizer.AcceptWaveform(audio):
-         text = recognizer.Result()  # Convert to text
-     else:
-         text = "Sorry, I couldn't understand that."
-
-     # Use Hugging Face's model to get a response
-     response = conversational_pipeline(text, max_length=50)  # Limit the response length
-     bot_response = response[0]['generated_text']
-
-     # Convert the bot's response to speech using gTTS
      tts = gTTS(bot_response)
      tts.save("response.mp3")
-
-     # Play the audio file
-     os.system("mpg321 response.mp3")  # Make sure mpg321 is installed in the Hugging Face space

      return bot_response, "response.mp3"

- # Create Gradio interface
  iface = gr.Interface(
      fn=process_audio,
      inputs=gr.inputs.Audio(source="microphone", type="file"),
      outputs=[gr.outputs.Textbox(), gr.outputs.Audio(type="file")],
      live=True,
-     title="Voice Bot",
-     description="Talk to the bot, and it will respond!"
  )

  # Launch the interface
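Review note: the removed Vosk path could not have worked as written. Gradio's audio input has no `getarray()` method, `KaldiRecognizer.AcceptWaveform()` expects raw PCM bytes fed in chunks, and `Result()`/`FinalResult()` return a JSON string rather than plain text. A minimal sketch of what a working Vosk version would have looked like, assuming a 16 kHz mono WAV on disk and a downloaded model directory named `model` as in the removed code:

```python
import json
import wave

from vosk import Model, KaldiRecognizer

model = Model("model")                      # path to a downloaded Vosk model
recognizer = KaldiRecognizer(model, 16000)  # must match the WAV sample rate

def vosk_transcribe(wav_path):
    # Feed the recognizer raw PCM bytes in chunks, not a whole array
    with wave.open(wav_path, "rb") as wf:
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            recognizer.AcceptWaveform(data)
    # FinalResult() returns a JSON string like '{"text": "..."}'
    return json.loads(recognizer.FinalResult()).get("text", "")
```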
 
  import gradio as gr
+ import torch
+ import librosa
+ from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor
  from gtts import gTTS
+ import os
+
+ # Load Wav2Vec2 model and processor for speech-to-text
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
+
+ def speech_to_text(audio_file):
+     # Load the audio at 16 kHz, the sampling rate Wav2Vec 2.0 expects
+     audio_input, _ = librosa.load(audio_file, sr=16000)
+     input_values = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_values
+
+     # Perform speech-to-text
+     with torch.no_grad():
+         logits = model(input_values).logits

+     # Get the predicted ids and convert them back to text
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.decode(predicted_ids[0])

+     return transcription
+
+ def generate_response(text):
+     # Use a Hugging Face text-generation pipeline for the reply
+     # (any conversational model such as DialoGPT works here; note the
+     # pipeline is reloaded on every call and could be hoisted to module level)
+     conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")
+     response = conversational_pipeline(text, max_length=50)
+     return response[0]['generated_text']

  def process_audio(audio_file):
+     # Convert speech to text using Wav2Vec 2.0
+     text = speech_to_text(audio_file)
+     print(f"User said: {text}")
+
+     # Get the bot's response
+     bot_response = generate_response(text)
+     print(f"Bot response: {bot_response}")
+
+     # Convert the bot's response to speech
      tts = gTTS(bot_response)
      tts.save("response.mp3")
+
+     # Play the response
+     os.system("mpg321 response.mp3")

      return bot_response, "response.mp3"

+ # Create Gradio interface for audio input/output
  iface = gr.Interface(
      fn=process_audio,
      inputs=gr.inputs.Audio(source="microphone", type="file"),
      outputs=[gr.outputs.Textbox(), gr.outputs.Audio(type="file")],
      live=True,
+     title="Voice Bot with Wav2Vec2.0",
+     description="Speak to the bot and get a response!"
  )

  # Launch the interface
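Review note: the new version still has two issues the diff does not touch. The `gr.inputs` / `gr.outputs` modules were removed in Gradio 3.x, so this file will not start on a current Space image, and `os.system("mpg321 response.mp3")` has no audio device to play through on a headless Space (the returned file path is what the Gradio player actually uses). An untested sketch of the same interface against the Gradio 4.x API, reusing the `process_audio` defined above:

```python
# Sketch only, assuming Gradio 4.x component names.
# type="filepath" hands process_audio a path on disk, which is what
# librosa.load() inside speech_to_text() expects.
import gradio as gr

iface = gr.Interface(
    fn=process_audio,  # defined in app.py above
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[gr.Textbox(label="Bot response"), gr.Audio(type="filepath")],
    title="Voice Bot with Wav2Vec2.0",
    description="Speak to the bot and get a response!",
)

iface.launch()
```

Either way, the Space's requirements.txt would need gradio, torch, transformers, librosa, and gTTS for the new code path to import at all.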