dschandra committed
Commit b10a4cb · verified · 1 Parent(s): 628f952

Update app.py

Files changed (1):
  1. app.py +42 -27
app.py CHANGED
@@ -1,46 +1,61 @@
  import gradio as gr
- from transformers import pipeline
- from vosk import Model, KaldiRecognizer
- import os
- import wave
  from gtts import gTTS

- # Load the Hugging Face conversational model
- conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")

- # Initialize the Vosk ASR model
- model = Model("model")  # Download the Vosk model beforehand
- recognizer = KaldiRecognizer(model, 16000)

  def process_audio(audio_file):
-     # Convert the audio file to text using Vosk
-     audio = audio_file.getarray()  # Get audio data as array
-     if recognizer.AcceptWaveform(audio):
-         text = recognizer.Result()  # Convert to text
-     else:
-         text = "Sorry, I couldn't understand that."
-
-     # Use Hugging Face's model to get a response
-     response = conversational_pipeline(text, max_length=50)  # Limit the response length
-     bot_response = response[0]['generated_text']
-
-     # Convert the bot's response to speech using gTTS
      tts = gTTS(bot_response)
      tts.save("response.mp3")
-
-     # Play the audio file
-     os.system("mpg321 response.mp3")  # Make sure mpg321 is installed in the Hugging Face space

      return bot_response, "response.mp3"

- # Create Gradio interface
  iface = gr.Interface(
      fn=process_audio,
      inputs=gr.inputs.Audio(source="microphone", type="file"),
      outputs=[gr.outputs.Textbox(), gr.outputs.Audio(type="file")],
      live=True,
-     title="Voice Bot",
-     description="Talk to the bot, and it will respond!"
  )

  # Launch the interface
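Review note: the removed Vosk path could not have worked as written. Gradio's audio input has no `getarray()` method, `KaldiRecognizer.AcceptWaveform()` expects raw PCM bytes fed in chunks, and `Result()`/`FinalResult()` return a JSON string rather than plain text. A minimal sketch of what a working Vosk version would have looked like, assuming a 16 kHz mono WAV on disk and a downloaded model directory named `model` as in the removed code:

```python
import json
import wave

from vosk import Model, KaldiRecognizer

model = Model("model")                      # path to a downloaded Vosk model
recognizer = KaldiRecognizer(model, 16000)  # must match the WAV sample rate

def vosk_transcribe(wav_path):
    # Feed the recognizer raw PCM bytes in chunks, not a whole array
    with wave.open(wav_path, "rb") as wf:
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            recognizer.AcceptWaveform(data)
    # FinalResult() returns a JSON string like '{"text": "..."}'
    return json.loads(recognizer.FinalResult()).get("text", "")
```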
 
  import gradio as gr
+ import torch
+ import librosa
+ from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor
  from gtts import gTTS
+ import os
+
+ # Load Wav2Vec2 model and processor for speech-to-text
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
+
+ def speech_to_text(audio_file):
+     # Load the audio at 16 kHz, the sampling rate Wav2Vec 2.0 expects
+     audio_input, _ = librosa.load(audio_file, sr=16000)
+     input_values = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_values
+
+     # Perform speech-to-text
+     with torch.no_grad():
+         logits = model(input_values).logits

+     # Get the predicted ids and convert them back to text
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.decode(predicted_ids[0])

+     return transcription
+
+ def generate_response(text):
+     # Use a Hugging Face text-generation pipeline for the reply
+     # (any conversational model such as DialoGPT works here; note the
+     # pipeline is reloaded on every call and could be hoisted to module level)
+     conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")
+     response = conversational_pipeline(text, max_length=50)
+     return response[0]['generated_text']

  def process_audio(audio_file):
+     # Convert speech to text using Wav2Vec 2.0
+     text = speech_to_text(audio_file)
+     print(f"User said: {text}")
+
+     # Get the bot's response
+     bot_response = generate_response(text)
+     print(f"Bot response: {bot_response}")
+
+     # Convert the bot's response to speech
      tts = gTTS(bot_response)
      tts.save("response.mp3")
+
+     # Play the response
+     os.system("mpg321 response.mp3")

      return bot_response, "response.mp3"

+ # Create Gradio interface for audio input/output
  iface = gr.Interface(
      fn=process_audio,
      inputs=gr.inputs.Audio(source="microphone", type="file"),
      outputs=[gr.outputs.Textbox(), gr.outputs.Audio(type="file")],
      live=True,
+     title="Voice Bot with Wav2Vec2.0",
+     description="Speak to the bot and get a response!"
  )

  # Launch the interface
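Review note: the new version still has two issues the diff does not touch. The `gr.inputs` / `gr.outputs` modules were removed in Gradio 3.x, so this file will not start on a current Space image, and `os.system("mpg321 response.mp3")` has no audio device to play through on a headless Space (the returned file path is what the Gradio player actually uses). An untested sketch of the same interface against the Gradio 4.x API, reusing the `process_audio` defined above:

```python
# Sketch only, assuming Gradio 4.x component names.
# type="filepath" hands process_audio a path on disk, which is what
# librosa.load() inside speech_to_text() expects.
import gradio as gr

iface = gr.Interface(
    fn=process_audio,  # defined in app.py above
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[gr.Textbox(label="Bot response"), gr.Audio(type="filepath")],
    title="Voice Bot with Wav2Vec2.0",
    description="Speak to the bot and get a response!",
)

iface.launch()
```

Either way, the Space's requirements.txt would need gradio, torch, transformers, librosa, and gTTS for the new code path to import at all.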