# AIVoice / app.py
import os

import gradio as gr
import librosa
import torch
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline

# Load the Wav2Vec2 model and processor for speech-to-text (expects 16 kHz mono audio)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Load the text-generation pipeline once at startup instead of on every request
conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")
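# Optional sketch (not in the original app): move ASR inference to GPU when one
# is available; input_values inside speech_to_text would also need .to(device).
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)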
def speech_to_text(audio_file):
    # Load the audio file, resampling to the 16 kHz rate Wav2Vec 2.0 expects
    audio_input, _ = librosa.load(audio_file, sr=16000)
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values

    # Run inference to get per-frame character logits
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: pick the most likely token per frame, then collapse
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription
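# Example usage (sketch, assuming a local recording named "sample.wav" exists):
#   print(speech_to_text("sample.wav"))
# This checkpoint emits uppercase, unpunctuated text, e.g. "HELLO THERE".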
def generate_response(text):
    # Generate a reply with the DialoGPT pipeline loaded above.
    # return_full_text=False drops the prompt so only the new reply comes back;
    # pad_token_id silences the "no pad token" warning for GPT-2-style models.
    response = conversational_pipeline(
        text,
        max_length=50,
        return_full_text=False,
        pad_token_id=conversational_pipeline.tokenizer.eos_token_id,
    )
    return response[0]["generated_text"]
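# Example usage (sketch; actual output varies from run to run):
#   generate_response("Hello, how are you?")
#   -> a short conversational reply such as "I'm good, how about you?"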
def process_audio(audio_file):
    # Convert speech to text using Wav2Vec 2.0
    text = speech_to_text(audio_file)
    print(f"User said: {text}")

    # Get the bot's text response
    bot_response = generate_response(text)
    print(f"Bot response: {bot_response}")

    # Convert the bot's response to speech. The saved file is returned to the
    # Gradio audio component, which plays it in the browser, so no server-side
    # player (e.g. mpg321) is needed.
    tts = gTTS(bot_response)
    tts.save("response.mp3")
    return bot_response, "response.mp3"
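# Optional end-to-end smoke test (sketch, assuming "test_input.wav" exists):
#   reply, audio_path = process_audio("test_input.wav")
#   print(reply, audio_path)  # e.g. a short reply and "response.mp3"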
# Create the Gradio interface for audio input/output.
# gr.inputs / gr.outputs were removed in recent Gradio versions; components are
# used directly, and type="filepath" hands process_audio a path to the recording.
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[gr.Textbox(label="Bot response"), gr.Audio(type="filepath")],
    live=True,
    title="Voice Bot with Wav2Vec 2.0",
    description="Speak to the bot and get a response!",
)

# Launch the interface
iface.launch()