|
import gradio as gr |
|
import torch |
|
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor |
|
from gtts import gTTS |
|
import os |
|
|
|
|
|
# Load the pre-trained Wav2Vec2 ASR processor and model once at module
# import time (the weights are downloaded on first run, so startup can
# take a while and requires network access).
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
|
|
|
def speech_to_text(audio_file):
    """Transcribe an audio file to text with the module-level Wav2Vec2 model.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to transcribe.

    Returns
    -------
    str
        The transcription produced by greedy CTC decoding.
    """
    # Resample to 16 kHz, the rate the facebook/wav2vec2-large-960h
    # checkpoint expects.
    audio_input, _ = librosa.load(audio_file, sr=16000)

    # Pass sampling_rate explicitly so the processor does not have to
    # assume a default (transformers warns otherwise).
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values

    # Inference only -- no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: take the most likely token at each time step,
    # then collapse repeats/blanks via the processor's decoder.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])

    return transcription
|
|
|
@functools.lru_cache(maxsize=1)
def _get_conversational_pipeline():
    """Build the DialoGPT text-generation pipeline once and cache it."""
    return pipeline("text-generation", model="microsoft/DialoGPT-medium")


def generate_response(text):
    """Generate a conversational reply to *text* using DialoGPT-medium.

    Parameters
    ----------
    text : str
        The user's utterance (prompt for the language model).

    Returns
    -------
    str
        The generated text (prompt + continuation, capped at 50 tokens).
    """
    # The original rebuilt the pipeline -- reloading the whole model --
    # on every call; the cached helper loads it exactly once.
    conversational_pipeline = _get_conversational_pipeline()
    response = conversational_pipeline(text, max_length=50)
    return response[0]['generated_text']
|
|
|
def process_audio(audio_file):
    """Run the full voice-bot pipeline for one recorded utterance.

    Steps: speech -> text (Wav2Vec2), text -> reply (DialoGPT),
    reply -> speech (gTTS saved to ``response.mp3``).

    Parameters
    ----------
    audio_file : str
        Path to the recorded audio from the Gradio microphone input.

    Returns
    -------
    tuple[str, str]
        The bot's text reply and the path of the synthesized mp3.
    """
    # 1. Transcribe the user's speech.
    text = speech_to_text(audio_file)
    print(f"User said: {text}")

    # 2. Generate a conversational reply.
    bot_response = generate_response(text)
    print(f"Bot response: {bot_response}")

    # 3. Synthesize the reply (gTTS needs network access).
    tts = gTTS(bot_response)
    tts.save("response.mp3")

    # Best-effort server-side playback. An argument list via
    # subprocess.run avoids the shell invocation os.system() used;
    # a missing mpg321 binary stays non-fatal (as it was with
    # os.system), since the Gradio client plays the mp3 itself.
    try:
        subprocess.run(["mpg321", "response.mp3"], check=False)
    except FileNotFoundError:
        pass  # mpg321 not installed; the web UI still gets the file

    return bot_response, "response.mp3"
|
|
|
|
|
# Wire the pipeline into a Gradio web UI.
# NOTE(review): ``gr.inputs`` / ``gr.outputs`` is the legacy pre-3.0
# Gradio namespace (removed in Gradio 3+); this file needs a pinned
# ``gradio<3`` -- confirm the installed version, or migrate to
# ``gr.Audio(...)`` / ``gr.Textbox()`` directly.
iface = gr.Interface(

    fn=process_audio,

    # Record from the microphone; hand the handler a file path.
    inputs=gr.inputs.Audio(source="microphone", type="file"),

    # Show the text reply and play the synthesized mp3.
    outputs=[gr.outputs.Textbox(), gr.outputs.Audio(type="file")],

    # Re-run the handler as input changes rather than on a submit click.
    live=True,

    title="Voice Bot with Wav2Vec2.0",

    description="Speak to the bot and get a response!"

)




# Start the local web server (blocks until interrupted).
iface.launch()
|
|