import os

import gradio as gr
import librosa
import torch
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline

# Load Wav2Vec2 model and processor for speech-to-text
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Load the conversational model once at startup instead of on every request
conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")


def speech_to_text(audio_file):
    # Load the audio file, resampled to the 16 kHz rate Wav2Vec2 expects
    audio_input, _ = librosa.load(audio_file, sr=16000)
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values

    # Perform speech-to-text
    with torch.no_grad():
        logits = model(input_values).logits

    # Get the predicted ids and convert them back to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription


def generate_response(text):
    # Generate a text response with DialoGPT; note that `generated_text`
    # includes the input prompt as a prefix
    response = conversational_pipeline(text, max_length=50)
    return response[0]["generated_text"]


def process_audio(audio_file):
    # Convert speech to text using Wav2Vec 2.0
    text = speech_to_text(audio_file)
    print(f"User said: {text}")

    # Get the bot's response
    bot_response = generate_response(text)
    print(f"Bot response: {bot_response}")

    # Convert the bot's response to speech
    tts = gTTS(bot_response)
    tts.save("response.mp3")

    # Play the response on the server (requires the mpg321 player to be installed)
    os.system("mpg321 response.mp3")

    return bot_response, "response.mp3"


# Create Gradio interface for audio input/output
# (Gradio 4.x component API; the older gr.inputs/gr.outputs modules were removed)
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
    live=True,
    title="Voice Bot with Wav2Vec2.0",
    description="Speak to the bot and get a response!",
)

# Launch the interface
iface.launch()
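
# Usage note (an assumption about the environment, not part of the original script):
# the Python dependencies above can be installed with
#   pip install gradio torch transformers librosa gTTS
# and server-side playback additionally needs the mpg321 command-line player,
# e.g. on Debian/Ubuntu:
#   sudo apt-get install mpg321
# Running the script (e.g. `python app.py`, a hypothetical filename) serves the
# Gradio app on http://localhost:7860 by default.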