import os

import gradio as gr
import librosa
import torch
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline

# Load Wav2Vec2 model and processor for speech-to-text
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Load the conversational model once at startup instead of on every request
conversational_pipeline = pipeline("text-generation", model="microsoft/DialoGPT-medium")


def speech_to_text(audio_file):
    # Load the audio file, resampled to the 16 kHz rate Wav2Vec2 expects
    audio_input, _ = librosa.load(audio_file, sr=16000)
    input_values = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_values

    # Perform speech-to-text
    with torch.no_grad():
        logits = model(input_values).logits

    # Get the predicted ids and convert them back to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription


def generate_response(text):
    # Generate a text response with DialoGPT; note that `generated_text`
    # includes the input prompt as a prefix
    response = conversational_pipeline(text, max_length=50)
    return response[0]["generated_text"]


def process_audio(audio_file):
    # Convert speech to text using Wav2Vec 2.0
    text = speech_to_text(audio_file)
    print(f"User said: {text}")

    # Get the bot's response
    bot_response = generate_response(text)
    print(f"Bot response: {bot_response}")

    # Convert the bot's response to speech
    tts = gTTS(bot_response)
    tts.save("response.mp3")

    # Play the response on the server (requires the mpg321 player to be installed)
    os.system("mpg321 response.mp3")

    return bot_response, "response.mp3"


# Create Gradio interface for audio input/output
# (Gradio 4.x component API; the older gr.inputs/gr.outputs modules were removed)
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
    live=True,
    title="Voice Bot with Wav2Vec2.0",
    description="Speak to the bot and get a response!",
)

# Launch the interface
iface.launch()
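
# Usage note (an assumption about the environment, not part of the original script):
# the Python dependencies above can be installed with
#   pip install gradio torch transformers librosa gTTS
# and server-side playback additionally needs the mpg321 command-line player,
# e.g. on Debian/Ubuntu:
#   sudo apt-get install mpg321
# Running the script (e.g. `python app.py`, a hypothetical filename) serves the
# Gradio app on http://localhost:7860 by default.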