import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import (
    SpeechT5ForSpeechToText,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

# Use CUDA if available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the SpeechT5 processors and models.
# The ASR and TTS checkpoints each ship with their own processor, so load one for each.
asr_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(device)

tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# HiFi-GAN vocoder that turns the generated spectrogram into a waveform
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# SpeechT5 TTS expects a 512-dimensional x-vector speaker embedding.
# A random embedding keeps the demo self-contained; a real x-vector
# (see the note at the end of this file) gives a much more natural voice.
speaker_embeddings = torch.randn(1, 512).to(device)

TARGET_SAMPLE_RATE = 16000  # Both SpeechT5 checkpoints operate on 16 kHz audio


def _to_mono_float32(audio_data: np.ndarray) -> np.ndarray:
    """Convert Gradio's PCM array (possibly stereo, possibly int16) to mono float32 in [-1, 1]."""
    audio = audio_data.astype(np.float32)
    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # Down-mix stereo to mono
    if np.issubdtype(audio_data.dtype, np.integer):
        audio = audio / np.iinfo(audio_data.dtype).max  # Normalize to [-1.0, 1.0]
    return audio


def _resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Simple linear-interpolation resampling (sufficient for a speech demo)."""
    if orig_sr == target_sr:
        return audio
    duration = audio.shape[0] / orig_sr
    target_length = int(round(duration * target_sr))
    old_times = np.linspace(0.0, duration, num=audio.shape[0], endpoint=False)
    new_times = np.linspace(0.0, duration, num=target_length, endpoint=False)
    return np.interp(new_times, old_times, audio).astype(np.float32)


# Function to convert speech to text
def speech_to_text(audio_array: np.ndarray) -> str:
    inputs = asr_processor(
        audio=audio_array, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        # SpeechT5 ASR is an encoder-decoder model, so decode with generate()
        predicted_ids = asr_model.generate(**inputs, max_length=100)
    transcription = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription


# Function to convert text to speech
def text_to_speech(text: str) -> torch.Tensor:
    input_ids = tts_processor(text=text, return_tensors="pt")["input_ids"].to(device)
    with torch.no_grad():
        speech = tts_model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)
    return speech


# Gradio demo
def main():
    with gr.Blocks() as demo:
        gr.Markdown("# Voice Chatbot")
        gr.Markdown("Simply speak into the microphone and get an audio response.")

        audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak")
        audio_output = gr.Audio(label="Response", autoplay=True)
        transcript_display = gr.Textbox(label="Conversation")

        def process_audio(audio):
            if audio is None:
                return None, "No audio detected."

            # Gradio delivers (sample_rate, PCM array); convert to mono float32 at 16 kHz
            sample_rate, audio_data = audio
            audio_data = _to_mono_float32(audio_data)
            audio_data = _resample(audio_data, sample_rate, TARGET_SAMPLE_RATE)

            # Speech-to-text
            transcript = speech_to_text(audio_data)
            print(f"Transcribed: {transcript}")

            # Generate response (for simplicity, echo the transcript)
            response_text = transcript
            print(f"Response: {response_text}")

            # Text-to-speech
            response_audio = text_to_speech(response_text)

            # Save the response audio to a temporary file, read it back, then clean up
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                sf.write(temp_file.name, response_audio.cpu().numpy(), TARGET_SAMPLE_RATE)
                temp_filename = temp_file.name

            out_audio, out_sample_rate = sf.read(temp_filename)
            os.unlink(temp_filename)

            return (out_sample_rate, out_audio), f"You: {transcript}\nAssistant: {response_text}"

        audio_input.change(
            process_audio,
            inputs=[audio_input],
            outputs=[audio_output, transcript_display],
        )

        clear_btn = gr.Button("Clear Conversation")
        clear_btn.click(lambda: (None, ""), outputs=[audio_output, transcript_display])

    demo.launch()


if __name__ == "__main__":
    main()
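
# Note: the random `speaker_embeddings` defined near the top keeps this demo
# self-contained but sounds robotic. As a sketch (assuming the `datasets`
# package is installed and the "Matthijs/cmu-arctic-xvectors" dataset is
# available on the Hugging Face Hub), a real x-vector can be loaded and
# assigned to `speaker_embeddings` before launching the app, e.g.:
#
#     from datasets import load_dataset
#
#     xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#     speaker_embeddings = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0).to(device)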