import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import (
    SpeechT5ForSpeechToText,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

# Use CUDA if available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the SpeechT5 processors and models.
# The ASR and TTS checkpoints each ship with their own processor, so load one for each.
asr_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(device)

tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# HiFi-GAN vocoder that turns the generated spectrogram into a waveform
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# SpeechT5 TTS expects a 512-dimensional x-vector speaker embedding.
# A random embedding keeps the demo self-contained; a real x-vector
# (see the note at the end of this file) gives a much more natural voice.
speaker_embeddings = torch.randn(1, 512).to(device)

TARGET_SAMPLE_RATE = 16000  # Both SpeechT5 checkpoints operate on 16 kHz audio


def _to_mono_float32(audio_data: np.ndarray) -> np.ndarray:
    """Convert Gradio's PCM array (possibly stereo, possibly int16) to mono float32 in [-1, 1]."""
    audio = audio_data.astype(np.float32)
    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # Down-mix stereo to mono
    if np.issubdtype(audio_data.dtype, np.integer):
        audio = audio / np.iinfo(audio_data.dtype).max  # Normalize to [-1.0, 1.0]
    return audio


def _resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Simple linear-interpolation resampling (sufficient for a speech demo)."""
    if orig_sr == target_sr:
        return audio
    duration = audio.shape[0] / orig_sr
    target_length = int(round(duration * target_sr))
    old_times = np.linspace(0.0, duration, num=audio.shape[0], endpoint=False)
    new_times = np.linspace(0.0, duration, num=target_length, endpoint=False)
    return np.interp(new_times, old_times, audio).astype(np.float32)


# Function to convert speech to text
def speech_to_text(audio_array: np.ndarray) -> str:
    inputs = asr_processor(
        audio=audio_array, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        # SpeechT5 ASR is an encoder-decoder model, so decode with generate()
        predicted_ids = asr_model.generate(**inputs, max_length=100)
    transcription = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription


# Function to convert text to speech
def text_to_speech(text: str) -> torch.Tensor:
    input_ids = tts_processor(text=text, return_tensors="pt")["input_ids"].to(device)
    with torch.no_grad():
        speech = tts_model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)
    return speech


# Gradio demo
def main():
    with gr.Blocks() as demo:
        gr.Markdown("# Voice Chatbot")
        gr.Markdown("Simply speak into the microphone and get an audio response.")

        audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak")
        audio_output = gr.Audio(label="Response", autoplay=True)
        transcript_display = gr.Textbox(label="Conversation")

        def process_audio(audio):
            if audio is None:
                return None, "No audio detected."

            # Gradio delivers (sample_rate, PCM array); convert to mono float32 at 16 kHz
            sample_rate, audio_data = audio
            audio_data = _to_mono_float32(audio_data)
            audio_data = _resample(audio_data, sample_rate, TARGET_SAMPLE_RATE)

            # Speech-to-text
            transcript = speech_to_text(audio_data)
            print(f"Transcribed: {transcript}")

            # Generate response (for simplicity, echo the transcript)
            response_text = transcript
            print(f"Response: {response_text}")

            # Text-to-speech
            response_audio = text_to_speech(response_text)

            # Save the response audio to a temporary file, read it back, then clean up
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                sf.write(temp_file.name, response_audio.cpu().numpy(), TARGET_SAMPLE_RATE)
                temp_filename = temp_file.name

            out_audio, out_sample_rate = sf.read(temp_filename)
            os.unlink(temp_filename)

            return (out_sample_rate, out_audio), f"You: {transcript}\nAssistant: {response_text}"

        audio_input.change(
            process_audio,
            inputs=[audio_input],
            outputs=[audio_output, transcript_display],
        )

        clear_btn = gr.Button("Clear Conversation")
        clear_btn.click(lambda: (None, ""), outputs=[audio_output, transcript_display])

    demo.launch()


if __name__ == "__main__":
    main()
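
# Note: the random `speaker_embeddings` defined near the top keeps this demo
# self-contained but sounds robotic. As a sketch (assuming the `datasets`
# package is installed and the "Matthijs/cmu-arctic-xvectors" dataset is
# available on the Hugging Face Hub), a real x-vector can be loaded and
# assigned to `speaker_embeddings` before launching the app, e.g.:
#
#     from datasets import load_dataset
#
#     xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#     speaker_embeddings = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0).to(device)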