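"""Voice chatbot demo.

Records microphone audio in the browser, transcribes it with SpeechT5 ASR,
echoes the transcript back through SpeechT5 TTS, and plays the reply.

Assumed dependencies (not pinned by the original):
    pip install gradio torch transformers datasets sentencepiece
"""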
import gradio as gr
import numpy as np
import torch
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5ForSpeechToText,
    SpeechT5HifiGan,
)
from datasets import load_dataset

# Check if CUDA is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load SpeechT5 processors and models (ASR and TTS use separate processors)
asr_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
asr_model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr").to(device)
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# SpeechT5 TTS requires a 512-dim speaker x-vector; use one from the
# CMU ARCTIC embeddings dataset, as in the model card example
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)

# Function to convert speech to text
def speech_to_text(audio):
    inputs = asr_processor(audio=audio, sampling_rate=16000, return_tensors="pt").to(device)
    with torch.no_grad():
        # SpeechT5ForSpeechToText is an encoder-decoder model: decode
        # autoregressively with generate() instead of argmax over CTC logits
        predicted_ids = asr_model.generate(**inputs, max_length=100)
    transcription = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

# Function to convert text to speech
def text_to_speech(text):
    input_ids = tts_processor(text=text, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():
        # generate_speech needs the speaker x-vector; the HiFi-GAN vocoder
        # converts the predicted spectrogram into a 16 kHz waveform
        speech = tts_model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)
    return speech

# Gradio demo
def demo():
    with gr.Blocks() as demo:
        gr.Markdown("# Voice Chatbot")
        gr.Markdown("Simply speak into the microphone and get an audio response.")
        
        audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak")
        audio_output = gr.Audio(label="Response", autoplay=True)
        transcript_display = gr.Textbox(label="Conversation")
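        # Note: the "Conversation" textbox shows only the latest exchange;
        # it is overwritten on each turn rather than appended to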
        
        def process_audio(audio):
            if audio is None:
                return None, "No audio detected."

            # gr.Audio(type="numpy") yields (sample_rate, int16 ndarray);
            # downmix to mono and normalize to float32 in [-1.0, 1.0]
            sample_rate, audio_data = audio
            audio_data = audio_data.astype(np.float32)
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)
            audio_data /= 32768.0

            # SpeechT5 expects 16 kHz input; resample by linear interpolation if needed
            if sample_rate != 16000:
                target_len = int(len(audio_data) * 16000 / sample_rate)
                audio_data = np.interp(
                    np.linspace(0, len(audio_data) - 1, target_len),
                    np.arange(len(audio_data)),
                    audio_data,
                )

            # Speech-to-text
            transcript = speech_to_text(audio_data)
            print(f"Transcribed: {transcript}")

            # Generate response (for simplicity, echo the transcript)
            response_text = transcript
            print(f"Response: {response_text}")

            # Text-to-speech: generate_speech returns a waveform tensor at 16 kHz,
            # which gr.Audio accepts directly as a (sample_rate, ndarray) tuple,
            # so no temporary WAV file round trip is needed
            response_audio = text_to_speech(response_text)

            return (16000, response_audio.cpu().numpy()), f"You: {transcript}\nAssistant: {response_text}"
        
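        # .change fires both when a recording finishes and when the input is
        # cleared (audio becomes None), which process_audio guards against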
        audio_input.change(process_audio, 
                           inputs=[audio_input], 
                           outputs=[audio_output, transcript_display])
        
        clear_btn = gr.Button("Clear Conversation")
        clear_btn.click(lambda: (None, ""), outputs=[audio_output, transcript_display])
    
    demo.launch()

if __name__ == "__main__":
    demo()