# audio1test / app.py
# hashhac -- "embeddings added" (commit 36420ca, 3.84 kB)
import gradio as gr
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5ForSpeechToText
import soundfile as sf
import tempfile
import os
from datasets import load_dataset
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# SpeechT5 checkpoints. The processor is loaded from the ASR checkpoint and is
# shared by both directions (speech -> text and text -> speech).
_ASR_CHECKPOINT = "microsoft/speecht5_asr"
_TTS_CHECKPOINT = "microsoft/speecht5_tts"

processor = SpeechT5Processor.from_pretrained(_ASR_CHECKPOINT)

asr_model = SpeechT5ForSpeechToText.from_pretrained(_ASR_CHECKPOINT)
asr_model = asr_model.to(device)

tts_model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
tts_model = tts_model.to(device)

# Speaker embedding used to condition the TTS voice: entry 7306 of the
# validation split, with a leading batch dimension added.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
_xvector = embeddings_dataset[7306]["xvector"]
speaker_embeddings = torch.tensor(_xvector)[None].to(device)
# Function to convert speech to text
def speech_to_text(audio_dict, max_length=200):
    """Transcribe a mono waveform with the SpeechT5 ASR model.

    Args:
        audio_dict: Mapping with an ``"array"`` entry holding a 1-D waveform
            and, optionally, a ``"sampling_rate"`` entry in Hz. When the rate
            is missing the audio is taken to be 16 kHz already.
        max_length: Upper bound on the number of tokens generated for the
            transcription.

    Returns:
        The decoded transcription as a string, or ``""`` when the waveform
        contains no samples.
    """
    target_rate = 16000  # rate handed to the processor

    waveform = np.asarray(audio_dict["array"], dtype=np.float32)
    if waveform.size == 0:
        # Nothing to transcribe; avoid feeding an empty tensor to the model.
        return ""

    source_rate = audio_dict.get("sampling_rate") or target_rate
    if source_rate != target_rate:
        # Microphone captures commonly arrive at 44.1/48 kHz. Resample with
        # plain linear interpolation so no extra dependency is needed; this
        # has no anti-aliasing filter, which is a quality trade-off.
        n_in = waveform.shape[0]
        n_out = max(1, int(round(n_in * target_rate / source_rate)))
        src_times = np.arange(n_in) / source_rate
        out_times = np.arange(n_out) / target_rate
        waveform = np.interp(out_times, src_times, waveform).astype(np.float32)

    inputs = processor(
        audio=waveform,
        sampling_rate=target_rate,
        return_tensors="pt",
    ).to(device)

    # The ASR model is an encoder-decoder: text has to be produced with
    # autoregressive generation, not by taking an argmax over a single
    # forward pass that has no decoder inputs.
    with torch.no_grad():
        predicted_ids = asr_model.generate(**inputs, max_length=max_length)

    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
# Function to convert text to speech
_vocoder = None


def _get_vocoder():
    """Return the HiFi-GAN vocoder, loading it on first use and caching it."""
    global _vocoder
    if _vocoder is None:
        # Imported here so this block does not depend on the file-level
        # import list being extended.
        from transformers import SpeechT5HifiGan

        _vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    return _vocoder


def text_to_speech(text):
    """Synthesize speech for ``text`` with the SpeechT5 TTS model.

    Args:
        text: The sentence to speak.

    Returns:
        A 1-D ``torch.Tensor`` holding the generated waveform. Empty or
        whitespace-only text yields an empty CPU tensor without running the
        model.
    """
    if not text or not text.strip():
        # Nothing to say; skip the model rather than synthesizing noise.
        return torch.zeros(0)

    input_ids = processor(text=text, return_tensors="pt").input_ids.to(device)

    # Without a vocoder, generate_speech returns a mel spectrogram rather
    # than audio samples; passing the vocoder makes it return a waveform.
    with torch.no_grad():
        speech = tts_model.generate_speech(
            input_ids,
            speaker_embeddings=speaker_embeddings,
            vocoder=_get_vocoder(),
        )
    return speech
# Gradio demo
def demo():
    """Build the Gradio voice-echo interface and launch it.

    The UI records from the microphone, transcribes the recording, uses the
    transcript as the reply text, and plays the synthesized reply back.
    """
    output_rate = 16000  # sample rate reported for the synthesized reply

    def process_audio(audio):
        """Handle one recording.

        Args:
            audio: ``None`` or a ``(sample_rate, samples)`` tuple as delivered
                by the ``gr.Audio`` component with ``type="numpy"``.

        Returns:
            A ``(audio_output, conversation_text)`` pair, where
            ``audio_output`` is ``None`` or ``(sample_rate, waveform)``.
        """
        if audio is None:
            return None, "No audio detected."

        sample_rate, audio_data = audio
        samples = np.asarray(audio_data)
        if samples.size == 0:
            return None, "No audio detected."

        # Decide on scaling before any arithmetic changes the dtype: integer
        # PCM is mapped to [-1.0, 1.0); float input is left at its own scale.
        if np.issubdtype(samples.dtype, np.integer):
            scale = float(np.iinfo(samples.dtype).max) + 1.0
        else:
            scale = 1.0

        if samples.ndim > 1:
            # Multi-channel input is (samples, channels); average to mono
            # instead of flattening, which would interleave the channels.
            samples = samples.mean(axis=1)
        samples = samples.astype(np.float32) / scale

        # Speech-to-text
        transcript = speech_to_text({"array": samples, "sampling_rate": sample_rate})
        print(f"Transcribed: {transcript}")
        if not transcript or not transcript.strip():
            # Do not synthesize a reply for an empty transcription.
            return None, "No speech recognized."

        # Generate response (for simplicity, echo the transcript)
        response_text = transcript
        print(f"Response: {response_text}")

        # Text-to-speech. The waveform is handed to Gradio directly; a
        # temporary WAV round trip is unnecessary and left the file behind
        # whenever writing or reading failed.
        response_audio = text_to_speech(response_text)
        waveform = response_audio.cpu().numpy()

        return (output_rate, waveform), f"You: {transcript}\nAssistant: {response_text}"

    # The Blocks object gets its own name so it does not shadow this function.
    with gr.Blocks() as app:
        gr.Markdown("# Voice Chatbot")
        gr.Markdown("Simply speak into the microphone and get an audio response.")
        audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak")
        audio_output = gr.Audio(label="Response", autoplay=True)
        transcript_display = gr.Textbox(label="Conversation")

        audio_input.change(
            process_audio,
            inputs=[audio_input],
            outputs=[audio_output, transcript_display],
        )

        clear_btn = gr.Button("Clear Conversation")
        clear_btn.click(lambda: (None, ""), outputs=[audio_output, transcript_display])

    app.launch()


if __name__ == "__main__":
    demo()