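"""Voice chatbot demo.

Records microphone audio in a Gradio Blocks app, transcribes it with Whisper,
echoes the transcript back as the response text, and synthesizes the reply with
SpeechT5 plus the HiFi-GAN vocoder.
"""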
import gradio as gr
import numpy as np
import torch
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf
import tempfile
import os
# Check if CUDA is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load Whisper for ASR
print("Loading ASR model...")
asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)
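# NOTE: whisper-small is the multilingual checkpoint; for English-only input,
# openai/whisper-small.en is a drop-in alternative.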
# Load SpeechT5 for TTS
print("Loading TTS model...")
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# Load the SpeechT5 HiFi-GAN vocoder (converts generated spectrograms to waveforms)
print("Loading vocoder...")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
# Load speaker embeddings for TTS
print("Loading speaker embeddings...")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
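# NOTE: index 7306 mirrors the Hugging Face SpeechT5 example; substituting another
# row's "xvector" from the dataset changes the synthesized voice.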
# Function to convert speech to text using Whisper
def speech_to_text(audio_data, sample_rate):
    # Gradio records int16 PCM by default; scale integer samples to float32 in [-1, 1]
    audio_data = audio_data.flatten()
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / 32768.0
    else:
        audio_data = audio_data.astype(np.float32)
    # Process with Whisper (the pipeline resamples from sample_rate to the model's 16 kHz)
    result = asr_pipeline({"raw": audio_data, "sampling_rate": sample_rate})
    return result["text"]
# Function to convert text to speech using SpeechT5
def text_to_speech(text):
    # Tokenize the input text
    inputs = tts_processor(text=text, return_tensors="pt").to(device)
    # Generate a mel spectrogram conditioned on the speaker embedding
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=speaker_embeddings,
        )
        # Convert the spectrogram to a waveform using the HiFi-GAN vocoder
        waveform = vocoder(speech)
    return waveform
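# NOTE: generate_speech() also accepts a vocoder= argument, which folds the two steps
# above into one call, e.g. (equivalent sketch):
#     speech = tts_model.generate_speech(
#         inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder
#     )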
# Gradio demo
def demo():
    with gr.Blocks() as demo:
        gr.Markdown("# Voice Chatbot")
        gr.Markdown("Simply speak into the microphone and get an audio response.")
        audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak")
        audio_output = gr.Audio(label="Response", autoplay=True)
        transcript_display = gr.Textbox(label="Conversation")
        def process_audio(audio):
            if audio is None:
                return None, "No audio detected."
            try:
                # Get audio data
                sample_rate, audio_data = audio
                # Speech-to-text
                transcript = speech_to_text(audio_data, sample_rate)
                print(f"Transcribed: {transcript}")
                # Generate response (for simplicity, echo the transcript)
                response_text = transcript
                print(f"Response: {response_text}")
                # Text-to-speech
                response_audio = text_to_speech(response_text)
                # Save the response audio to a temporary file
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                    # Ensure audio is properly scaled
                    audio_np = response_audio.cpu().numpy()
                    # Normalize audio to avoid clipping
                    audio_np = audio_np / (np.max(np.abs(audio_np)) + 1e-8) * 0.9
                    sf.write(temp_file.name, audio_np, 16000)
                    temp_filename = temp_file.name
                # Read the audio file
                audio_data, sample_rate = sf.read(temp_filename)
                # Clean up the temporary file
                os.unlink(temp_filename)
                return (sample_rate, audio_data), f"You: {transcript}\nAssistant: {response_text}"
            except Exception as e:
                print(f"Error in process_audio: {e}")
                import traceback
                traceback.print_exc()
                return None, f"Error processing audio: {str(e)}"
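        # NOTE: process_audio round-trips the waveform through a temporary WAV so the
        # handler returns a plain (sample_rate, ndarray) tuple for gr.Audio. An equivalent
        # shortcut (SpeechT5's HiFi-GAN vocoder produces 16 kHz audio) would be:
        #     return (16000, audio_np), f"You: {transcript}\nAssistant: {response_text}"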
        audio_input.change(
            process_audio,
            inputs=[audio_input],
            outputs=[audio_output, transcript_display],
        )
        clear_btn = gr.Button("Clear Conversation")
        clear_btn.click(lambda: (None, ""), outputs=[audio_output, transcript_display])

    demo.launch()
if __name__ == "__main__":
    demo()