import gradio as gr
import numpy as np
import torch
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf
import tempfile
import os
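# Assumed extra dependencies (requirements.txt sketch): sentencepiece is needed
# by the SpeechT5 tokenizer, and torchaudio is used by the ASR pipeline to
# resample raw audio that is not already 16 kHz.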
# Check if CUDA is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load Whisper for ASR
print("Loading ASR model...")
asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)
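# Whisper is trained on 16 kHz audio; when given a {"raw", "sampling_rate"} dict,
# the pipeline resamples to 16 kHz internally if the rates differ.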
# Load SpeechT5 for TTS
print("Loading TTS model...")
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# Load the HiFi-GAN vocoder that turns SpeechT5 spectrograms into waveforms
print("Loading vocoder...")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
# Load speaker embeddings for TTS
print("Loading speaker embeddings...")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
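# Index 7306 is the x-vector used in the SpeechT5 examples (a US English female
# voice from CMU ARCTIC); any other row of the dataset also works.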
# Function to convert speech to text using Whisper
def speech_to_text(audio_data, sample_rate):
    # Downmix stereo to mono (flattening would interleave the channels)
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    # Normalize int16 PCM from Gradio to float32 in [-1, 1]
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0
    else:
        audio_data = audio_data.astype(np.float32)
    # Process with Whisper
    result = asr_pipeline({"raw": audio_data, "sampling_rate": sample_rate})
    return result["text"]
# Function to convert text to speech using SpeechT5
def text_to_speech(text):
    # Process text input
    inputs = tts_processor(text=text, return_tensors="pt").to(device)
    # Generate speech with speaker embeddings
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=speaker_embeddings,
        )
        # Convert spectrogram to waveform using vocoder
        waveform = vocoder(speech)
    return waveform
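# Note: generate_speech can also apply the vocoder itself, e.g.
#   waveform = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
# which returns the waveform directly instead of a spectrogram.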
# Gradio demo
def demo():
    with gr.Blocks() as demo:
        gr.Markdown("# Voice Chatbot")
        gr.Markdown("Simply speak into the microphone and get an audio response.")
        audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak")
        audio_output = gr.Audio(label="Response", autoplay=True)
        transcript_display = gr.Textbox(label="Conversation")
        def process_audio(audio):
            if audio is None:
                return None, "No audio detected."
            try:
                # Gradio delivers a (sample_rate, numpy array) tuple
                sample_rate, audio_data = audio
                # Speech-to-text
                transcript = speech_to_text(audio_data, sample_rate)
                print(f"Transcribed: {transcript}")
                # Generate response (for simplicity, echo the transcript)
                response_text = transcript
                print(f"Response: {response_text}")
                # Text-to-speech
                response_audio = text_to_speech(response_text)
                # Save the response audio to a temporary file
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                    audio_np = response_audio.cpu().numpy()
                    # Normalize audio to avoid clipping
                    audio_np = audio_np / (np.max(np.abs(audio_np)) + 1e-8) * 0.9
                    # SpeechT5 generates 16 kHz audio
                    sf.write(temp_file.name, audio_np, 16000)
                    temp_filename = temp_file.name
                # Read the audio file back, then clean up the temporary file
                audio_data, sample_rate = sf.read(temp_filename)
                os.unlink(temp_filename)
                return (sample_rate, audio_data), f"You: {transcript}\nAssistant: {response_text}"
            except Exception as e:
                print(f"Error in process_audio: {e}")
                import traceback
                traceback.print_exc()
                return None, f"Error processing audio: {str(e)}"
        audio_input.change(
            process_audio,
            inputs=[audio_input],
            outputs=[audio_output, transcript_display],
        )
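        # Note: on recent Gradio versions, audio_input.stop_recording(...) can be
        # used instead of .change() so the handler fires only when recording ends.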
        clear_btn = gr.Button("Clear Conversation")
        clear_btn.click(lambda: (None, ""), outputs=[audio_output, transcript_display])
    demo.launch()
if __name__ == "__main__":
    demo()