hashhac committed
Commit ab25fef · 1 Parent(s): ca1dafb

added sound outputs

Files changed (1):
  1. app.py +46 -28
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import numpy as np
 import torch
-from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech
+from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from datasets import load_dataset
 import soundfile as sf
 import tempfile
@@ -10,7 +10,7 @@ import os
 # Check if CUDA is available, otherwise use CPU
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load Whisper for ASR (much more reliable than SpeechT5 for ASR)
+# Load Whisper for ASR
 print("Loading ASR model...")
 asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)
 
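Note: process_audio below calls a speech_to_text() helper whose body never appears in this diff. A minimal sketch of how such a helper could feed Gradio microphone audio to the Whisper pipeline; the int16 scaling and stereo downmix are assumptions about Gradio's recording format, not part of this commit:

    def speech_to_text(audio_data, sample_rate):
        # Whisper expects mono float32 in [-1, 1]; Gradio mics record int16 PCM (assumption)
        audio = audio_data.astype(np.float32)
        if audio_data.dtype == np.int16:
            audio /= 32768.0
        if audio.ndim > 1:
            audio = audio.mean(axis=1)  # downmix stereo to mono
        result = asr_pipeline({"sampling_rate": sample_rate, "raw": audio})
        return result["text"].strip()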
@@ -19,6 +19,10 @@ print("Loading TTS model...")
 tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 
+# Load SpeechT5 vocoder (THIS WAS MISSING)
+print("Loading vocoder...")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
 # Load speaker embeddings for TTS
 print("Loading speaker embeddings...")
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
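Note: the hunk ends at load_dataset, before a specific x-vector is selected, so the commit's actual choice is not visible here. The usual pattern from the Hugging Face SpeechT5 example (row 7306 is the voice the docs use; any row works):

    # one 512-dim x-vector, with a batch dimension, on the model's device
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)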
@@ -44,8 +48,11 @@ def text_to_speech(text):
         inputs["input_ids"],
         speaker_embeddings=speaker_embeddings
     )
+
+    # Convert spectrogram to waveform using vocoder
+    waveform = vocoder(speech)
 
-    return speech
+    return waveform
 
 # Gradio demo
 def demo():
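Note: this hunk is the actual fix the commit message refers to. SpeechT5ForTextToSpeech produces a mel spectrogram, not audio, so the old code returned something soundfile could not meaningfully write; the HiFi-GAN vocoder turns it into a waveform. An alternative to the explicit vocoder(speech) call is to hand the vocoder to generate_speech, which then returns the waveform directly. A sketch of that variant, reconstructing the surrounding function from the context lines (the commit itself keeps the two-step version):

    def text_to_speech(text):
        inputs = tts_processor(text=text, return_tensors="pt").to(device)
        with torch.no_grad():
            # with vocoder= set, generate_speech returns a waveform, not a spectrogram
            waveform = tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder,
            )
        return waveform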
@@ -61,32 +68,43 @@ def demo():
         if audio is None:
             return None, "No audio detected."
 
-        # Get audio data
-        sample_rate, audio_data = audio
-
-        # Speech-to-text
-        transcript = speech_to_text(audio_data, sample_rate)
-        print(f"Transcribed: {transcript}")
-
-        # Generate response (for simplicity, echo the transcript)
-        response_text = transcript
-        print(f"Response: {response_text}")
-
-        # Text-to-speech
-        response_audio = text_to_speech(response_text)
-
-        # Save the response audio to a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-            sf.write(temp_file.name, response_audio.cpu().numpy(), 16000)
-            temp_filename = temp_file.name
-
-        # Read the audio file
-        audio_data, sample_rate = sf.read(temp_filename)
-
-        # Clean up the temporary file
-        os.unlink(temp_filename)
+        try:
+            # Get audio data
+            sample_rate, audio_data = audio
+
+            # Speech-to-text
+            transcript = speech_to_text(audio_data, sample_rate)
+            print(f"Transcribed: {transcript}")
+
+            # Generate response (for simplicity, echo the transcript)
+            response_text = transcript
+            print(f"Response: {response_text}")
+
+            # Text-to-speech
+            response_audio = text_to_speech(response_text)
+
+            # Save the response audio to a temporary file
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                # Ensure audio is properly scaled
+                audio_np = response_audio.cpu().numpy()
+                # Normalize audio to avoid clipping
+                audio_np = audio_np / (np.max(np.abs(audio_np)) + 1e-8) * 0.9
+                sf.write(temp_file.name, audio_np, 16000)
+                temp_filename = temp_file.name
+
+            # Read the audio file
+            audio_data, sample_rate = sf.read(temp_filename)
+
+            # Clean up the temporary file
+            os.unlink(temp_filename)
+
+            return (sample_rate, audio_data), f"You: {transcript}\nAssistant: {response_text}"
 
-        return (sample_rate, audio_data), f"You: {transcript}\nAssistant: {response_text}"
+        except Exception as e:
+            print(f"Error in process_audio: {e}")
+            import traceback
+            traceback.print_exc()
+            return None, f"Error processing audio: {str(e)}"
 
     audio_input.change(process_audio,
                        inputs=[audio_input],
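Note: the normalization added in the last hunk is plain peak normalization: scale the waveform so its loudest sample sits at 0.9 of full scale, with a small epsilon so silent output cannot divide by zero. As a standalone helper it would read:

    def peak_normalize(x, headroom=0.9, eps=1e-8):
        # scale so max(|x|) == headroom; eps guards against all-zero audio
        return x / (np.max(np.abs(x)) + eps) * headroom

The write-then-reread temp-file round trip could likely be dropped as well, since gr.Audio accepts a (sample_rate, numpy_array) tuple directly; the commit keeps it, so it is kept above unchanged.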