hashhac committed
Commit ab25fef · 1 Parent(s): ca1dafb

added sound outputs

Files changed (1):
  1. app.py +46 -28
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import numpy as np
 import torch
-from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech
+from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from datasets import load_dataset
 import soundfile as sf
 import tempfile
@@ -10,7 +10,7 @@ import os
 # Check if CUDA is available, otherwise use CPU
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load Whisper for ASR (much more reliable than SpeechT5 for ASR)
+# Load Whisper for ASR
 print("Loading ASR model...")
 asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)
 
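Note: process_audio below calls a speech_to_text() helper whose body never appears in this diff. A minimal sketch of how such a helper could feed Gradio microphone audio to the Whisper pipeline; the int16 scaling and stereo downmix are assumptions about Gradio's recording format, not part of this commit:

    def speech_to_text(audio_data, sample_rate):
        # Whisper expects mono float32 in [-1, 1]; Gradio mics record int16 PCM (assumption)
        audio = audio_data.astype(np.float32)
        if audio_data.dtype == np.int16:
            audio /= 32768.0
        if audio.ndim > 1:
            audio = audio.mean(axis=1)  # downmix stereo to mono
        result = asr_pipeline({"sampling_rate": sample_rate, "raw": audio})
        return result["text"].strip()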
@@ -19,6 +19,10 @@ print("Loading TTS model...")
 tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 
+# Load SpeechT5 vocoder (THIS WAS MISSING)
+print("Loading vocoder...")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+
 # Load speaker embeddings for TTS
 print("Loading speaker embeddings...")
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
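Note: the hunk ends at load_dataset, before a specific x-vector is selected, so the commit's actual choice is not visible here. The usual pattern from the Hugging Face SpeechT5 example (row 7306 is the voice the docs use; any row works):

    # one 512-dim x-vector, with a batch dimension, on the model's device
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)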
@@ -44,8 +48,11 @@ def text_to_speech(text):
         inputs["input_ids"],
         speaker_embeddings=speaker_embeddings
     )
+
+    # Convert spectrogram to waveform using vocoder
+    waveform = vocoder(speech)
 
-    return speech
+    return waveform
 
 # Gradio demo
 def demo():
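Note: this hunk is the actual fix the commit message refers to. SpeechT5ForTextToSpeech produces a mel spectrogram, not audio, so the old code returned something soundfile could not meaningfully write; the HiFi-GAN vocoder turns it into a waveform. An alternative to the explicit vocoder(speech) call is to hand the vocoder to generate_speech, which then returns the waveform directly. A sketch of that variant, reconstructing the surrounding function from the context lines (the commit itself keeps the two-step version):

    def text_to_speech(text):
        inputs = tts_processor(text=text, return_tensors="pt").to(device)
        with torch.no_grad():
            # with vocoder= set, generate_speech returns a waveform, not a spectrogram
            waveform = tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder,
            )
        return waveform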
@@ -61,32 +68,43 @@ def demo():
         if audio is None:
             return None, "No audio detected."
 
-        # Get audio data
-        sample_rate, audio_data = audio
-
-        # Speech-to-text
-        transcript = speech_to_text(audio_data, sample_rate)
-        print(f"Transcribed: {transcript}")
-
-        # Generate response (for simplicity, echo the transcript)
-        response_text = transcript
-        print(f"Response: {response_text}")
-
-        # Text-to-speech
-        response_audio = text_to_speech(response_text)
-
-        # Save the response audio to a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-            sf.write(temp_file.name, response_audio.cpu().numpy(), 16000)
-            temp_filename = temp_file.name
-
-        # Read the audio file
-        audio_data, sample_rate = sf.read(temp_filename)
-
-        # Clean up the temporary file
-        os.unlink(temp_filename)
+        try:
+            # Get audio data
+            sample_rate, audio_data = audio
+
+            # Speech-to-text
+            transcript = speech_to_text(audio_data, sample_rate)
+            print(f"Transcribed: {transcript}")
+
+            # Generate response (for simplicity, echo the transcript)
+            response_text = transcript
+            print(f"Response: {response_text}")
+
+            # Text-to-speech
+            response_audio = text_to_speech(response_text)
+
+            # Save the response audio to a temporary file
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                # Ensure audio is properly scaled
+                audio_np = response_audio.cpu().numpy()
+                # Normalize audio to avoid clipping
+                audio_np = audio_np / (np.max(np.abs(audio_np)) + 1e-8) * 0.9
+                sf.write(temp_file.name, audio_np, 16000)
+                temp_filename = temp_file.name
+
+            # Read the audio file
+            audio_data, sample_rate = sf.read(temp_filename)
+
+            # Clean up the temporary file
+            os.unlink(temp_filename)
+
+            return (sample_rate, audio_data), f"You: {transcript}\nAssistant: {response_text}"
 
-        return (sample_rate, audio_data), f"You: {transcript}\nAssistant: {response_text}"
+        except Exception as e:
+            print(f"Error in process_audio: {e}")
+            import traceback
+            traceback.print_exc()
+            return None, f"Error processing audio: {str(e)}"
 
     audio_input.change(process_audio,
                        inputs=[audio_input],
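Note: the normalization added in the last hunk is plain peak normalization: scale the waveform so its loudest sample sits at 0.9 of full scale, with a small epsilon so silent output cannot divide by zero. As a standalone helper it would read:

    def peak_normalize(x, headroom=0.9, eps=1e-8):
        # scale so max(|x|) == headroom; eps guards against all-zero audio
        return x / (np.max(np.abs(x)) + eps) * headroom

The write-then-reread temp-file round trip could likely be dropped as well, since gr.Audio accepts a (sample_rate, numpy_array) tuple directly; the commit keeps it, so it is kept above unchanged.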