hashhac committed on
Commit 289ad8b · 1 Parent(s): 3931f99
Files changed (2):
  1. app.py +77 -66
  2. requirements.txt +4 -1
app.py CHANGED
@@ -6,16 +6,16 @@ import gradio as gr
 import numpy as np
 import torch
 import os
+import tempfile
 from transformers import (
     AutoModelForSpeechSeq2Seq,
     AutoProcessor,
     pipeline,
     AutoTokenizer,
-    AutoModelForCausalLM,
-    AutoModelForSeq2SeqLM
+    AutoModelForCausalLM
 )
-from datasets import load_dataset
-import scipy
+from gtts import gTTS
+from scipy.io import wavfile

 # Check if CUDA is available, otherwise use CPU
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -23,7 +23,7 @@ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

 # Step 1: Audio transcription with Whisper
 def load_asr_model():
-    model_id = "openai/whisper-small"  # Smaller version that's more efficient
+    model_id = "openai/whisper-small"

     model = AutoModelForSpeechSeq2Seq.from_pretrained(
         model_id,
@@ -50,7 +50,7 @@ def load_asr_model():

 # Step 2: Text generation with a smaller LLM
 def load_llm_model():
-    model_id = "facebook/opt-1.3b"  # A smaller language model
+    model_id = "facebook/opt-1.3b"

     tokenizer = AutoTokenizer.from_pretrained(model_id)
     model = AutoModelForCausalLM.from_pretrained(
@@ -62,64 +62,50 @@ def load_llm_model():

     return model, tokenizer

-# Step 3: Text-to-Speech with a free model
-# Step 3: Text-to-Speech with a free model
-def load_tts_model():
-    # Import the specific SpeechT5 classes
-    from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-
-    model_id = "microsoft/speecht5_tts"
-    processor = SpeechT5Processor.from_pretrained(model_id)
-    model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
-    model.to(device)
-
-    # Load vocoder for waveform generation
-    vocoder_id = "microsoft/speecht5_hifigan"
-    vocoder = SpeechT5HifiGan.from_pretrained(vocoder_id)
-    vocoder.to(device)
-
-    # Load speaker embeddings
-    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-    speaker_embeddings = torch.tensor(embeddings_dataset[7]["xvector"]).unsqueeze(0)
-
-    return model, processor, vocoder, speaker_embeddings
-
-def text_to_speech(text):
-    # Prepare inputs
-    inputs = tts_processor(text=text, return_tensors="pt")
-
-    # Generate speech with SpeechT5
-    with torch.no_grad():
-        # Convert speaker embeddings to correct dtype and move to device
-        speaker_embeddings_device = speaker_embeddings.to(device).to(torch_dtype)
-
-        # Generate speech
-        speech = tts_model.generate_speech(
-            inputs["input_ids"].to(device),
-            speaker_embeddings_device,
-            vocoder=tts_vocoder
-        )
-
-    # Convert to numpy array
-    audio_array = speech.cpu().numpy().astype(np.float32)
-
-    # Normalize the audio
-    audio_array = audio_array / np.max(np.abs(audio_array) + 1e-6)
-
-    audio_array = audio_array.reshape(1, -1).astype(np.float32)
-
-    return (16000, audio_array)  # SpeechT5 uses 16kHz sample rate
-
-# Initialize all models
+# Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)
+def gtts_text_to_speech(text):
+    # Create a temporary file
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
+        tmp_filename = f.name
+
+    # Use gTTS to convert text to speech
+    tts = gTTS(text=text, lang='en', slow=False)
+
+    # Save as MP3 first (gTTS only outputs MP3)
+    mp3_filename = tmp_filename.replace('.wav', '.mp3')
+    tts.save(mp3_filename)
+
+    # Convert MP3 to WAV using FFmpeg if available, otherwise use a fallback
+    try:
+        import subprocess
+        subprocess.run(['ffmpeg', '-i', mp3_filename, '-acodec', 'pcm_s16le', '-ar', '24000', '-ac', '1', tmp_filename],
+                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except (ImportError, FileNotFoundError):
+        # Fallback if FFmpeg is not available
+        from pydub import AudioSegment
+        sound = AudioSegment.from_mp3(mp3_filename)
+        sound = sound.set_frame_rate(24000).set_channels(1)
+        sound.export(tmp_filename, format="wav")
+
+    # Read the WAV file
+    sample_rate, audio_data = wavfile.read(tmp_filename)
+
+    # Clean up temporary files
+    os.remove(mp3_filename)
+    os.remove(tmp_filename)
+
+    # Convert to expected format
+    audio_data = audio_data.reshape(1, -1).astype(np.int16)
+
+    return (sample_rate, audio_data)
+
+# Initialize models
 print("Loading ASR model...")
 asr_pipeline = load_asr_model()

 print("Loading LLM model...")
 llm_model, llm_tokenizer = load_llm_model()

-print("Loading TTS model...")
-tts_model, tts_processor, tts_vocoder, speaker_embeddings = load_tts_model()
-
 # Chat history management
 chat_history = []

@@ -167,21 +153,31 @@ def generate_response(prompt):

     return response_text

-
-
 def response(audio: tuple[int, np.ndarray]):
-    # Step 1: Speech-to-Text
-    transcript = asr_pipeline({"sampling_rate": audio[0], "raw": audio[1].flatten()})
+    # Step 1: Convert audio to float32 before passing to ASR
+    sample_rate, audio_data = audio
+
+    # Convert int16 audio to float32
+    audio_float32 = audio_data.flatten().astype(np.float32) / 32768.0  # Normalize to [-1.0, 1.0]
+
+    # Speech-to-Text with correct data type
+    transcript = asr_pipeline({
+        "sampling_rate": sample_rate,
+        "raw": audio_float32
+    })
+
     prompt = transcript["text"]
+    print(f"Transcribed: {prompt}")

     # Step 2: Generate text response
     response_text = generate_response(prompt)
+    print(f"Response: {response_text}")

-    # Step 3: Text-to-Speech
-    sample_rate, audio_array = text_to_speech(response_text)
+    # Step 3: Text-to-Speech using gTTS
+    sample_rate, audio_array = gtts_text_to_speech(response_text)

-    # Convert to expected format
-    chunk_size = 4800  # 200ms chunks at 24kHz
+    # Convert to expected format and yield chunks
+    chunk_size = int(sample_rate * 0.2)  # 200ms chunks
     for i in range(0, audio_array.shape[1], chunk_size):
         chunk = audio_array[:, i:i+chunk_size]
         if chunk.size > 0:  # Ensure we don't yield empty chunks
@@ -205,14 +201,22 @@ def demo():
             return None

         sample_rate, audio_array = audio
-        transcript = asr_pipeline({"sampling_rate": sample_rate, "raw": audio_array.flatten()})
+
+        # Convert to float32 for ASR
+        audio_float32 = audio_array.flatten().astype(np.float32) / 32768.0
+
+        transcript = asr_pipeline({
+            "sampling_rate": sample_rate,
+            "raw": audio_float32
+        })
+
         prompt = transcript["text"]
         print(f"Transcribed: {prompt}")

         response_text = generate_response(prompt)
         print(f"Response: {response_text}")

-        sample_rate, audio_array = text_to_speech(response_text)
+        sample_rate, audio_array = gtts_text_to_speech(response_text)
         return (sample_rate, audio_array[0])

     audio_input.change(process_audio, inputs=[audio_input], outputs=[audio_output])
@@ -224,5 +228,12 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--demo", action="store_true", help="Run Gradio demo instead of WebRTC")
     args = parser.parse_args()
-    # would be faster with webRTC but needs to intialize the model to get it to work
+    # hugging face issues
     demo()
+
+    # if args.demo:
+    #     demo()
+    # else:
+    #     # For running with FastRTC
+    #     # You would need to add your FastRTC server code here
+    #     pass
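The new TTS path in this commit can be exercised on its own. Below is a minimal, self-contained sketch of the gTTS-to-numpy route the app now uses, followed by the same int16-to-float32 normalization it applies to microphone audio before Whisper. It assumes gtts, pydub, and scipy are installed, ffmpeg is on PATH for MP3 decoding, and network access to the Google TTS endpoint is available; the file names and the 24 kHz rate are illustrative, not part of the app.

```python
# Standalone sketch of the gTTS -> WAV -> numpy path (mirrors gtts_text_to_speech),
# plus the int16 -> float32 normalization applied before the Whisper pipeline.
# Assumptions: gtts, pydub, scipy installed; ffmpeg available for MP3 decoding;
# network access for the gTTS request. File names and 24 kHz are illustrative.
import numpy as np
from gtts import gTTS
from pydub import AudioSegment
from scipy.io import wavfile

# 1) Synthesize speech to MP3 (gTTS only emits MP3).
gTTS(text="Hello from the voice assistant.", lang="en").save("reply.mp3")

# 2) Decode to mono 24 kHz WAV so downstream code sees a fixed format.
sound = AudioSegment.from_mp3("reply.mp3").set_frame_rate(24000).set_channels(1)
sound.export("reply.wav", format="wav")

# 3) Read back as int16 samples, shaped (1, num_samples) like the app expects.
sample_rate, audio_int16 = wavfile.read("reply.wav")
audio_int16 = audio_int16.reshape(1, -1).astype(np.int16)

# 4) The normalization the commit applies to int16 audio before ASR.
audio_float32 = audio_int16.flatten().astype(np.float32) / 32768.0

# 5) 200 ms chunks at the actual sample rate (4800 samples at 24 kHz).
chunk_size = int(sample_rate * 0.2)
chunks = [audio_int16[:, i:i + chunk_size]
          for i in range(0, audio_int16.shape[1], chunk_size)]
print(sample_rate, audio_float32.shape, len(chunks))
```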
requirements.txt CHANGED
@@ -7,4 +7,7 @@ gradio
 accelerate
 sentencepiece
 fastrtc[vad,tts]
-torchaudio
+torchaudio
+gtts
+pydub
+scipy
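A quick way to sanity-check the newly added dependencies before launching the app is sketched below; it is purely illustrative and not part of the commit. The shutil.which call only probes for the ffmpeg binary, which both the direct subprocess path and the pydub fallback rely on for MP3 decoding.

```python
# Environment check for the dependencies added in this commit (illustrative only).
import shutil

import gtts              # Google Text-to-Speech client (needs network at runtime)
import pydub             # MP3 -> WAV conversion fallback
import scipy.io.wavfile  # reads the converted WAV back as a numpy array

# pydub also needs an MP3 decoder, so ffmpeg (or avconv) should be on PATH.
print("ffmpeg found:", shutil.which("ffmpeg") is not None)
print("gtts, pydub, scipy import OK")
```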