hashhac committed
Commit dbf60e3 · 1 Parent(s): de7876c
Files changed (2):
  1. app.py +144 -55
  2. requirements.txt +2 -1
app.py CHANGED
@@ -52,32 +52,38 @@ def load_asr_model():
 def load_llm_model():
     model_id = "facebook/opt-1.3b"
 
+    # First load the tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    # Check if pad_token is None or if pad_token is the same as eos_token
-    needs_pad_token = (tokenizer.pad_token is None or
-                       (tokenizer.pad_token == tokenizer.eos_token))
+    # Print current token configuration
+    print(f"Initial pad token ID: {tokenizer.pad_token_id}, EOS token ID: {tokenizer.eos_token_id}")
 
-    if needs_pad_token:
-        # Use a different special token as padding token
-        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-        print(f"Changed pad token from {tokenizer.pad_token} to [PAD], different from EOS token: {tokenizer.eos_token}")
-
-        # Resize the token embeddings since we added a new token
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            low_cpu_mem_usage=True
-        )
+    # Load the model first
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=True
+    )
+
+    # Set pad token if needed
+    if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
+        # Add a new special token as padding token
+        special_tokens = {'pad_token': '[PAD]'}
+        num_added = tokenizer.add_special_tokens(special_tokens)
+
+        # Must resize the token embeddings when adding tokens
         model.resize_token_embeddings(len(tokenizer))
+
+        # Update the model's config to explicitly set the pad token ID
+        model.config.pad_token_id = tokenizer.pad_token_id
+
+        print(f"Added pad token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+        print(f"Different from EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
     else:
-        print(f"Pad token ({tokenizer.pad_token}) is already different from EOS token ({tokenizer.eos_token})")
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch_dtype,
-            low_cpu_mem_usage=True
-        )
+        print(f"Pad token already set: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
+        print(f"EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
 
+    # Move model to the right device
     model.to(device)
 
     return model, tokenizer
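The pad-token handling above is easy to get subtly wrong, so it is worth exercising in isolation. A minimal sketch of the same pattern, assuming transformers and torch are installed and substituting the lighter facebook/opt-125m checkpoint for opt-1.3b:

# Minimal sketch of the pad-token pattern from the hunk above
# (assumed setup: pip install transformers torch; opt-125m is a stand-in).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # The embedding matrix must grow to cover the new token ID
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id

assert tokenizer.pad_token_id != tokenizer.eos_token_id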
@@ -85,72 +91,150 @@ def load_llm_model():
 # Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)
 def gtts_text_to_speech(text):
     """Convert text to speech using gTTS and ensure proper WAV format."""
-    # Create temporary files
-    mp3_fd, mp3_filename = tempfile.mkstemp(suffix='.mp3')
-    os.close(mp3_fd)
-
-    wav_fd, wav_filename = tempfile.mkstemp(suffix='.wav')
-    os.close(wav_fd)
+    # Create absolute paths for temporary files
+    temp_dir = tempfile.gettempdir()
+    mp3_filename = os.path.join(temp_dir, f"tts_temp_{os.getpid()}_{time.time()}.mp3")
+    wav_filename = os.path.join(temp_dir, f"tts_temp_{os.getpid()}_{time.time()}.wav")
 
     try:
-        # Use gTTS to convert text to speech
+        # Make sure text is not empty
+        if not text or text.isspace():
+            text = "I don't have a response for that."
+
+        # Create gTTS object and save to MP3
         tts = gTTS(text=text, lang='en', slow=False)
         tts.save(mp3_filename)
 
-        # Convert MP3 to WAV - preferred method with ffmpeg
+        print(f"MP3 file created: {mp3_filename}, size: {os.path.getsize(mp3_filename)}")
+
+        # Try multiple methods to convert MP3 to WAV
+        wav_created = False
+
+        # Method 1: Try ffmpeg (most reliable)
         try:
             import subprocess
+            cmd = ['ffmpeg', '-y', '-i', mp3_filename, '-acodec', 'pcm_s16le', '-ar', '24000', '-ac', '1', wav_filename]
+            print(f"Running ffmpeg command: {' '.join(cmd)}")
+
             result = subprocess.run(
-                ['ffmpeg', '-y', '-i', mp3_filename, '-acodec', 'pcm_s16le', '-ar', '24000', '-ac', '1', wav_filename],
-                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
                 check=True
             )
 
-        except (ImportError, FileNotFoundError, subprocess.CalledProcessError):
-            # Fallback if FFmpeg is not available or fails
-            from pydub import AudioSegment
-            sound = AudioSegment.from_mp3(mp3_filename)
-            sound = sound.set_frame_rate(24000).set_channels(1)
-            sound.export(wav_filename, format="wav")
+            if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                print(f"WAV file successfully created with ffmpeg: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                wav_created = True
+            else:
+                print(f"ffmpeg ran but WAV file is missing or too small: {wav_filename}")
+
+        except Exception as e:
+            print(f"ffmpeg conversion failed: {str(e)}")
 
-        # Verify the WAV file exists and has size
-        if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 0:
-            # Read the WAV file with scipy
+        # Method 2: Try pydub if ffmpeg failed
+        if not wav_created:
             try:
+                from pydub import AudioSegment
+                print("Converting MP3 to WAV using pydub...")
+                sound = AudioSegment.from_mp3(mp3_filename)
+                sound = sound.set_frame_rate(24000).set_channels(1)
+                sound.export(wav_filename, format="wav")
+
+                if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                    print(f"WAV file successfully created with pydub: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                    wav_created = True
+                else:
+                    print(f"pydub ran but WAV file is missing or too small")
+
+            except Exception as e:
+                print(f"pydub conversion failed: {str(e)}")
+
+        # Method 3: Direct WAV creation with gTTS-like library (last resort)
+        if not wav_created:
+            try:
+                import numpy as np
+                from scipy.io import wavfile
+
+                print("Generating synthetic speech directly...")
+                # Generate a simple speech-like tone pattern
+                sample_rate = 24000
+                duration = len(text) * 0.075  # Approx timing
+                t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
+
+                # Create a speech-like tone with some variation
+                frequencies = [220, 440, 330, 550]
+                audio = np.zeros_like(t)
+                for i, freq in enumerate(frequencies):
+                    audio += 0.2 * np.sin(2 * np.pi * freq * t + i)
+
+                # Add some envelope
+                envelope = np.ones_like(t)
+                attack = int(0.01 * sample_rate)
+                release = int(0.1 * sample_rate)
+                envelope[:attack] = np.linspace(0, 1, attack)
+                envelope[-release:] = np.linspace(1, 0, release)
+                audio = audio * envelope
+
+                # Normalize and convert to int16
+                audio = audio / np.max(np.abs(audio))
+                audio = (audio * 32767).astype(np.int16)
+
+                # Save as WAV
+                wavfile.write(wav_filename, sample_rate, audio)
+
+                if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 100:
+                    print(f"WAV file successfully created directly: {wav_filename}, size: {os.path.getsize(wav_filename)}")
+                    wav_created = True
+
+            except Exception as e:
+                print(f"Direct WAV creation failed: {str(e)}")
+
+        # Read the WAV file if it was created
+        if wav_created:
+            try:
+                # Add a small delay to ensure the file is fully written
+                time.sleep(0.1)
+
+                # Read WAV file with scipy
+                print(f"Reading WAV file: {wav_filename}")
                 sample_rate, audio_data = wavfile.read(wav_filename)
+
                 # Convert to expected format
                 audio_data = audio_data.reshape(1, -1).astype(np.int16)
+                print(f"WAV file read successfully, shape: {audio_data.shape}, sample rate: {sample_rate}")
                 return (sample_rate, audio_data)
+
             except Exception as e:
-                print(f"Error reading WAV file with scipy: {e}")
-                # Try alternative approach with pydub
-                try:
-                    from pydub import AudioSegment
-                    sound = AudioSegment.from_file(wav_filename, format="wav")
-                    audio_data = np.array(sound.get_array_of_samples(), dtype=np.int16)
-                    audio_data = audio_data.reshape(1, -1)
-                    return (sound.frame_rate, audio_data)
-                except Exception as e2:
-                    print(f"Error with pydub fallback: {e2}")
+                print(f"Error reading WAV file: {str(e)}")
 
         # If all else fails, generate a simple tone
-        print("Falling back to synthetic audio tone")
+        print("All methods failed. Falling back to synthetic audio tone")
         sample_rate = 24000
-        duration_sec = len(text) * 0.1  # Rough estimate of speech duration
+        duration_sec = max(1, len(text) * 0.1)
         tone_length = int(sample_rate * duration_sec)
         audio_data = np.sin(2 * np.pi * np.arange(tone_length) * 440 / sample_rate)
         audio_data = (audio_data * 32767).astype(np.int16)
         audio_data = audio_data.reshape(1, -1)
         return (sample_rate, audio_data)
 
+    except Exception as e:
+        print(f"Unexpected error in text-to-speech: {str(e)}")
+        # Generate a simple tone as last resort
+        sample_rate = 24000
+        audio_data = np.sin(2 * np.pi * np.arange(sample_rate) * 440 / sample_rate)
+        audio_data = (audio_data * 32767).astype(np.int16)
+        audio_data = audio_data.reshape(1, -1)
+        return (sample_rate, audio_data)
+
     finally:
         # Clean up temporary files
         for filename in [mp3_filename, wav_filename]:
             try:
                 if os.path.exists(filename):
                     os.remove(filename)
-            except:
-                pass
+            except Exception as e:
+                print(f"Failed to remove temporary file {filename}: {str(e)}")
 
 # Initialize models
 print("Loading ASR model...")
@@ -183,10 +267,13 @@ def generate_response(prompt):
     full_prompt += "Assistant: "
 
     # Generate response with proper attention mask
+    # Ensure padding is done correctly with explicit parameters
    tokenized_inputs = llm_tokenizer(
        full_prompt,
        return_tensors="pt",
-        padding=True,
+        padding="max_length",
+        max_length=512,  # Fixed length helps with attention masks
+        truncation=True,
        return_attention_mask=True
    )
@@ -194,7 +281,7 @@ def generate_response(prompt):
     input_ids = tokenized_inputs["input_ids"].to(device)
     attention_mask = tokenized_inputs["attention_mask"].to(device)
 
-    # Generate response
+    # Generate response - explicitly pass all needed parameters
     with torch.no_grad():
         output = llm_model.generate(
             input_ids=input_ids,
@@ -202,7 +289,9 @@ def generate_response(prompt):
             max_new_tokens=128,
             do_sample=True,
             temperature=0.7,
-            top_p=0.9
+            top_p=0.9,
+            pad_token_id=llm_tokenizer.pad_token_id,  # Explicitly set pad token ID
+            eos_token_id=llm_tokenizer.eos_token_id  # Explicitly set EOS token ID
         )
 
     response_text = llm_tokenizer.decode(output[0], skip_special_tokens=True)
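The tokenizer and generate() changes can be checked end to end. A minimal sketch, again using facebook/opt-125m as a stand-in and a made-up prompt; note that padding to a fixed max_length mirrors the diff, although decoder-only models generally prefer left padding for generation:

# Sketch: fixed-length tokenization plus generation with explicit
# special-token IDs. opt-125m and the prompt text are stand-ins.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

enc = tok(
    "User: What's the weather like?\nAssistant: ",
    return_tensors="pt",
    padding="max_length",
    max_length=512,
    truncation=True,
    return_attention_mask=True,
)

with torch.no_grad():
    out = model.generate(
        input_ids=enc["input_ids"],
        attention_mask=enc["attention_mask"],
        max_new_tokens=32,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tok.pad_token_id,  # avoids the missing-pad-token warning
        eos_token_id=tok.eos_token_id,
    )

print(tok.decode(out[0], skip_special_tokens=True))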
 
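Method 3 in the TTS hunk above synthesizes a placeholder waveform with NumPy and writes it with SciPy. Stripped of logging and file-size checks, and with an illustrative output path, the core of that fallback is:

# Condensed sketch of the synthetic-audio fallback (Method 3 above).
import numpy as np
from scipy.io import wavfile

sample_rate = 24000
duration = 1.5  # seconds; the app scales this with text length
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)

# Sum a few phase-shifted sines for a speech-ish timbre
audio = sum(0.2 * np.sin(2 * np.pi * f * t + i)
            for i, f in enumerate([220, 440, 330, 550]))

# Linear attack/release envelope to avoid clicks at the edges
envelope = np.ones_like(t)
attack, release = int(0.01 * sample_rate), int(0.1 * sample_rate)
envelope[:attack] = np.linspace(0, 1, attack)
envelope[-release:] = np.linspace(1, 0, release)
audio *= envelope

# Normalize to int16 full scale and write a mono WAV
audio = (audio / np.max(np.abs(audio)) * 32767).astype(np.int16)
wavfile.write("fallback_tone.wav", sample_rate, audio)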
requirements.txt CHANGED
@@ -10,4 +10,5 @@ fastrtc[vad,tts]
 torchaudio
 gtts
 pydub
-scipy
+scipy
+time
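One caveat: time is a Python standard-library module, not a PyPI package, so this new requirements line is unnecessary and will likely make pip install -r requirements.txt fail. The import time that the app.py changes rely on needs no requirements entry at all; dropping this line and keeping only scipy as the addition should be enough.

# time ships with CPython; importing it requires no pip install
import time
print(time.time())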