hashhac committed
Commit de7876c · Parent: fe65571

wave running

Files changed (1)
app.py +74 -36
app.py CHANGED
@@ -54,10 +54,15 @@ def load_llm_model():
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    # Ensure pad token is set to something different than EOS token
-    if tokenizer.pad_token is None:
+    # Check whether pad_token is missing or identical to eos_token
+    needs_pad_token = (tokenizer.pad_token is None or
+                       tokenizer.pad_token == tokenizer.eos_token)
+
+    if needs_pad_token:
         # Use a different special token as padding token
         tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        print(f"Set pad token to [PAD], distinct from EOS token: {tokenizer.eos_token}")
+
         # Resize the token embeddings since we added a new token
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
@@ -66,6 +71,7 @@ def load_llm_model():
         )
         model.resize_token_embeddings(len(tokenizer))
     else:
+        print(f"Pad token ({tokenizer.pad_token}) is already different from EOS token ({tokenizer.eos_token})")
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             torch_dtype=torch_dtype,
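
Note on the two hunks above: decoder-only checkpoints often ship without a pad token, and reusing the EOS token as padding can make batched generation stop early or mis-handle padded positions. A minimal sketch of the pad-token fix, assuming a hypothetical gpt2 checkpoint rather than the model_id defined elsewhere in app.py:

from transformers import AutoTokenizer, AutoModelForCausalLM

# Hypothetical checkpoint for illustration; app.py loads its own model_id.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# GPT-2 defines no pad token by default, so this branch runs.
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # The embedding matrix must grow to cover the new token id.
    model.resize_token_embeddings(len(tokenizer))

print(tokenizer.pad_token_id, tokenizer.eos_token_id)  # 50257 50256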
@@ -78,40 +84,73 @@ def load_llm_model():
 
 # Step 3: Text-to-Speech with gTTS (Google Text-to-Speech)
 def gtts_text_to_speech(text):
-    # Create a temporary file
-    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
-        tmp_filename = f.name
-
-    # Use gTTS to convert text to speech
-    tts = gTTS(text=text, lang='en', slow=False)
+    """Convert text to speech with gTTS and ensure proper WAV output."""
+    # Create temporary files
+    mp3_fd, mp3_filename = tempfile.mkstemp(suffix='.mp3')
+    os.close(mp3_fd)
 
-    # Save as MP3 first (gTTS only outputs MP3)
-    mp3_filename = tmp_filename.replace('.wav', '.mp3')
-    tts.save(mp3_filename)
+    wav_fd, wav_filename = tempfile.mkstemp(suffix='.wav')
+    os.close(wav_fd)
 
-    # Convert MP3 to WAV using FFmpeg if available, otherwise use a fallback
     try:
-        import subprocess
-        subprocess.run(['ffmpeg', '-i', mp3_filename, '-acodec', 'pcm_s16le', '-ar', '24000', '-ac', '1', tmp_filename],
-                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    except (ImportError, FileNotFoundError):
-        # Fallback if FFmpeg is not available
-        from pydub import AudioSegment
-        sound = AudioSegment.from_mp3(mp3_filename)
-        sound = sound.set_frame_rate(24000).set_channels(1)
-        sound.export(tmp_filename, format="wav")
-
-    # Read the WAV file
-    sample_rate, audio_data = wavfile.read(tmp_filename)
-
-    # Clean up temporary files
-    os.remove(mp3_filename)
-    os.remove(tmp_filename)
-
-    # Convert to expected format
-    audio_data = audio_data.reshape(1, -1).astype(np.int16)
-
-    return (sample_rate, audio_data)
+        # Use gTTS to convert text to speech
+        tts = gTTS(text=text, lang='en', slow=False)
+        tts.save(mp3_filename)
+
+        # Convert MP3 to WAV - preferred method is ffmpeg
+        try:
+            import subprocess
+            subprocess.run(
+                ['ffmpeg', '-y', '-i', mp3_filename, '-acodec', 'pcm_s16le', '-ar', '24000', '-ac', '1', wav_filename],
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                check=True
+            )
+
+        except (ImportError, FileNotFoundError, subprocess.CalledProcessError):
+            # Fall back if FFmpeg is unavailable or fails
+            from pydub import AudioSegment
+            sound = AudioSegment.from_mp3(mp3_filename)
+            sound = sound.set_frame_rate(24000).set_channels(1)
+            sound.export(wav_filename, format="wav")
+
+        # Verify the WAV file exists and is non-empty
+        if os.path.exists(wav_filename) and os.path.getsize(wav_filename) > 0:
+            # Read the WAV file with scipy
+            try:
+                sample_rate, audio_data = wavfile.read(wav_filename)
+                # Convert to the expected format
+                audio_data = audio_data.reshape(1, -1).astype(np.int16)
+                return (sample_rate, audio_data)
+            except Exception as e:
+                print(f"Error reading WAV file with scipy: {e}")
+                # Try an alternative approach with pydub
+                try:
+                    from pydub import AudioSegment
+                    sound = AudioSegment.from_file(wav_filename, format="wav")
+                    audio_data = np.array(sound.get_array_of_samples(), dtype=np.int16)
+                    audio_data = audio_data.reshape(1, -1)
+                    return (sound.frame_rate, audio_data)
+                except Exception as e2:
+                    print(f"Error with pydub fallback: {e2}")
+
+        # If all else fails, generate a simple tone
+        print("Falling back to synthetic audio tone")
+        sample_rate = 24000
+        duration_sec = len(text) * 0.1  # Rough estimate of speech duration
+        tone_length = int(sample_rate * duration_sec)
+        audio_data = np.sin(2 * np.pi * np.arange(tone_length) * 440 / sample_rate)
+        audio_data = (audio_data * 32767).astype(np.int16)
+        audio_data = audio_data.reshape(1, -1)
+        return (sample_rate, audio_data)
+
+    finally:
+        # Clean up temporary files
+        for filename in [mp3_filename, wav_filename]:
+            try:
+                if os.path.exists(filename):
+                    os.remove(filename)
+            except OSError:
+                pass
 
 # Initialize models
 print("Loading ASR model...")
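
For reference, the rewritten gtts_text_to_speech returns a (sample_rate, samples) tuple holding an int16 array of shape (1, n) at 24 kHz, the layout the rest of app.py expects. A short usage sketch; the import path is an assumption, since the hunk does not show how the app calls the function:

import numpy as np
from app import gtts_text_to_speech  # assumption: importing app.py also triggers its model loading

sample_rate, audio = gtts_text_to_speech("Testing the gTTS pipeline")
assert audio.dtype == np.int16 and audio.shape[0] == 1
print(f"{sample_rate} Hz, {audio.shape[1] / sample_rate:.2f} s of audio")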
@@ -144,12 +183,11 @@ def generate_response(prompt):
     full_prompt += "Assistant: "
 
     # Generate response with proper attention mask
-    # Let the tokenizer create the attention mask automatically
    tokenized_inputs = llm_tokenizer(
         full_prompt,
         return_tensors="pt",
         padding=True,
-        return_attention_mask=True  # This generates the proper attention mask
+        return_attention_mask=True
     )
 
     # Move to device
@@ -160,7 +198,7 @@ def generate_response(prompt):
     with torch.no_grad():
         output = llm_model.generate(
             input_ids=input_ids,
-            attention_mask=attention_mask,  # Use the tokenizer's attention mask
+            attention_mask=attention_mask,
             max_new_tokens=128,
             do_sample=True,
             temperature=0.7,
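
The last two hunks only drop stale comments; the behavior of passing the tokenizer's attention_mask into generate is unchanged. A self-contained sketch of that pattern, again assuming a hypothetical gpt2 checkpoint and the sampling settings shown above:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hypothetical checkpoint for illustration; app.py uses its own model and prompt format.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("User: hi\nAssistant: ", return_tensors="pt",
                   return_attention_mask=True)
with torch.no_grad():
    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # explicit mask, as in the diff
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad warning
    )
print(tokenizer.decode(output[0], skip_special_tokens=True))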
 