husseinelsaadi committed
Commit 1f41a8a · 1 Parent(s): 330157f

update whisper

backend/services/interview_engine.py CHANGED
@@ -7,6 +7,7 @@ from langchain_groq import ChatGroq
 import logging
 import tempfile
 import shutil
+import torch
 
 # Initialize models
 chat_groq_api = os.getenv("GROQ_API_KEY")
@@ -25,7 +26,7 @@ def load_whisper_model():
     global whisper_model
     if whisper_model is None:
         try:
-            device = "cuda" if os.system("nvidia-smi") == 0 else "cpu"
+            device = "cuda" if torch.cuda.is_available() else "cpu"
             compute_type = "float16" if device == "cuda" else "int8"
             whisper_model = WhisperModel("base", device=device, compute_type=compute_type)
             logging.info(f"Whisper model loaded on {device} with {compute_type}")
@@ -175,59 +176,30 @@ def convert_webm_to_wav(webm_path, wav_path):
     return None
 
 def whisper_stt(audio_path):
-    """Speech-to-text using Faster-Whisper with better error handling"""
     try:
         if not audio_path or not os.path.exists(audio_path):
             logging.error(f"Audio file does not exist: {audio_path}")
             return ""
-
-        # Check if file has content
-        file_size = os.path.getsize(audio_path)
-        if file_size == 0:
+
+        if os.path.getsize(audio_path) == 0:
             logging.error(f"Audio file is empty: {audio_path}")
             return ""
-
-        logging.info(f"Processing audio file: {audio_path} ({file_size} bytes)")
-
-        # If the file is WebM, try to convert it to WAV
-        if audio_path.endswith('.webm'):
-            wav_path = audio_path.replace('.webm', '.wav')
-            converted_path = convert_webm_to_wav(audio_path, wav_path)
-            if converted_path:
-                audio_path = converted_path
-            else:
-                logging.warning("Could not convert WebM to WAV, trying with original file")
-
-        model = load_whisper_model()
-
-        # Add timeout and better error handling
-        try:
-            segments, info = model.transcribe(
-                audio_path,
-                language="en",  # Specify language for better performance
-                task="transcribe",
-                vad_filter=True,  # Voice activity detection
-                vad_parameters=dict(min_silence_duration_ms=500)
-            )
-
-            transcript_parts = []
-            for segment in segments:
-                if hasattr(segment, 'text') and segment.text.strip():
-                    transcript_parts.append(segment.text.strip())
-
-            transcript = " ".join(transcript_parts)
-
-            if transcript:
-                logging.info(f"Transcription successful: '{transcript[:100]}...'")
-            else:
-                logging.warning("No speech detected in audio file")
-
-            return transcript.strip()
-
-        except Exception as e:
-            logging.error(f"Error during transcription: {e}")
+
+        # Convert WebM to WAV using ffmpeg (ensure ffmpeg is available)
+        converted_path = audio_path.replace(".webm", ".wav")
+        subprocess.run([
+            "ffmpeg", "-y", "-i", audio_path, "-ar", "16000", "-ac", "1", converted_path
+        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+        if not os.path.exists(converted_path) or os.path.getsize(converted_path) == 0:
+            logging.error(f"Conversion failed or produced empty file: {converted_path}")
             return ""
-
+
+        model = load_whisper_model()
+        segments, _ = model.transcribe(converted_path)
+        transcript = " ".join(segment.text for segment in segments)
+        return transcript.strip()
+
     except Exception as e:
         logging.error(f"Error in STT: {e}")
         return ""
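Two caveats on the rewritten conversion. First, the hunk calls `subprocess.run` but the diff adds no `import subprocess`, so the module must already import it elsewhere or this line raises `NameError`. Second, with stderr routed to `DEVNULL` and no `check=True`, a failed conversion is only caught indirectly by the empty-file test, and when the input is not a `.webm` file, `replace(".webm", ".wav")` leaves `converted_path` equal to `audio_path`, asking ffmpeg to overwrite its own input. A stricter sketch (`convert_to_wav` is a hypothetical name; the 16 kHz mono target matches the committed command):

```python
# Sketch of a stricter conversion; convert_to_wav is hypothetical.
import logging
import subprocess

def convert_to_wav(src_path: str, dst_path: str) -> bool:
    """Convert any ffmpeg-readable audio file to 16 kHz mono WAV."""
    if src_path == dst_path:  # guards the .replace() no-op case
        logging.error("Refusing to overwrite the input file in place")
        return False
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", src_path, "-ar", "16000", "-ac", "1", dst_path],
            check=True,           # raise on a non-zero ffmpeg exit status
            capture_output=True,  # keep stderr for the log instead of discarding it
        )
        return True
    except FileNotFoundError:
        logging.error("ffmpeg binary not found on PATH")
        return False
    except subprocess.CalledProcessError as exc:
        logging.error("ffmpeg failed: %s", exc.stderr.decode(errors="replace"))
        return False
```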
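The simplification also drops the `language`, `vad_filter`, and `vad_parameters` arguments the old code passed to `transcribe`, so the new path pays for language detection on every call and no longer skips silence. Restoring them on top of the new flow is small; a sketch reusing the exact options from the deleted lines (`transcribe_wav` is a hypothetical wrapper):

```python
# Sketch: the new flow plus the faster-whisper options the old code used.
def transcribe_wav(wav_path: str) -> str:
    model = load_whisper_model()  # defined earlier in interview_engine.py
    segments, info = model.transcribe(
        wav_path,
        language="en",                                     # skip language detection
        vad_filter=True,                                   # drop non-speech spans
        vad_parameters=dict(min_silence_duration_ms=500),  # as in the old code
    )
    return " ".join(s.text.strip() for s in segments if s.text.strip())
```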
backend/templates/interview.html CHANGED
@@ -695,7 +695,10 @@
                 delete options.mimeType;
             }
 
-            this.mediaRecorder = new MediaRecorder(stream, options);
+            this.mediaRecorder = new MediaRecorder(stream, {
+                mimeType: 'audio/webm;codecs=opus'
+            });
+
             this.audioChunks = [];
 
             this.mediaRecorder.ondataavailable = (event) => {
@@ -757,7 +760,8 @@
             console.log('Processing', this.audioChunks.length, 'audio chunks');
 
             // Create blob from audio chunks
-            const audioBlob = new Blob(this.audioChunks, { type: 'audio/webm;codecs=opus' });
+            const audioBlob = new Blob(this.audioChunks, { type: 'audio/webm' });
+            formData.append('audio', audioBlob, 'recording.webm');
 
             console.log('Created audio blob:', audioBlob.size, 'bytes');
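One note on the recorder change: the new call hardcodes `audio/webm;codecs=opus` even though the lines just above it delete `options.mimeType` when a type is unsupported, so browsers without WebM/Opus recording (notably Safari) may throw at construction; keeping a `MediaRecorder.isTypeSupported` guard would preserve the old fallback. The upload contract is now explicit, a multipart field named `audio` carrying `recording.webm`, and the backend route that consumes it is not shown in this diff. A hypothetical Flask sketch of that receiving end (route name, import path, and Flask itself are assumptions; only `whisper_stt` comes from this repo):

```python
# Hypothetical receiving endpoint; route and import path are assumed.
import os
import tempfile

from flask import Flask, jsonify, request

from backend.services.interview_engine import whisper_stt  # assumed module path

app = Flask(__name__)

@app.route("/api/transcribe", methods=["POST"])
def transcribe():
    upload = request.files.get("audio")  # field name set by formData.append
    if upload is None:
        return jsonify({"error": "missing 'audio' form field"}), 400
    # Keep the .webm suffix so whisper_stt's .webm -> .wav replacement applies.
    fd, path = tempfile.mkstemp(suffix=".webm")
    os.close(fd)
    try:
        upload.save(path)
        return jsonify({"transcript": whisper_stt(path)})
    finally:
        os.unlink(path)
```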