Commit 1f41a8a
1 Parent(s): 330157f

update whisper
backend/services/interview_engine.py
CHANGED
@@ -7,6 +7,7 @@ from langchain_groq import ChatGroq
 import logging
 import tempfile
 import shutil
+import torch

 # Initialize models
 chat_groq_api = os.getenv("GROQ_API_KEY")
@@ -25,7 +26,7 @@ def load_whisper_model():
     global whisper_model
     if whisper_model is None:
         try:
-            device = "cuda" if
+            device = "cuda" if torch.cuda.is_available() else "cpu"
             compute_type = "float16" if device == "cuda" else "int8"
             whisper_model = WhisperModel("base", device=device, compute_type=compute_type)
             logging.info(f"Whisper model loaded on {device} with {compute_type}")
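The only change to load_whisper_model() is that device selection now leans on torch. For reference, here is a minimal, self-contained sketch of the same lazy-loading pattern; the model size ("base") and the float16/int8 split come from the hunk above, while the faster_whisper import and the return value are assumptions about the surrounding module:

import logging

import torch
from faster_whisper import WhisperModel

whisper_model = None  # module-level cache: load the model once, reuse it afterwards


def load_whisper_model():
    """Lazily load faster-whisper, preferring the GPU when one is visible to torch."""
    global whisper_model
    if whisper_model is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # float16 is the usual choice on CUDA; int8 keeps CPU memory and latency down
        compute_type = "float16" if device == "cuda" else "int8"
        whisper_model = WhisperModel("base", device=device, compute_type=compute_type)
        logging.info(f"Whisper model loaded on {device} with {compute_type}")
    return whisper_model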
@@ -175,59 +176,30 @@ def convert_webm_to_wav(webm_path, wav_path):
     return None

 def whisper_stt(audio_path):
-    """Speech-to-text using Faster-Whisper with better error handling"""
     try:
         if not audio_path or not os.path.exists(audio_path):
             logging.error(f"Audio file does not exist: {audio_path}")
             return ""
-
-
-        file_size = os.path.getsize(audio_path)
-        if file_size == 0:
+
+        if os.path.getsize(audio_path) == 0:
             logging.error(f"Audio file is empty: {audio_path}")
             return ""
-
-
-
-
-
-
-
-
-
-        else:
-            logging.warning("Could not convert WebM to WAV, trying with original file")
-
-        model = load_whisper_model()
-
-        # Add timeout and better error handling
-        try:
-            segments, info = model.transcribe(
-                audio_path,
-                language="en",  # Specify language for better performance
-                task="transcribe",
-                vad_filter=True,  # Voice activity detection
-                vad_parameters=dict(min_silence_duration_ms=500)
-            )
-
-            transcript_parts = []
-            for segment in segments:
-                if hasattr(segment, 'text') and segment.text.strip():
-                    transcript_parts.append(segment.text.strip())
-
-            transcript = " ".join(transcript_parts)
-
-            if transcript:
-                logging.info(f"Transcription successful: '{transcript[:100]}...'")
-            else:
-                logging.warning("No speech detected in audio file")
-
-            return transcript.strip()
-
-        except Exception as e:
-            logging.error(f"Error during transcription: {e}")
+
+        # Convert WebM to WAV using ffmpeg (ensure ffmpeg is available)
+        converted_path = audio_path.replace(".webm", ".wav")
+        subprocess.run([
+            "ffmpeg", "-y", "-i", audio_path, "-ar", "16000", "-ac", "1", converted_path
+        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+        if not os.path.exists(converted_path) or os.path.getsize(converted_path) == 0:
+            logging.error(f"Conversion failed or produced empty file: {converted_path}")
             return ""
-
+
+        model = load_whisper_model()
+        segments, _ = model.transcribe(converted_path)
+        transcript = " ".join(segment.text for segment in segments)
+        return transcript.strip()
+
     except Exception as e:
         logging.error(f"Error in STT: {e}")
         return ""
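The rewritten whisper_stt() drops the old in-function try/except around model.transcribe and instead shells out to ffmpeg before transcribing. Two things the hunk itself does not show: subprocess is not imported in this diff, so it presumably already appears near the other imports, and ffmpeg must be available on the PATH of the running Space. Below is a minimal sketch of the same convert-then-transcribe flow, with the model passed in explicitly (the committed code uses the module-level lazy loader instead); the helper name transcribe_webm is illustrative only:

import logging
import os
import subprocess

from faster_whisper import WhisperModel


def transcribe_webm(audio_path, model):
    """Convert a WebM recording to 16 kHz mono WAV with ffmpeg, then transcribe it."""
    if not audio_path or not os.path.exists(audio_path) or os.path.getsize(audio_path) == 0:
        logging.error(f"Missing or empty audio file: {audio_path}")
        return ""

    # -y overwrites an existing output, -ar 16000 resamples to 16 kHz, -ac 1 downmixes to mono
    converted_path = audio_path.replace(".webm", ".wav")
    subprocess.run(
        ["ffmpeg", "-y", "-i", audio_path, "-ar", "16000", "-ac", "1", converted_path],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    if not os.path.exists(converted_path) or os.path.getsize(converted_path) == 0:
        logging.error(f"Conversion failed or produced empty file: {converted_path}")
        return ""

    # faster-whisper returns a generator of segments plus an info object
    segments, _ = model.transcribe(converted_path)
    return " ".join(segment.text for segment in segments).strip()


if __name__ == "__main__":
    model = WhisperModel("base", device="cpu", compute_type="int8")
    print(transcribe_webm("recording.webm", model))

Compared with the removed version, the new call drops the language="en" hint and the vad_filter settings, so transcription now auto-detects the language and no longer filters long silences. Note also that audio_path.replace(".webm", ".wav") only yields a distinct output path when the upload really is a .webm file, which matches the recording.webm filename used by the front end below.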
backend/templates/interview.html
CHANGED
@@ -695,7 +695,10 @@
             delete options.mimeType;
         }

-        this.mediaRecorder = new MediaRecorder(stream,
+        this.mediaRecorder = new MediaRecorder(stream, {
+            mimeType: 'audio/webm;codecs=opus'
+        });
+
         this.audioChunks = [];

         this.mediaRecorder.ondataavailable = (event) => {
@@ -757,7 +760,8 @@
         console.log('Processing', this.audioChunks.length, 'audio chunks');

         // Create blob from audio chunks
-        const audioBlob = new Blob(this.audioChunks, { type: 'audio/webm
+        const audioBlob = new Blob(this.audioChunks, { type: 'audio/webm' });
+        formData.append('audio', audioBlob, 'recording.webm');

         console.log('Created audio blob:', audioBlob.size, 'bytes');

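On the front end the recorder now pins mimeType to audio/webm;codecs=opus and uploads the blob as recording.webm under the audio form field. The server route that receives that field is not part of this commit; the following is a hypothetical Flask-style sketch (route path, framework, and the whisper_stt import path are all assumptions based on the repo layout) of how such an upload could be written to a temporary .webm file and handed to the engine:

import os
import tempfile

from flask import Flask, jsonify, request

from services.interview_engine import whisper_stt  # assumed import path

app = Flask(__name__)


@app.route("/api/transcribe", methods=["POST"])  # hypothetical route name
def transcribe():
    # The recorder appends the blob under the "audio" key with filename recording.webm
    upload = request.files.get("audio")
    if upload is None:
        return jsonify({"error": "no audio part in request"}), 400

    # Keep the .webm suffix so the ".webm" -> ".wav" replace in whisper_stt works
    fd, webm_path = tempfile.mkstemp(suffix=".webm")
    os.close(fd)
    try:
        upload.save(webm_path)
        transcript = whisper_stt(webm_path)
    finally:
        os.remove(webm_path)

    return jsonify({"transcript": transcript})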