undiarized text is assigned to last speaker, with buffer information; traceback is used to format_exc errors
Browse files- whisper_fastapi_online_server.py +62 -18
whisper_fastapi_online_server.py
CHANGED
@@ -16,6 +16,7 @@ from src.whisper_streaming.timed_objects import ASRToken
|
|
16 |
import math
|
17 |
import logging
|
18 |
from datetime import timedelta
|
|
|
19 |
|
20 |
def format_time(seconds):
|
21 |
return str(timedelta(seconds=int(seconds)))
|
@@ -48,7 +49,7 @@ parser.add_argument(
|
|
48 |
parser.add_argument(
|
49 |
"--diarization",
|
50 |
type=bool,
|
51 |
-
default=
|
52 |
help="Whether to enable speaker diarization.",
|
53 |
)
|
54 |
|
@@ -81,6 +82,7 @@ class SharedState:
|
|
81 |
self.lock = asyncio.Lock()
|
82 |
self.beg_loop = time()
|
83 |
self.sep = " " # Default separator
|
|
|
84 |
|
85 |
async def update_transcription(self, new_tokens, buffer, end_buffer, full_transcription, sep):
|
86 |
async with self.lock:
|
@@ -119,7 +121,7 @@ class SharedState:
|
|
119 |
|
120 |
# Calculate remaining time for diarization
|
121 |
if self.end_attributed_speaker > 0:
|
122 |
-
remaining_time_diarization = max(0, round(
|
123 |
|
124 |
return {
|
125 |
"tokens": self.tokens.copy(),
|
@@ -142,6 +144,7 @@ class SharedState:
|
|
142 |
self.end_attributed_speaker = 0
|
143 |
self.full_transcription = ""
|
144 |
self.beg_loop = time()
|
|
|
145 |
|
146 |
##### LOAD APP #####
|
147 |
|
@@ -221,6 +224,7 @@ async def transcription_processor(shared_state, pcm_queue, online):
|
|
221 |
|
222 |
except Exception as e:
|
223 |
logger.warning(f"Exception in transcription_processor: {e}")
|
|
|
224 |
finally:
|
225 |
pcm_queue.task_done()
|
226 |
|
@@ -247,6 +251,7 @@ async def diarization_processor(shared_state, pcm_queue, diarization_obj):
|
|
247 |
|
248 |
except Exception as e:
|
249 |
logger.warning(f"Exception in diarization_processor: {e}")
|
|
|
250 |
finally:
|
251 |
pcm_queue.task_done()
|
252 |
|
@@ -272,21 +277,28 @@ async def results_formatter(shared_state, websocket):
|
|
272 |
|
273 |
# Process tokens to create response
|
274 |
previous_speaker = -10
|
275 |
-
lines = [
|
|
|
276 |
last_end_diarized = 0
|
|
|
277 |
|
278 |
for token in tokens:
|
279 |
speaker = token.speaker
|
|
|
280 |
if args.diarization:
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
|
|
|
|
|
|
|
|
287 |
last_end_diarized = max(token.end, last_end_diarized)
|
288 |
|
289 |
-
if speaker != previous_speaker:
|
290 |
lines.append(
|
291 |
{
|
292 |
"speaker": speaker,
|
@@ -302,22 +314,53 @@ async def results_formatter(shared_state, websocket):
|
|
302 |
lines[-1]["end"] = format_time(token.end)
|
303 |
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
|
304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
# Prepare response object
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
313 |
|
314 |
-
|
|
|
|
|
|
|
|
|
315 |
|
316 |
# Add a small delay to avoid overwhelming the client
|
317 |
await asyncio.sleep(0.1)
|
318 |
|
319 |
except Exception as e:
|
320 |
logger.warning(f"Exception in results_formatter: {e}")
|
|
|
321 |
await asyncio.sleep(0.5) # Back off on error
|
322 |
|
323 |
##### ENDPOINTS #####
|
@@ -422,6 +465,7 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
422 |
|
423 |
except Exception as e:
|
424 |
logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
|
|
|
425 |
break
|
426 |
|
427 |
logger.info("Exiting ffmpeg_stdout_reader...")
|
|
|
16 |
import math
|
17 |
import logging
|
18 |
from datetime import timedelta
|
19 |
+
import traceback
|
20 |
|
21 |
def format_time(seconds):
|
22 |
return str(timedelta(seconds=int(seconds)))
|
|
|
49 |
parser.add_argument(
|
50 |
"--diarization",
|
51 |
type=bool,
|
52 |
+
default=False,
|
53 |
help="Whether to enable speaker diarization.",
|
54 |
)
|
55 |
|
|
|
82 |
self.lock = asyncio.Lock()
|
83 |
self.beg_loop = time()
|
84 |
self.sep = " " # Default separator
|
85 |
+
self.last_response_content = "" # To track changes in response
|
86 |
|
87 |
async def update_transcription(self, new_tokens, buffer, end_buffer, full_transcription, sep):
|
88 |
async with self.lock:
|
|
|
121 |
|
122 |
# Calculate remaining time for diarization
|
123 |
if self.end_attributed_speaker > 0:
|
124 |
+
remaining_time_diarization = max(0, round(max(self.end_buffer, self.tokens[-1].end if self.tokens else 0) - self.end_attributed_speaker, 2))
|
125 |
|
126 |
return {
|
127 |
"tokens": self.tokens.copy(),
|
|
|
144 |
self.end_attributed_speaker = 0
|
145 |
self.full_transcription = ""
|
146 |
self.beg_loop = time()
|
147 |
+
self.last_response_content = ""
|
148 |
|
149 |
##### LOAD APP #####
|
150 |
|
|
|
224 |
|
225 |
except Exception as e:
|
226 |
logger.warning(f"Exception in transcription_processor: {e}")
|
227 |
+
logger.warning(f"Traceback: {traceback.format_exc()}")
|
228 |
finally:
|
229 |
pcm_queue.task_done()
|
230 |
|
|
|
251 |
|
252 |
except Exception as e:
|
253 |
logger.warning(f"Exception in diarization_processor: {e}")
|
254 |
+
logger.warning(f"Traceback: {traceback.format_exc()}")
|
255 |
finally:
|
256 |
pcm_queue.task_done()
|
257 |
|
|
|
277 |
|
278 |
# Process tokens to create response
|
279 |
previous_speaker = -10
|
280 |
+
lines = [
|
281 |
+
]
|
282 |
last_end_diarized = 0
|
283 |
+
undiarized_text = []
|
284 |
|
285 |
for token in tokens:
|
286 |
speaker = token.speaker
|
287 |
+
# Handle diarization differently if diarization is enabled
|
288 |
if args.diarization:
|
289 |
+
# If token is not yet processed by diarization
|
290 |
+
if (speaker == -1 or speaker == 0) and token.end >= end_attributed_speaker:
|
291 |
+
# Add this token's text to undiarized buffer instead of creating a new line
|
292 |
+
undiarized_text.append(token.text)
|
293 |
+
continue
|
294 |
+
# If speaker isn't assigned yet but should be (based on timestamp)
|
295 |
+
elif (speaker == -1 or speaker == 0) and token.end < end_attributed_speaker:
|
296 |
+
speaker = previous_speaker
|
297 |
+
# Track last diarized token end time
|
298 |
+
if speaker not in [-1, 0]:
|
299 |
last_end_diarized = max(token.end, last_end_diarized)
|
300 |
|
301 |
+
if speaker != previous_speaker or not lines:
|
302 |
lines.append(
|
303 |
{
|
304 |
"speaker": speaker,
|
|
|
314 |
lines[-1]["end"] = format_time(token.end)
|
315 |
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
|
316 |
|
317 |
+
# Update buffer_diarization with undiarized text
|
318 |
+
if undiarized_text:
|
319 |
+
combined_buffer_diarization = sep.join(undiarized_text)
|
320 |
+
if buffer_transcription:
|
321 |
+
combined_buffer_diarization += sep
|
322 |
+
await shared_state.update_diarization(end_attributed_speaker, combined_buffer_diarization)
|
323 |
+
buffer_diarization = combined_buffer_diarization
|
324 |
+
|
325 |
# Prepare response object
|
326 |
+
if lines:
|
327 |
+
response = {
|
328 |
+
"lines": lines,
|
329 |
+
"buffer_transcription": buffer_transcription,
|
330 |
+
"buffer_diarization": buffer_diarization,
|
331 |
+
"remaining_time_transcription": remaining_time_transcription,
|
332 |
+
"remaining_time_diarization": remaining_time_diarization
|
333 |
+
}
|
334 |
+
else:
|
335 |
+
response = {
|
336 |
+
"lines": [{
|
337 |
+
"speaker": 1,
|
338 |
+
"text": "",
|
339 |
+
"beg": format_time(0),
|
340 |
+
"end": format_time(token.end) if token else format_time(0),
|
341 |
+
"diff": 0
|
342 |
+
}],
|
343 |
+
"buffer_transcription": buffer_transcription,
|
344 |
+
"buffer_diarization": buffer_diarization,
|
345 |
+
"remaining_time_transcription": remaining_time_transcription,
|
346 |
+
"remaining_time_diarization": remaining_time_diarization
|
347 |
+
|
348 |
+
}
|
349 |
+
|
350 |
+
response_content = ' '.join([str(line['speaker']) + ' ' + line["text"] for line in lines]) + ' | ' + buffer_transcription + ' | ' + buffer_diarization
|
351 |
|
352 |
+
if response_content != shared_state.last_response_content:
|
353 |
+
# Only send if there's actual content to send
|
354 |
+
if lines or buffer_transcription or buffer_diarization:
|
355 |
+
await websocket.send_json(response)
|
356 |
+
shared_state.last_response_content = response_content
|
357 |
|
358 |
# Add a small delay to avoid overwhelming the client
|
359 |
await asyncio.sleep(0.1)
|
360 |
|
361 |
except Exception as e:
|
362 |
logger.warning(f"Exception in results_formatter: {e}")
|
363 |
+
logger.warning(f"Traceback: {traceback.format_exc()}")
|
364 |
await asyncio.sleep(0.5) # Back off on error
|
365 |
|
366 |
##### ENDPOINTS #####
|
|
|
465 |
|
466 |
except Exception as e:
|
467 |
logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
|
468 |
+
logger.warning(f"Traceback: {traceback.format_exc()}")
|
469 |
break
|
470 |
|
471 |
logger.info("Exiting ffmpeg_stdout_reader...")
|