qfuxa commited on
Commit
bd15235
·
1 Parent(s): d1918e7

undiarized text is assigned to last speaker, with buffer information; traceback is used to format_exc errors

Browse files
Files changed (1) hide show
  1. whisper_fastapi_online_server.py +62 -18
whisper_fastapi_online_server.py CHANGED
@@ -16,6 +16,7 @@ from src.whisper_streaming.timed_objects import ASRToken
16
  import math
17
  import logging
18
  from datetime import timedelta
 
19
 
20
  def format_time(seconds):
21
  return str(timedelta(seconds=int(seconds)))
@@ -48,7 +49,7 @@ parser.add_argument(
48
  parser.add_argument(
49
  "--diarization",
50
  type=bool,
51
- default=True,
52
  help="Whether to enable speaker diarization.",
53
  )
54
 
@@ -81,6 +82,7 @@ class SharedState:
81
  self.lock = asyncio.Lock()
82
  self.beg_loop = time()
83
  self.sep = " " # Default separator
 
84
 
85
  async def update_transcription(self, new_tokens, buffer, end_buffer, full_transcription, sep):
86
  async with self.lock:
@@ -119,7 +121,7 @@ class SharedState:
119
 
120
  # Calculate remaining time for diarization
121
  if self.end_attributed_speaker > 0:
122
- remaining_time_diarization = max(0, round(current_time - self.beg_loop - self.end_attributed_speaker, 2))
123
 
124
  return {
125
  "tokens": self.tokens.copy(),
@@ -142,6 +144,7 @@ class SharedState:
142
  self.end_attributed_speaker = 0
143
  self.full_transcription = ""
144
  self.beg_loop = time()
 
145
 
146
  ##### LOAD APP #####
147
 
@@ -221,6 +224,7 @@ async def transcription_processor(shared_state, pcm_queue, online):
221
 
222
  except Exception as e:
223
  logger.warning(f"Exception in transcription_processor: {e}")
 
224
  finally:
225
  pcm_queue.task_done()
226
 
@@ -247,6 +251,7 @@ async def diarization_processor(shared_state, pcm_queue, diarization_obj):
247
 
248
  except Exception as e:
249
  logger.warning(f"Exception in diarization_processor: {e}")
 
250
  finally:
251
  pcm_queue.task_done()
252
 
@@ -272,21 +277,28 @@ async def results_formatter(shared_state, websocket):
272
 
273
  # Process tokens to create response
274
  previous_speaker = -10
275
- lines = []
 
276
  last_end_diarized = 0
 
277
 
278
  for token in tokens:
279
  speaker = token.speaker
 
280
  if args.diarization:
281
- if speaker == -1 or speaker == 0:
282
- if token.end < end_attributed_speaker:
283
- speaker = previous_speaker
284
- else:
285
- speaker = 0
286
- else:
 
 
 
 
287
  last_end_diarized = max(token.end, last_end_diarized)
288
 
289
- if speaker != previous_speaker:
290
  lines.append(
291
  {
292
  "speaker": speaker,
@@ -302,22 +314,53 @@ async def results_formatter(shared_state, websocket):
302
  lines[-1]["end"] = format_time(token.end)
303
  lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
304
 
 
 
 
 
 
 
 
 
305
  # Prepare response object
306
- response = {
307
- "lines": lines,
308
- "buffer_transcription": buffer_transcription,
309
- "buffer_diarization": buffer_diarization,
310
- "remaining_time_transcription": remaining_time_transcription,
311
- "remaining_time_diarization": remaining_time_diarization
312
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
314
- await websocket.send_json(response)
 
 
 
 
315
 
316
  # Add a small delay to avoid overwhelming the client
317
  await asyncio.sleep(0.1)
318
 
319
  except Exception as e:
320
  logger.warning(f"Exception in results_formatter: {e}")
 
321
  await asyncio.sleep(0.5) # Back off on error
322
 
323
  ##### ENDPOINTS #####
@@ -422,6 +465,7 @@ async def websocket_endpoint(websocket: WebSocket):
422
 
423
  except Exception as e:
424
  logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
 
425
  break
426
 
427
  logger.info("Exiting ffmpeg_stdout_reader...")
 
16
  import math
17
  import logging
18
  from datetime import timedelta
19
+ import traceback
20
 
21
  def format_time(seconds):
22
  return str(timedelta(seconds=int(seconds)))
 
49
  parser.add_argument(
50
  "--diarization",
51
  type=bool,
52
+ default=False,
53
  help="Whether to enable speaker diarization.",
54
  )
55
 
 
82
  self.lock = asyncio.Lock()
83
  self.beg_loop = time()
84
  self.sep = " " # Default separator
85
+ self.last_response_content = "" # To track changes in response
86
 
87
  async def update_transcription(self, new_tokens, buffer, end_buffer, full_transcription, sep):
88
  async with self.lock:
 
121
 
122
  # Calculate remaining time for diarization
123
  if self.end_attributed_speaker > 0:
124
+ remaining_time_diarization = max(0, round(max(self.end_buffer, self.tokens[-1].end if self.tokens else 0) - self.end_attributed_speaker, 2))
125
 
126
  return {
127
  "tokens": self.tokens.copy(),
 
144
  self.end_attributed_speaker = 0
145
  self.full_transcription = ""
146
  self.beg_loop = time()
147
+ self.last_response_content = ""
148
 
149
  ##### LOAD APP #####
150
 
 
224
 
225
  except Exception as e:
226
  logger.warning(f"Exception in transcription_processor: {e}")
227
+ logger.warning(f"Traceback: {traceback.format_exc()}")
228
  finally:
229
  pcm_queue.task_done()
230
 
 
251
 
252
  except Exception as e:
253
  logger.warning(f"Exception in diarization_processor: {e}")
254
+ logger.warning(f"Traceback: {traceback.format_exc()}")
255
  finally:
256
  pcm_queue.task_done()
257
 
 
277
 
278
  # Process tokens to create response
279
  previous_speaker = -10
280
+ lines = [
281
+ ]
282
  last_end_diarized = 0
283
+ undiarized_text = []
284
 
285
  for token in tokens:
286
  speaker = token.speaker
287
+ # Handle diarization differently if diarization is enabled
288
  if args.diarization:
289
+ # If token is not yet processed by diarization
290
+ if (speaker == -1 or speaker == 0) and token.end >= end_attributed_speaker:
291
+ # Add this token's text to undiarized buffer instead of creating a new line
292
+ undiarized_text.append(token.text)
293
+ continue
294
+ # If speaker isn't assigned yet but should be (based on timestamp)
295
+ elif (speaker == -1 or speaker == 0) and token.end < end_attributed_speaker:
296
+ speaker = previous_speaker
297
+ # Track last diarized token end time
298
+ if speaker not in [-1, 0]:
299
  last_end_diarized = max(token.end, last_end_diarized)
300
 
301
+ if speaker != previous_speaker or not lines:
302
  lines.append(
303
  {
304
  "speaker": speaker,
 
314
  lines[-1]["end"] = format_time(token.end)
315
  lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
316
 
317
+ # Update buffer_diarization with undiarized text
318
+ if undiarized_text:
319
+ combined_buffer_diarization = sep.join(undiarized_text)
320
+ if buffer_transcription:
321
+ combined_buffer_diarization += sep
322
+ await shared_state.update_diarization(end_attributed_speaker, combined_buffer_diarization)
323
+ buffer_diarization = combined_buffer_diarization
324
+
325
  # Prepare response object
326
+ if lines:
327
+ response = {
328
+ "lines": lines,
329
+ "buffer_transcription": buffer_transcription,
330
+ "buffer_diarization": buffer_diarization,
331
+ "remaining_time_transcription": remaining_time_transcription,
332
+ "remaining_time_diarization": remaining_time_diarization
333
+ }
334
+ else:
335
+ response = {
336
+ "lines": [{
337
+ "speaker": 1,
338
+ "text": "",
339
+ "beg": format_time(0),
340
+ "end": format_time(token.end) if token else format_time(0),
341
+ "diff": 0
342
+ }],
343
+ "buffer_transcription": buffer_transcription,
344
+ "buffer_diarization": buffer_diarization,
345
+ "remaining_time_transcription": remaining_time_transcription,
346
+ "remaining_time_diarization": remaining_time_diarization
347
+
348
+ }
349
+
350
+ response_content = ' '.join([str(line['speaker']) + ' ' + line["text"] for line in lines]) + ' | ' + buffer_transcription + ' | ' + buffer_diarization
351
 
352
+ if response_content != shared_state.last_response_content:
353
+ # Only send if there's actual content to send
354
+ if lines or buffer_transcription or buffer_diarization:
355
+ await websocket.send_json(response)
356
+ shared_state.last_response_content = response_content
357
 
358
  # Add a small delay to avoid overwhelming the client
359
  await asyncio.sleep(0.1)
360
 
361
  except Exception as e:
362
  logger.warning(f"Exception in results_formatter: {e}")
363
+ logger.warning(f"Traceback: {traceback.format_exc()}")
364
  await asyncio.sleep(0.5) # Back off on error
365
 
366
  ##### ENDPOINTS #####
 
465
 
466
  except Exception as e:
467
  logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
468
+ logger.warning(f"Traceback: {traceback.format_exc()}")
469
  break
470
 
471
  logger.info("Exiting ffmpeg_stdout_reader...")