qfuxa committed on
Commit
56a936d
·
1 Parent(s): ba5b33f

Improve speaker handling; update sleep duration and manage speaker transitions more effectively

Browse files
Files changed (1) hide show
  1. whisper_fastapi_online_server.py +16 -15
whisper_fastapi_online_server.py CHANGED
@@ -214,10 +214,10 @@ async def websocket_endpoint(websocket: WebSocket):
214
  else:
215
  chunk_history.append({
216
  "beg": time() - beg_loop,
217
- "end": time() - beg_loop + 0.1,
218
  "text": '',
219
  })
220
- sleep(0.1)
221
  buffer = ''
222
 
223
  if args.diarization:
@@ -225,28 +225,29 @@ async def websocket_endpoint(websocket: WebSocket):
225
  diarization.assign_speakers_to_chunks(chunk_history)
226
 
227
 
228
- current_speaker = -1
229
- lines = [{
230
- "beg": 0,
231
- "end": 0,
232
- "speaker": current_speaker,
233
- "text": ""
234
- }]
235
- for ch in chunk_history:
236
- if args.diarization and ch["speaker"] and ch["speaker"] != current_speaker:
237
- new_speaker = ch["speaker"]
238
  lines.append(
239
  {
240
- "speaker": new_speaker,
241
  "text": ch['text'],
242
  "beg": format_time(ch['beg']),
243
  "end": format_time(ch['end']),
 
244
  }
245
  )
246
- current_speaker = new_speaker
247
- else:
248
  lines[-1]["text"] += ch['text']
249
  lines[-1]["end"] = format_time(ch['end'])
 
 
250
 
251
  response = {"lines": lines, "buffer": buffer}
252
  await websocket.send_json(response)
 
214
  else:
215
  chunk_history.append({
216
  "beg": time() - beg_loop,
217
+ "end": time() - beg_loop + 1,
218
  "text": '',
219
  })
220
+ sleep(1)
221
  buffer = ''
222
 
223
  if args.diarization:
 
225
  diarization.assign_speakers_to_chunks(chunk_history)
226
 
227
 
228
+ current_speaker = 0
229
+ lines = []
230
+ last_end_diarized = 0
231
+ for ind, ch in enumerate(chunk_history):
232
+ speaker = ch.get("speaker", -3)
233
+ if speaker == -1 and ind < len(chunk_history) - 1:
234
+ continue
235
+ elif speaker != current_speaker:
 
 
236
  lines.append(
237
  {
238
+ "speaker": speaker,
239
  "text": ch['text'],
240
  "beg": format_time(ch['beg']),
241
  "end": format_time(ch['end']),
242
+ "diff": round(ch['end'] - last_end_diarized, 2)
243
  }
244
  )
245
+ current_speaker = speaker
246
+ elif speaker != -1:
247
  lines[-1]["text"] += ch['text']
248
  lines[-1]["end"] = format_time(ch['end'])
249
+ if speaker != -1:
250
+ last_end_diarized = max(ch['end'], last_end_diarized)
251
 
252
  response = {"lines": lines, "buffer": buffer}
253
  await websocket.send_json(response)