Improve speaker handling; update sleep duration and manage speaker transitions more effectively
Browse files
- whisper_fastapi_online_server.py +16 -15
whisper_fastapi_online_server.py
CHANGED
@@ -214,10 +214,10 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
214 |
else:
|
215 |
chunk_history.append({
|
216 |
"beg": time() - beg_loop,
|
217 |
-
"end": time() - beg_loop +
|
218 |
"text": '',
|
219 |
})
|
220 |
-
sleep(
|
221 |
buffer = ''
|
222 |
|
223 |
if args.diarization:
|
@@ -225,28 +225,29 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
225 |
diarization.assign_speakers_to_chunks(chunk_history)
|
226 |
|
227 |
|
228 |
-
current_speaker =
|
229 |
-
lines = [
|
230 |
-
|
231 |
-
|
232 |
-
"speaker"
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
if args.diarization and ch["speaker"] and ch["speaker"] != current_speaker:
|
237 |
-
new_speaker = ch["speaker"]
|
238 |
lines.append(
|
239 |
{
|
240 |
-
"speaker":
|
241 |
"text": ch['text'],
|
242 |
"beg": format_time(ch['beg']),
|
243 |
"end": format_time(ch['end']),
|
|
|
244 |
}
|
245 |
)
|
246 |
-
current_speaker =
|
247 |
-
|
248 |
lines[-1]["text"] += ch['text']
|
249 |
lines[-1]["end"] = format_time(ch['end'])
|
|
|
|
|
250 |
|
251 |
response = {"lines": lines, "buffer": buffer}
|
252 |
await websocket.send_json(response)
|
|
|
214 |
else:
|
215 |
chunk_history.append({
|
216 |
"beg": time() - beg_loop,
|
217 |
+
"end": time() - beg_loop + 1,
|
218 |
"text": '',
|
219 |
})
|
220 |
+
sleep(1)
|
221 |
buffer = ''
|
222 |
|
223 |
if args.diarization:
|
|
|
225 |
diarization.assign_speakers_to_chunks(chunk_history)
|
226 |
|
227 |
|
228 |
+
current_speaker = 0
|
229 |
+
lines = []
|
230 |
+
last_end_diarized = 0
|
231 |
+
for ind, ch in enumerate(chunk_history):
|
232 |
+
speaker = ch.get("speaker", -3)
|
233 |
+
if speaker == -1 and ind < len(chunk_history) - 1:
|
234 |
+
continue
|
235 |
+
elif speaker != current_speaker:
|
|
|
|
|
236 |
lines.append(
|
237 |
{
|
238 |
+
"speaker": speaker,
|
239 |
"text": ch['text'],
|
240 |
"beg": format_time(ch['beg']),
|
241 |
"end": format_time(ch['end']),
|
242 |
+
"diff": round(ch['end'] - last_end_diarized, 2)
|
243 |
}
|
244 |
)
|
245 |
+
current_speaker = speaker
|
246 |
+
elif speaker != -1:
|
247 |
lines[-1]["text"] += ch['text']
|
248 |
lines[-1]["end"] = format_time(ch['end'])
|
249 |
+
if speaker != -1:
|
250 |
+
last_end_diarized = max(ch['end'], last_end_diarized)
|
251 |
|
252 |
response = {"lines": lines, "buffer": buffer}
|
253 |
await websocket.send_json(response)
|