qfuxa committed on
Commit
51356bb
·
2 Parent(s): 0a18df6 45fcc03

Merge pull request #52 from QuentinFuxa/diart_integration_improvements

Browse files
src/web/live_transcription.html CHANGED
@@ -78,12 +78,59 @@
78
  #linesTranscript strong {
79
  color: #333;
80
  }
81
- /* Grey buffer styling */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  .buffer {
83
  color: rgb(180, 180, 180);
84
  font-style: italic;
85
  margin-left: 4px;
86
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  </style>
88
  </head>
89
  <body>
@@ -188,7 +235,7 @@
188
  }
189
  */
190
  const { lines = [], buffer = "" } = data;
191
- renderLinesWithBuffer(lines, buffer);
192
  };
193
  });
194
  }
@@ -199,27 +246,36 @@
199
  linesTranscriptDiv.innerHTML = "";
200
  return;
201
  }
 
 
 
202
  const linesHtml = lines.map((item, idx) => {
 
 
 
 
 
203
  let speakerLabel = "";
204
  if (item.speaker === -2) {
205
- speakerLabel = "No speaker";
 
 
 
 
206
  } else if (item.speaker !== -1) {
207
- speakerLabel = `Speaker ${item.speaker}`;
208
- }
209
 
210
- let timeInfo = "";
211
- if (item.beg !== undefined && item.end !== undefined) {
212
- timeInfo = ` [${item.beg}, ${item.end}]`;
213
- }
214
 
215
  let textContent = item.text;
216
  if (idx === lines.length - 1 && buffer) {
217
  textContent += `<span class="buffer">${buffer}</span>`;
218
  }
219
 
220
- return speakerLabel
221
- ? `<p><strong>${speakerLabel}${timeInfo}</strong> ${textContent}</p>`
222
- : `<p>${textContent}</p>`;
223
  }).join("");
224
 
225
  linesTranscriptDiv.innerHTML = linesHtml;
 
78
  #linesTranscript strong {
79
  color: #333;
80
  }
81
+ #speaker {
82
+ background-color: #dcefff;
83
+ border-radius: 30px;
84
+ padding: 2px 10px;
85
+ font-size: 14px;
86
+ }
87
+ #timeInfo {
88
+ color: #666;
89
+ margin-left: 10px;
90
+ }
91
+ .textcontent {
92
+ font-size: 16px;
93
+ margin-left: 10px;
94
+ padding-left: 10px;
95
+ border-left: 2px solid #dcefff;
96
+ margin-bottom: 10px;
97
+ }
98
  .buffer {
99
  color: rgb(180, 180, 180);
100
  font-style: italic;
101
  margin-left: 4px;
102
  }
103
+ .spinner {
104
+ display: inline-block;
105
+ width: 8px;
106
+ height: 8px;
107
+ border: 2px solid rgba(0, 0, 0, 0.2);
108
+ border-top: 2px solid #333;
109
+ border-radius: 50%;
110
+ animation: spin 0.6s linear infinite;
111
+ vertical-align: middle;
112
+ margin-bottom: 2px;
113
+ }
114
+
115
+ @keyframes spin {
116
+ to {
117
+ transform: rotate(360deg);
118
+ }
119
+ }
120
+ .silence {
121
+ color: #666;
122
+ background-color: #f3f3f3;
123
+ font-size: 13px;
124
+ border-radius: 30px;
125
+ padding: 2px 10px;
126
+ }
127
+ .loading {
128
+ color: #666;
129
+ background-color: #eff9ff;
130
+ font-size: 14px;
131
+ border-radius: 30px;
132
+ padding: 2px 10px;
133
+ }
134
  </style>
135
  </head>
136
  <body>
 
235
  }
236
  */
237
  const { lines = [], buffer = "" } = data;
238
+ renderLinesWithBuffer( lines, buffer);
239
  };
240
  });
241
  }
 
246
  linesTranscriptDiv.innerHTML = "";
247
  return;
248
  }
249
+
250
+
251
+
252
  const linesHtml = lines.map((item, idx) => {
253
+ let timeInfo = "";
254
+ if (item.beg !== undefined && item.end !== undefined) {
255
+ timeInfo = ` ${item.beg} - ${item.end}`;
256
+ }
257
+
258
  let speakerLabel = "";
259
  if (item.speaker === -2) {
260
+ speakerLabel = `<span class="silence">Silence<span id='timeInfo'>${timeInfo}</span></span>`;
261
+ } else if (item.speaker == -1) {
262
+ speakerLabel = `<span class='loading'> <span class="spinner"></span><span id='timeInfo'>${item.diff} second(s) of audio are undergoing diarization</span></span>`;
263
+ } else if (item.speaker == -3) {
264
+ speakerLabel = `<span id="speaker"><span id='timeInfo'>${timeInfo}</span>`;
265
  } else if (item.speaker !== -1) {
266
+ speakerLabel = `<span id="speaker">Speaker ${item.speaker}<span id='timeInfo'>${timeInfo}</span></span>`;
267
+ }
268
 
269
+
 
 
 
270
 
271
  let textContent = item.text;
272
  if (idx === lines.length - 1 && buffer) {
273
  textContent += `<span class="buffer">${buffer}</span>`;
274
  }
275
 
276
+ return textContent
277
+ ? `<p>${speakerLabel}<br/><div class='textcontent'>${textContent}</div></p>`
278
+ : `<p >${speakerLabel}<br/></p>`;
279
  }).join("");
280
 
281
  linesTranscriptDiv.innerHTML = linesHtml;
whisper_fastapi_online_server.py CHANGED
@@ -214,10 +214,10 @@ async def websocket_endpoint(websocket: WebSocket):
214
  else:
215
  chunk_history.append({
216
  "beg": time() - beg_loop,
217
- "end": time() - beg_loop + 0.1,
218
  "text": '',
219
  })
220
- sleep(0.1)
221
  buffer = ''
222
 
223
  if args.diarization:
@@ -225,28 +225,29 @@ async def websocket_endpoint(websocket: WebSocket):
225
  diarization.assign_speakers_to_chunks(chunk_history)
226
 
227
 
228
- current_speaker = -1
229
- lines = [{
230
- "beg": 0,
231
- "end": 0,
232
- "speaker": current_speaker,
233
- "text": ""
234
- }]
235
- for ch in chunk_history:
236
- if args.diarization and ch["speaker"] and ch["speaker"] != current_speaker:
237
- new_speaker = ch["speaker"]
238
  lines.append(
239
  {
240
- "speaker": new_speaker,
241
  "text": ch['text'],
242
  "beg": format_time(ch['beg']),
243
  "end": format_time(ch['end']),
 
244
  }
245
  )
246
- current_speaker = new_speaker
247
- else:
248
  lines[-1]["text"] += ch['text']
249
  lines[-1]["end"] = format_time(ch['end'])
 
 
250
 
251
  response = {"lines": lines, "buffer": buffer}
252
  await websocket.send_json(response)
 
214
  else:
215
  chunk_history.append({
216
  "beg": time() - beg_loop,
217
+ "end": time() - beg_loop + 1,
218
  "text": '',
219
  })
220
+ sleep(1)
221
  buffer = ''
222
 
223
  if args.diarization:
 
225
  diarization.assign_speakers_to_chunks(chunk_history)
226
 
227
 
228
+ current_speaker = 0
229
+ lines = []
230
+ last_end_diarized = 0
231
+ for ind, ch in enumerate(chunk_history):
232
+ speaker = ch.get("speaker", -3)
233
+ if speaker == -1 and ind < len(chunk_history) - 1:
234
+ continue
235
+ elif speaker != current_speaker:
 
 
236
  lines.append(
237
  {
238
+ "speaker": speaker,
239
  "text": ch['text'],
240
  "beg": format_time(ch['beg']),
241
  "end": format_time(ch['end']),
242
+ "diff": round(ch['end'] - last_end_diarized, 2)
243
  }
244
  )
245
+ current_speaker = speaker
246
+ elif speaker != -1:
247
  lines[-1]["text"] += ch['text']
248
  lines[-1]["end"] = format_time(ch['end'])
249
+ if speaker != -1:
250
+ last_end_diarized = max(ch['end'], last_end_diarized)
251
 
252
  response = {"lines": lines, "buffer": buffer}
253
  await websocket.send_json(response)