Merge pull request #52 from QuentinFuxa/diart_integration_improvements
Browse files
- src/web/live_transcription.html +68 -12
- whisper_fastapi_online_server.py +16 -15
src/web/live_transcription.html
CHANGED
@@ -78,12 +78,59 @@
|
|
78 |
#linesTranscript strong {
|
79 |
color: #333;
|
80 |
}
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
.buffer {
|
83 |
color: rgb(180, 180, 180);
|
84 |
font-style: italic;
|
85 |
margin-left: 4px;
|
86 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
</style>
|
88 |
</head>
|
89 |
<body>
|
@@ -188,7 +235,7 @@
|
|
188 |
}
|
189 |
*/
|
190 |
const { lines = [], buffer = "" } = data;
|
191 |
-
renderLinesWithBuffer(lines, buffer);
|
192 |
};
|
193 |
});
|
194 |
}
|
@@ -199,27 +246,36 @@
|
|
199 |
linesTranscriptDiv.innerHTML = "";
|
200 |
return;
|
201 |
}
|
|
|
|
|
|
|
202 |
const linesHtml = lines.map((item, idx) => {
|
|
|
|
|
|
|
|
|
|
|
203 |
let speakerLabel = "";
|
204 |
if (item.speaker === -2) {
|
205 |
-
speakerLabel = "
|
|
|
|
|
|
|
|
|
206 |
} else if (item.speaker !== -1) {
|
207 |
-
speakerLabel =
|
208 |
-
}
|
209 |
|
210 |
-
|
211 |
-
if (item.beg !== undefined && item.end !== undefined) {
|
212 |
-
timeInfo = ` [${item.beg}, ${item.end}]`;
|
213 |
-
}
|
214 |
|
215 |
let textContent = item.text;
|
216 |
if (idx === lines.length - 1 && buffer) {
|
217 |
textContent += `<span class="buffer">${buffer}</span>`;
|
218 |
}
|
219 |
|
220 |
-
return
|
221 |
-
? `<p
|
222 |
-
: `<p>${
|
223 |
}).join("");
|
224 |
|
225 |
linesTranscriptDiv.innerHTML = linesHtml;
|
|
|
78 |
#linesTranscript strong {
|
79 |
color: #333;
|
80 |
}
|
81 |
+
#speaker {
|
82 |
+
background-color: #dcefff;
|
83 |
+
border-radius: 30px;
|
84 |
+
padding: 2px 10px;
|
85 |
+
font-size: 14px;
|
86 |
+
}
|
87 |
+
#timeInfo {
|
88 |
+
color: #666;
|
89 |
+
margin-left: 10px;
|
90 |
+
}
|
91 |
+
.textcontent {
|
92 |
+
font-size: 16px;
|
93 |
+
margin-left: 10px;
|
94 |
+
padding-left: 10px;
|
95 |
+
border-left: 2px solid #dcefff;
|
96 |
+
margin-bottom: 10px;
|
97 |
+
}
|
98 |
.buffer {
|
99 |
color: rgb(180, 180, 180);
|
100 |
font-style: italic;
|
101 |
margin-left: 4px;
|
102 |
}
|
103 |
+
.spinner {
|
104 |
+
display: inline-block;
|
105 |
+
width: 8px;
|
106 |
+
height: 8px;
|
107 |
+
border: 2px solid rgba(0, 0, 0, 0.2);
|
108 |
+
border-top: 2px solid #333;
|
109 |
+
border-radius: 50%;
|
110 |
+
animation: spin 0.6s linear infinite;
|
111 |
+
vertical-align: middle;
|
112 |
+
margin-bottom: 2px;
|
113 |
+
}
|
114 |
+
|
115 |
+
@keyframes spin {
|
116 |
+
to {
|
117 |
+
transform: rotate(360deg);
|
118 |
+
}
|
119 |
+
}
|
120 |
+
.silence {
|
121 |
+
color: #666;
|
122 |
+
background-color: #f3f3f3;
|
123 |
+
font-size: 13px;
|
124 |
+
border-radius: 30px;
|
125 |
+
padding: 2px 10px;
|
126 |
+
}
|
127 |
+
.loading {
|
128 |
+
color: #666;
|
129 |
+
background-color: #eff9ff;
|
130 |
+
font-size: 14px;
|
131 |
+
border-radius: 30px;
|
132 |
+
padding: 2px 10px;
|
133 |
+
}
|
134 |
</style>
|
135 |
</head>
|
136 |
<body>
|
|
|
235 |
}
|
236 |
*/
|
237 |
const { lines = [], buffer = "" } = data;
|
238 |
+
renderLinesWithBuffer( lines, buffer);
|
239 |
};
|
240 |
});
|
241 |
}
|
|
|
246 |
linesTranscriptDiv.innerHTML = "";
|
247 |
return;
|
248 |
}
|
249 |
+
|
250 |
+
|
251 |
+
|
252 |
const linesHtml = lines.map((item, idx) => {
|
253 |
+
let timeInfo = "";
|
254 |
+
if (item.beg !== undefined && item.end !== undefined) {
|
255 |
+
timeInfo = ` ${item.beg} - ${item.end}`;
|
256 |
+
}
|
257 |
+
|
258 |
let speakerLabel = "";
|
259 |
if (item.speaker === -2) {
|
260 |
+
speakerLabel = `<span class="silence">Silence<span id='timeInfo'>${timeInfo}</span></span>`;
|
261 |
+
} else if (item.speaker == -1) {
|
262 |
+
speakerLabel = `<span class='loading'> <span class="spinner"></span><span id='timeInfo'>${item.diff} second(s) of audio are undergoing diarization</span></span>`;
|
263 |
+
} else if (item.speaker == -3) {
|
264 |
+
speakerLabel = `<span id="speaker"><span id='timeInfo'>${timeInfo}</span>`;
|
265 |
} else if (item.speaker !== -1) {
|
266 |
+
speakerLabel = `<span id="speaker">Speaker ${item.speaker}<span id='timeInfo'>${timeInfo}</span></span>`;
|
267 |
+
}
|
268 |
|
269 |
+
|
|
|
|
|
|
|
270 |
|
271 |
let textContent = item.text;
|
272 |
if (idx === lines.length - 1 && buffer) {
|
273 |
textContent += `<span class="buffer">${buffer}</span>`;
|
274 |
}
|
275 |
|
276 |
+
return textContent
|
277 |
+
? `<p>${speakerLabel}<br/><div class='textcontent'>${textContent}</div></p>`
|
278 |
+
: `<p >${speakerLabel}<br/></p>`;
|
279 |
}).join("");
|
280 |
|
281 |
linesTranscriptDiv.innerHTML = linesHtml;
|
whisper_fastapi_online_server.py
CHANGED
@@ -214,10 +214,10 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
214 |
else:
|
215 |
chunk_history.append({
|
216 |
"beg": time() - beg_loop,
|
217 |
-
"end": time() - beg_loop +
|
218 |
"text": '',
|
219 |
})
|
220 |
-
sleep(
|
221 |
buffer = ''
|
222 |
|
223 |
if args.diarization:
|
@@ -225,28 +225,29 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
225 |
diarization.assign_speakers_to_chunks(chunk_history)
|
226 |
|
227 |
|
228 |
-
current_speaker =
|
229 |
-
lines = [
|
230 |
-
|
231 |
-
|
232 |
-
"speaker"
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
if args.diarization and ch["speaker"] and ch["speaker"] != current_speaker:
|
237 |
-
new_speaker = ch["speaker"]
|
238 |
lines.append(
|
239 |
{
|
240 |
-
"speaker":
|
241 |
"text": ch['text'],
|
242 |
"beg": format_time(ch['beg']),
|
243 |
"end": format_time(ch['end']),
|
|
|
244 |
}
|
245 |
)
|
246 |
-
current_speaker =
|
247 |
-
|
248 |
lines[-1]["text"] += ch['text']
|
249 |
lines[-1]["end"] = format_time(ch['end'])
|
|
|
|
|
250 |
|
251 |
response = {"lines": lines, "buffer": buffer}
|
252 |
await websocket.send_json(response)
|
|
|
214 |
else:
|
215 |
chunk_history.append({
|
216 |
"beg": time() - beg_loop,
|
217 |
+
"end": time() - beg_loop + 1,
|
218 |
"text": '',
|
219 |
})
|
220 |
+
sleep(1)
|
221 |
buffer = ''
|
222 |
|
223 |
if args.diarization:
|
|
|
225 |
diarization.assign_speakers_to_chunks(chunk_history)
|
226 |
|
227 |
|
228 |
+
current_speaker = 0
|
229 |
+
lines = []
|
230 |
+
last_end_diarized = 0
|
231 |
+
for ind, ch in enumerate(chunk_history):
|
232 |
+
speaker = ch.get("speaker", -3)
|
233 |
+
if speaker == -1 and ind < len(chunk_history) - 1:
|
234 |
+
continue
|
235 |
+
elif speaker != current_speaker:
|
|
|
|
|
236 |
lines.append(
|
237 |
{
|
238 |
+
"speaker": speaker,
|
239 |
"text": ch['text'],
|
240 |
"beg": format_time(ch['beg']),
|
241 |
"end": format_time(ch['end']),
|
242 |
+
"diff": round(ch['end'] - last_end_diarized, 2)
|
243 |
}
|
244 |
)
|
245 |
+
current_speaker = speaker
|
246 |
+
elif speaker != -1:
|
247 |
lines[-1]["text"] += ch['text']
|
248 |
lines[-1]["end"] = format_time(ch['end'])
|
249 |
+
if speaker != -1:
|
250 |
+
last_end_diarized = max(ch['end'], last_end_diarized)
|
251 |
|
252 |
response = {"lines": lines, "buffer": buffer}
|
253 |
await websocket.send_json(response)
|