First speaker label now defaults to "0" instead of None
Browse files
src/diarization/diarization_online.py
CHANGED
@@ -57,9 +57,10 @@ def init_diart(SAMPLE_RATE):
|
|
57 |
l_speakers = []
|
58 |
annotation, audio = result
|
59 |
for speaker in annotation._labels:
|
60 |
-
|
|
|
61 |
asyncio.create_task(
|
62 |
-
l_speakers_queue.put({"speaker": speaker, "
|
63 |
)
|
64 |
|
65 |
l_speakers_queue = asyncio.Queue()
|
@@ -74,13 +75,36 @@ def init_diart(SAMPLE_RATE):
|
|
74 |
class DiartDiarization():
|
75 |
def __init__(self, SAMPLE_RATE):
|
76 |
self.inference, self.l_speakers_queue, self.ws_source = init_diart(SAMPLE_RATE)
|
|
|
77 |
|
78 |
-
async def
|
79 |
self.ws_source.push_audio(pcm_array)
|
80 |
-
|
81 |
while not self.l_speakers_queue.empty():
|
82 |
-
|
83 |
-
return speakers
|
84 |
|
85 |
def close(self):
|
86 |
self.ws_source.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
l_speakers = []
|
58 |
annotation, audio = result
|
59 |
for speaker in annotation._labels:
|
60 |
+
segments_beg = annotation._labels[speaker].segments_boundaries_[0]
|
61 |
+
segments_end = annotation._labels[speaker].segments_boundaries_[-1]
|
62 |
asyncio.create_task(
|
63 |
+
l_speakers_queue.put({"speaker": speaker, "beg": segments_beg, "end": segments_end})
|
64 |
)
|
65 |
|
66 |
l_speakers_queue = asyncio.Queue()
|
|
|
75 |
class DiartDiarization():
    """Streaming speaker diarization backed by diart.

    Wraps the inference pipeline, the speaker-segment queue and the
    websocket audio source produced by ``init_diart``.
    """

    def __init__(self, SAMPLE_RATE):
        self.inference, self.l_speakers_queue, self.ws_source = init_diart(SAMPLE_RATE)
        # Speaker segments drained from the queue by the most recent diarize() call.
        self.segment_speakers = []

    async def diarize(self, pcm_array):
        """Push a PCM chunk into diart and collect every speaker segment
        available so far into ``self.segment_speakers``.

        Each queued item is a dict {"speaker": label, "beg": float, "end": float}
        (see the producer in ``init_diart``).
        """
        self.ws_source.push_audio(pcm_array)
        # Discard segments from the previous call; only the latest snapshot matters.
        self.segment_speakers = []
        while not self.l_speakers_queue.empty():
            self.segment_speakers.append(await self.l_speakers_queue.get())

    def close(self):
        """Close the underlying websocket audio source."""
        self.ws_source.close()

    def assign_speakers_to_chunks(self, chunks):
        """
        Assign a speaker label to every transcription chunk that overlaps
        at least one diarization segment.

        For each chunk, the speaker whose segments overlap it for the
        longest total duration wins.  (The previous implementation let the
        *last* overlapping segment silently overwrite earlier ones, which
        contradicted the stated "most overlapping" intent.)

        Modifies ``chunks`` in place and returns the same list.  Chunks
        with no overlapping segment are left untouched; segments that
        merely touch a chunk boundary (zero-length overlap) do not count.
        """
        if not self.segment_speakers:
            return chunks

        for ch in chunks:
            # speaker label -> accumulated overlap duration with this chunk
            overlaps = {}
            for segment in self.segment_speakers:
                overlap = min(segment["end"], ch["end"]) - max(segment["beg"], ch["beg"])
                if overlap > 0:
                    speaker = segment["speaker"]
                    overlaps[speaker] = overlaps.get(speaker, 0.0) + overlap
            if overlaps:
                ch["speaker"] = max(overlaps, key=overlaps.get)

        return chunks
|
src/web/live_transcription.html
CHANGED
@@ -7,8 +7,8 @@
|
|
7 |
<style>
|
8 |
body {
|
9 |
font-family: 'Inter', sans-serif;
|
10 |
-
text-align: center;
|
11 |
margin: 20px;
|
|
|
12 |
}
|
13 |
#recordButton {
|
14 |
width: 80px;
|
@@ -28,18 +28,10 @@
|
|
28 |
#recordButton:active {
|
29 |
transform: scale(0.95);
|
30 |
}
|
31 |
-
#
|
32 |
margin-top: 20px;
|
33 |
-
font-size:
|
34 |
-
|
35 |
-
}
|
36 |
-
.transcription {
|
37 |
-
display: inline;
|
38 |
-
color: black;
|
39 |
-
}
|
40 |
-
.buffer {
|
41 |
-
display: inline;
|
42 |
-
color: rgb(197, 197, 197);
|
43 |
}
|
44 |
.settings-container {
|
45 |
display: flex;
|
@@ -73,9 +65,29 @@
|
|
73 |
label {
|
74 |
font-size: 14px;
|
75 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
</style>
|
77 |
</head>
|
78 |
<body>
|
|
|
79 |
<div class="settings-container">
|
80 |
<button id="recordButton">🎙️</button>
|
81 |
<div class="settings">
|
@@ -96,9 +108,11 @@
|
|
96 |
</div>
|
97 |
</div>
|
98 |
</div>
|
|
|
99 |
<p id="status"></p>
|
100 |
|
101 |
-
|
|
|
102 |
|
103 |
<script>
|
104 |
let isRecording = false;
|
@@ -106,89 +120,97 @@
|
|
106 |
let recorder = null;
|
107 |
let chunkDuration = 1000;
|
108 |
let websocketUrl = "ws://localhost:8000/asr";
|
109 |
-
|
110 |
-
// Tracks whether the user voluntarily closed the WebSocket
|
111 |
let userClosing = false;
|
112 |
|
113 |
const statusText = document.getElementById("status");
|
114 |
const recordButton = document.getElementById("recordButton");
|
115 |
const chunkSelector = document.getElementById("chunkSelector");
|
116 |
const websocketInput = document.getElementById("websocketInput");
|
117 |
-
const
|
118 |
|
119 |
-
let fullTranscription = ""; // Store confirmed transcription
|
120 |
-
|
121 |
-
// Update chunk duration based on the selector
|
122 |
chunkSelector.addEventListener("change", () => {
|
123 |
chunkDuration = parseInt(chunkSelector.value);
|
124 |
});
|
125 |
|
126 |
-
// Update WebSocket URL dynamically, with some basic checks
|
127 |
websocketInput.addEventListener("change", () => {
|
128 |
const urlValue = websocketInput.value.trim();
|
129 |
-
|
130 |
-
// Quick check to see if it starts with ws:// or wss://
|
131 |
if (!urlValue.startsWith("ws://") && !urlValue.startsWith("wss://")) {
|
132 |
-
statusText.textContent =
|
133 |
-
"Invalid WebSocket URL. It should start with ws:// or wss://";
|
134 |
return;
|
135 |
}
|
136 |
websocketUrl = urlValue;
|
137 |
statusText.textContent = "WebSocket URL updated. Ready to connect.";
|
138 |
});
|
139 |
|
140 |
-
/**
|
141 |
-
* Opens webSocket connection.
|
142 |
-
* returns a Promise that resolves when the connection is open.
|
143 |
-
* rejects if there was an error.
|
144 |
-
*/
|
145 |
function setupWebSocket() {
|
146 |
return new Promise((resolve, reject) => {
|
147 |
try {
|
148 |
websocket = new WebSocket(websocketUrl);
|
149 |
} catch (error) {
|
150 |
-
statusText.textContent =
|
151 |
-
"Invalid WebSocket URL. Please check the URL and try again.";
|
152 |
reject(error);
|
153 |
return;
|
154 |
}
|
155 |
|
156 |
websocket.onopen = () => {
|
157 |
-
statusText.textContent = "Connected to server";
|
158 |
resolve();
|
159 |
};
|
160 |
|
161 |
-
websocket.onclose = (
|
162 |
-
// If we manually closed it, we say so
|
163 |
if (userClosing) {
|
164 |
statusText.textContent = "WebSocket closed by user.";
|
165 |
} else {
|
166 |
-
statusText.textContent =
|
|
|
167 |
}
|
168 |
userClosing = false;
|
169 |
};
|
170 |
|
171 |
websocket.onerror = () => {
|
172 |
-
statusText.textContent = "Error connecting to WebSocket";
|
173 |
reject(new Error("Error connecting to WebSocket"));
|
174 |
};
|
175 |
|
|
|
176 |
websocket.onmessage = (event) => {
|
177 |
const data = JSON.parse(event.data);
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
|
|
|
|
|
|
188 |
};
|
189 |
});
|
190 |
}
|
191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
async function startRecording() {
|
193 |
try {
|
194 |
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
@@ -202,22 +224,18 @@
|
|
202 |
isRecording = true;
|
203 |
updateUI();
|
204 |
} catch (err) {
|
205 |
-
statusText.textContent =
|
206 |
-
"Error accessing microphone. Please allow microphone access.";
|
207 |
}
|
208 |
}
|
209 |
|
210 |
function stopRecording() {
|
211 |
userClosing = true;
|
212 |
-
|
213 |
-
// Stop the recorder if it exists
|
214 |
if (recorder) {
|
215 |
recorder.stop();
|
216 |
recorder = null;
|
217 |
}
|
218 |
isRecording = false;
|
219 |
|
220 |
-
// Close the websocket if it exists
|
221 |
if (websocket) {
|
222 |
websocket.close();
|
223 |
websocket = null;
|
@@ -228,15 +246,12 @@
|
|
228 |
|
229 |
async function toggleRecording() {
|
230 |
if (!isRecording) {
|
231 |
-
|
232 |
-
transcriptionsDiv.innerHTML = "";
|
233 |
-
|
234 |
try {
|
235 |
await setupWebSocket();
|
236 |
await startRecording();
|
237 |
} catch (err) {
|
238 |
-
statusText.textContent =
|
239 |
-
"Could not connect to WebSocket or access mic. Recording aborted.";
|
240 |
}
|
241 |
} else {
|
242 |
stopRecording();
|
@@ -245,9 +260,7 @@
|
|
245 |
|
246 |
function updateUI() {
|
247 |
recordButton.classList.toggle("recording", isRecording);
|
248 |
-
statusText.textContent = isRecording
|
249 |
-
? "Recording..."
|
250 |
-
: "Click to start transcription";
|
251 |
}
|
252 |
|
253 |
recordButton.addEventListener("click", toggleRecording);
|
|
|
7 |
<style>
|
8 |
body {
|
9 |
font-family: 'Inter', sans-serif;
|
|
|
10 |
margin: 20px;
|
11 |
+
text-align: center;
|
12 |
}
|
13 |
#recordButton {
|
14 |
width: 80px;
|
|
|
28 |
#recordButton:active {
|
29 |
transform: scale(0.95);
|
30 |
}
|
31 |
+
#status {
|
32 |
margin-top: 20px;
|
33 |
+
font-size: 16px;
|
34 |
+
color: #333;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
}
|
36 |
.settings-container {
|
37 |
display: flex;
|
|
|
65 |
label {
|
66 |
font-size: 14px;
|
67 |
}
|
68 |
+
/* Speaker-labeled transcript area */
|
69 |
+
#linesTranscript {
|
70 |
+
margin: 20px auto;
|
71 |
+
max-width: 600px;
|
72 |
+
text-align: left;
|
73 |
+
font-size: 16px;
|
74 |
+
}
|
75 |
+
#linesTranscript p {
|
76 |
+
margin: 5px 0;
|
77 |
+
}
|
78 |
+
#linesTranscript strong {
|
79 |
+
color: #333;
|
80 |
+
}
|
81 |
+
/* Grey buffer styling */
|
82 |
+
.buffer {
|
83 |
+
color: rgb(180, 180, 180);
|
84 |
+
font-style: italic;
|
85 |
+
margin-left: 4px;
|
86 |
+
}
|
87 |
</style>
|
88 |
</head>
|
89 |
<body>
|
90 |
+
|
91 |
<div class="settings-container">
|
92 |
<button id="recordButton">🎙️</button>
|
93 |
<div class="settings">
|
|
|
108 |
</div>
|
109 |
</div>
|
110 |
</div>
|
111 |
+
|
112 |
<p id="status"></p>
|
113 |
|
114 |
+
<!-- Speaker-labeled transcript -->
|
115 |
+
<div id="linesTranscript"></div>
|
116 |
|
117 |
<script>
|
118 |
let isRecording = false;
|
|
|
120 |
let recorder = null;
|
121 |
let chunkDuration = 1000;
|
122 |
let websocketUrl = "ws://localhost:8000/asr";
|
|
|
|
|
123 |
let userClosing = false;
|
124 |
|
125 |
const statusText = document.getElementById("status");
|
126 |
const recordButton = document.getElementById("recordButton");
|
127 |
const chunkSelector = document.getElementById("chunkSelector");
|
128 |
const websocketInput = document.getElementById("websocketInput");
|
129 |
+
const linesTranscriptDiv = document.getElementById("linesTranscript");
|
130 |
|
|
|
|
|
|
|
131 |
chunkSelector.addEventListener("change", () => {
|
132 |
chunkDuration = parseInt(chunkSelector.value);
|
133 |
});
|
134 |
|
|
|
135 |
websocketInput.addEventListener("change", () => {
|
136 |
const urlValue = websocketInput.value.trim();
|
|
|
|
|
137 |
if (!urlValue.startsWith("ws://") && !urlValue.startsWith("wss://")) {
|
138 |
+
statusText.textContent = "Invalid WebSocket URL (must start with ws:// or wss://)";
|
|
|
139 |
return;
|
140 |
}
|
141 |
websocketUrl = urlValue;
|
142 |
statusText.textContent = "WebSocket URL updated. Ready to connect.";
|
143 |
});
|
144 |
|
|
|
|
|
|
|
|
|
|
|
145 |
/**
 * Open a WebSocket connection to `websocketUrl`.
 * Resolves once the socket is open; rejects on a construction error
 * or a connection error. Incoming messages are rendered via
 * renderLinesWithBuffer.
 */
function setupWebSocket() {
  return new Promise((resolve, reject) => {
    let ws;
    try {
      ws = new WebSocket(websocketUrl);
    } catch (err) {
      statusText.textContent = "Invalid WebSocket URL. Please check and try again.";
      reject(err);
      return;
    }
    websocket = ws;

    ws.onopen = () => {
      statusText.textContent = "Connected to server.";
      resolve();
    };

    ws.onclose = () => {
      // Distinguish a user-initiated close from an unexpected drop.
      statusText.textContent = userClosing
        ? "WebSocket closed by user."
        : "Disconnected from the WebSocket server. (Check logs if model is loading.)";
      userClosing = false;
    };

    ws.onerror = () => {
      statusText.textContent = "Error connecting to WebSocket.";
      reject(new Error("Error connecting to WebSocket"));
    };

    // Handle messages from server
    ws.onmessage = (event) => {
      const data = JSON.parse(event.data);
      /*
      The server might send:
      {
        "lines": [
          {"speaker": 0, "text": "Hello."},
          {"speaker": 1, "text": "Bonjour."},
          ...
        ],
        "buffer": "..."
      }
      */
      const { lines = [], buffer = "" } = data;
      renderLinesWithBuffer(lines, buffer);
    };
  });
}
|
194 |
|
195 |
+
/**
 * Render the speaker-labelled transcript lines into #linesTranscript,
 * appending the (grey, still-unconfirmed) buffer to the last line.
 *
 * Security fix: `item.text`, `item.speaker` and `buffer` come from the
 * server and were previously interpolated straight into innerHTML, so a
 * transcription containing markup (e.g. "<img onerror=...>") would be
 * injected into the page (XSS). All three are now HTML-escaped.
 */
function renderLinesWithBuffer(lines, buffer) {
  // Minimal HTML escaper for untrusted text inserted via innerHTML.
  const escapeHtml = (s) =>
    String(s)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;");

  // Clears if no lines
  if (!Array.isArray(lines) || lines.length === 0) {
    linesTranscriptDiv.innerHTML = "";
    return;
  }

  // Build the HTML.
  // The buffer is appended to the last line if it's non-empty.
  const linesHtml = lines.map((item, idx) => {
    let textContent = escapeHtml(item.text);
    if (idx === lines.length - 1 && buffer) {
      textContent += `<span class="buffer">${escapeHtml(buffer)}</span>`;
    }
    return `<p><strong>Speaker ${escapeHtml(item.speaker)}:</strong> ${textContent}</p>`;
  }).join("");

  linesTranscriptDiv.innerHTML = linesHtml;
}
|
213 |
+
|
214 |
async function startRecording() {
|
215 |
try {
|
216 |
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
|
|
224 |
isRecording = true;
|
225 |
updateUI();
|
226 |
} catch (err) {
|
227 |
+
statusText.textContent = "Error accessing microphone. Please allow microphone access.";
|
|
|
228 |
}
|
229 |
}
|
230 |
|
231 |
function stopRecording() {
|
232 |
userClosing = true;
|
|
|
|
|
233 |
if (recorder) {
|
234 |
recorder.stop();
|
235 |
recorder = null;
|
236 |
}
|
237 |
isRecording = false;
|
238 |
|
|
|
239 |
if (websocket) {
|
240 |
websocket.close();
|
241 |
websocket = null;
|
|
|
246 |
|
247 |
async function toggleRecording() {
|
248 |
if (!isRecording) {
|
249 |
+
linesTranscriptDiv.innerHTML = "";
|
|
|
|
|
250 |
try {
|
251 |
await setupWebSocket();
|
252 |
await startRecording();
|
253 |
} catch (err) {
|
254 |
+
statusText.textContent = "Could not connect to WebSocket or access mic. Aborted.";
|
|
|
255 |
}
|
256 |
} else {
|
257 |
stopRecording();
|
|
|
260 |
|
261 |
function updateUI() {
|
262 |
recordButton.classList.toggle("recording", isRecording);
|
263 |
+
statusText.textContent = isRecording ? "Recording..." : "Click to start transcription";
|
|
|
|
|
264 |
}
|
265 |
|
266 |
recordButton.addEventListener("click", toggleRecording);
|
whisper_fastapi_online_server.py
CHANGED
@@ -90,6 +90,7 @@ async def start_ffmpeg_decoder():
|
|
90 |
return process
|
91 |
|
92 |
|
|
|
93 |
@app.websocket("/asr")
|
94 |
async def websocket_endpoint(websocket: WebSocket):
|
95 |
await websocket.accept()
|
@@ -110,6 +111,9 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
110 |
loop = asyncio.get_event_loop()
|
111 |
full_transcription = ""
|
112 |
beg = time()
|
|
|
|
|
|
|
113 |
while True:
|
114 |
try:
|
115 |
elapsed_time = int(time() - beg)
|
@@ -137,8 +141,17 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
137 |
)
|
138 |
pcm_buffer = bytearray()
|
139 |
online.insert_audio_chunk(pcm_array)
|
140 |
-
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
if args.vac:
|
143 |
buffer = online.online.to_flush(
|
144 |
online.online.transcript_buffer.buffer
|
@@ -151,11 +164,30 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
151 |
buffer in full_transcription
|
152 |
): # With VAC, the buffer is not updated until the next chunk is processed
|
153 |
buffer = ""
|
154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
if args.diarization:
|
156 |
-
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
await websocket.send_json(response)
|
160 |
|
161 |
except Exception as e:
|
|
|
90 |
return process
|
91 |
|
92 |
|
93 |
+
|
94 |
@app.websocket("/asr")
|
95 |
async def websocket_endpoint(websocket: WebSocket):
|
96 |
await websocket.accept()
|
|
|
111 |
loop = asyncio.get_event_loop()
|
112 |
full_transcription = ""
|
113 |
beg = time()
|
114 |
+
|
115 |
+
chunk_history = [] # Will store dicts: {beg, end, text, speaker}
|
116 |
+
|
117 |
while True:
|
118 |
try:
|
119 |
elapsed_time = int(time() - beg)
|
|
|
141 |
)
|
142 |
pcm_buffer = bytearray()
|
143 |
online.insert_audio_chunk(pcm_array)
|
144 |
+
beg_trans, end_trans, trans = online.process_iter()
|
145 |
+
|
146 |
+
if trans:
|
147 |
+
chunk_history.append({
|
148 |
+
"beg": beg_trans,
|
149 |
+
"end": end_trans,
|
150 |
+
"text": trans,
|
151 |
+
"speaker": "0"
|
152 |
+
})
|
153 |
+
|
154 |
+
full_transcription += trans
|
155 |
if args.vac:
|
156 |
buffer = online.online.to_flush(
|
157 |
online.online.transcript_buffer.buffer
|
|
|
164 |
buffer in full_transcription
|
165 |
): # With VAC, the buffer is not updated until the next chunk is processed
|
166 |
buffer = ""
|
167 |
+
|
168 |
+
lines = [
|
169 |
+
{
|
170 |
+
"speaker": "0",
|
171 |
+
"text": "",
|
172 |
+
}
|
173 |
+
]
|
174 |
+
|
175 |
if args.diarization:
|
176 |
+
await diarization.diarize(pcm_array)
|
177 |
+
diarization.assign_speakers_to_chunks(chunk_history)
|
178 |
+
|
179 |
+
for ch in chunk_history:
|
180 |
+
if args.diarization and ch["speaker"] and ch["speaker"][-1] != lines[-1]["speaker"]:
|
181 |
+
lines.append(
|
182 |
+
{
|
183 |
+
"speaker": ch["speaker"][-1],
|
184 |
+
"text": ch['text'],
|
185 |
+
}
|
186 |
+
)
|
187 |
+
else:
|
188 |
+
lines[-1]["text"] += ch['text']
|
189 |
+
|
190 |
+
response = {"lines": lines, "buffer": buffer}
|
191 |
await websocket.send_json(response)
|
192 |
|
193 |
except Exception as e:
|