qfuxa committed on
Commit
aafc196
·
1 Parent(s): 6a98078

first speaker is "0" no more None

Browse files
src/diarization/diarization_online.py CHANGED
@@ -57,9 +57,10 @@ def init_diart(SAMPLE_RATE):
57
  l_speakers = []
58
  annotation, audio = result
59
  for speaker in annotation._labels:
60
- segment = annotation._labels[speaker].__str__()
 
61
  asyncio.create_task(
62
- l_speakers_queue.put({"speaker": speaker, "segment": segment})
63
  )
64
 
65
  l_speakers_queue = asyncio.Queue()
@@ -74,13 +75,36 @@ def init_diart(SAMPLE_RATE):
74
  class DiartDiarization():
75
  def __init__(self, SAMPLE_RATE):
76
  self.inference, self.l_speakers_queue, self.ws_source = init_diart(SAMPLE_RATE)
 
77
 
78
- async def get_speakers(self, pcm_array):
79
  self.ws_source.push_audio(pcm_array)
80
- speakers = []
81
  while not self.l_speakers_queue.empty():
82
- speakers.append(await self.l_speakers_queue.get())
83
- return speakers
84
 
85
  def close(self):
86
  self.ws_source.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  l_speakers = []
58
  annotation, audio = result
59
  for speaker in annotation._labels:
60
+ segments_beg = annotation._labels[speaker].segments_boundaries_[0]
61
+ segments_end = annotation._labels[speaker].segments_boundaries_[-1]
62
  asyncio.create_task(
63
+ l_speakers_queue.put({"speaker": speaker, "beg": segments_beg, "end": segments_end})
64
  )
65
 
66
  l_speakers_queue = asyncio.Queue()
 
75
  class DiartDiarization():
76
  def __init__(self, SAMPLE_RATE):
77
  self.inference, self.l_speakers_queue, self.ws_source = init_diart(SAMPLE_RATE)
78
+ self.segment_speakers = []
79
 
80
+ async def diarize(self, pcm_array):
81
  self.ws_source.push_audio(pcm_array)
82
+ self.segment_speakers = []
83
  while not self.l_speakers_queue.empty():
84
+ self.segment_speakers.append(await self.l_speakers_queue.get())
 
85
 
86
  def close(self):
87
  self.ws_source.close()
88
+
89
+
90
+ def assign_speakers_to_chunks(self, chunks):
91
+ """
92
+ Go through each chunk and see which speaker(s) overlap
93
+ that chunk's time range in the Diart annotation.
94
+ Then store the speaker label(s) (or choose the most overlapping).
95
+ This modifies `chunks` in-place or returns a new list with assigned speakers.
96
+ """
97
+ if not self.segment_speakers:
98
+ return chunks
99
+
100
+ for segment in self.segment_speakers:
101
+ seg_beg = segment["beg"]
102
+ seg_end = segment["end"]
103
+ speaker = segment["speaker"]
104
+ for ch in chunks:
105
+ if seg_end <= ch["beg"] or seg_beg >= ch["end"]:
106
+ continue
107
+ # We have overlap. Let's just pick the speaker (could be more precise in a more complex implementation)
108
+ ch["speaker"] = speaker
109
+
110
+ return chunks
src/web/live_transcription.html CHANGED
@@ -7,8 +7,8 @@
7
  <style>
8
  body {
9
  font-family: 'Inter', sans-serif;
10
- text-align: center;
11
  margin: 20px;
 
12
  }
13
  #recordButton {
14
  width: 80px;
@@ -28,18 +28,10 @@
28
  #recordButton:active {
29
  transform: scale(0.95);
30
  }
31
- #transcriptions {
32
  margin-top: 20px;
33
- font-size: 18px;
34
- text-align: left;
35
- }
36
- .transcription {
37
- display: inline;
38
- color: black;
39
- }
40
- .buffer {
41
- display: inline;
42
- color: rgb(197, 197, 197);
43
  }
44
  .settings-container {
45
  display: flex;
@@ -73,9 +65,29 @@
73
  label {
74
  font-size: 14px;
75
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  </style>
77
  </head>
78
  <body>
 
79
  <div class="settings-container">
80
  <button id="recordButton">🎙️</button>
81
  <div class="settings">
@@ -96,9 +108,11 @@
96
  </div>
97
  </div>
98
  </div>
 
99
  <p id="status"></p>
100
 
101
- <div id="transcriptions"></div>
 
102
 
103
  <script>
104
  let isRecording = false;
@@ -106,89 +120,97 @@
106
  let recorder = null;
107
  let chunkDuration = 1000;
108
  let websocketUrl = "ws://localhost:8000/asr";
109
-
110
- // Tracks whether the user voluntarily closed the WebSocket
111
  let userClosing = false;
112
 
113
  const statusText = document.getElementById("status");
114
  const recordButton = document.getElementById("recordButton");
115
  const chunkSelector = document.getElementById("chunkSelector");
116
  const websocketInput = document.getElementById("websocketInput");
117
- const transcriptionsDiv = document.getElementById("transcriptions");
118
 
119
- let fullTranscription = ""; // Store confirmed transcription
120
-
121
- // Update chunk duration based on the selector
122
  chunkSelector.addEventListener("change", () => {
123
  chunkDuration = parseInt(chunkSelector.value);
124
  });
125
 
126
- // Update WebSocket URL dynamically, with some basic checks
127
  websocketInput.addEventListener("change", () => {
128
  const urlValue = websocketInput.value.trim();
129
-
130
- // Quick check to see if it starts with ws:// or wss://
131
  if (!urlValue.startsWith("ws://") && !urlValue.startsWith("wss://")) {
132
- statusText.textContent =
133
- "Invalid WebSocket URL. It should start with ws:// or wss://";
134
  return;
135
  }
136
  websocketUrl = urlValue;
137
  statusText.textContent = "WebSocket URL updated. Ready to connect.";
138
  });
139
 
140
- /**
141
- * Opens webSocket connection.
142
- * returns a Promise that resolves when the connection is open.
143
- * rejects if there was an error.
144
- */
145
  function setupWebSocket() {
146
  return new Promise((resolve, reject) => {
147
  try {
148
  websocket = new WebSocket(websocketUrl);
149
  } catch (error) {
150
- statusText.textContent =
151
- "Invalid WebSocket URL. Please check the URL and try again.";
152
  reject(error);
153
  return;
154
  }
155
 
156
  websocket.onopen = () => {
157
- statusText.textContent = "Connected to server";
158
  resolve();
159
  };
160
 
161
- websocket.onclose = (event) => {
162
- // If we manually closed it, we say so
163
  if (userClosing) {
164
  statusText.textContent = "WebSocket closed by user.";
165
  } else {
166
- statusText.textContent = "Disconnected from the websocket server. If this is the first launch, the model may be downloading in the backend. Check the API logs for more information.";
 
167
  }
168
  userClosing = false;
169
  };
170
 
171
  websocket.onerror = () => {
172
- statusText.textContent = "Error connecting to WebSocket";
173
  reject(new Error("Error connecting to WebSocket"));
174
  };
175
 
 
176
  websocket.onmessage = (event) => {
177
  const data = JSON.parse(event.data);
178
- const { transcription, buffer } = data;
179
-
180
- // Update confirmed transcription
181
- fullTranscription += transcription;
182
-
183
- // Update the transcription display
184
- transcriptionsDiv.innerHTML = `
185
- <span class="transcription">${fullTranscription}</span>
186
- <span class="buffer">${buffer}</span>
187
- `;
 
 
 
188
  };
189
  });
190
  }
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  async function startRecording() {
193
  try {
194
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
@@ -202,22 +224,18 @@
202
  isRecording = true;
203
  updateUI();
204
  } catch (err) {
205
- statusText.textContent =
206
- "Error accessing microphone. Please allow microphone access.";
207
  }
208
  }
209
 
210
  function stopRecording() {
211
  userClosing = true;
212
-
213
- // Stop the recorder if it exists
214
  if (recorder) {
215
  recorder.stop();
216
  recorder = null;
217
  }
218
  isRecording = false;
219
 
220
- // Close the websocket if it exists
221
  if (websocket) {
222
  websocket.close();
223
  websocket = null;
@@ -228,15 +246,12 @@
228
 
229
  async function toggleRecording() {
230
  if (!isRecording) {
231
- fullTranscription = "";
232
- transcriptionsDiv.innerHTML = "";
233
-
234
  try {
235
  await setupWebSocket();
236
  await startRecording();
237
  } catch (err) {
238
- statusText.textContent =
239
- "Could not connect to WebSocket or access mic. Recording aborted.";
240
  }
241
  } else {
242
  stopRecording();
@@ -245,9 +260,7 @@
245
 
246
  function updateUI() {
247
  recordButton.classList.toggle("recording", isRecording);
248
- statusText.textContent = isRecording
249
- ? "Recording..."
250
- : "Click to start transcription";
251
  }
252
 
253
  recordButton.addEventListener("click", toggleRecording);
 
7
  <style>
8
  body {
9
  font-family: 'Inter', sans-serif;
 
10
  margin: 20px;
11
+ text-align: center;
12
  }
13
  #recordButton {
14
  width: 80px;
 
28
  #recordButton:active {
29
  transform: scale(0.95);
30
  }
31
+ #status {
32
  margin-top: 20px;
33
+ font-size: 16px;
34
+ color: #333;
 
 
 
 
 
 
 
 
35
  }
36
  .settings-container {
37
  display: flex;
 
65
  label {
66
  font-size: 14px;
67
  }
68
+ /* Speaker-labeled transcript area */
69
+ #linesTranscript {
70
+ margin: 20px auto;
71
+ max-width: 600px;
72
+ text-align: left;
73
+ font-size: 16px;
74
+ }
75
+ #linesTranscript p {
76
+ margin: 5px 0;
77
+ }
78
+ #linesTranscript strong {
79
+ color: #333;
80
+ }
81
+ /* Grey buffer styling */
82
+ .buffer {
83
+ color: rgb(180, 180, 180);
84
+ font-style: italic;
85
+ margin-left: 4px;
86
+ }
87
  </style>
88
  </head>
89
  <body>
90
+
91
  <div class="settings-container">
92
  <button id="recordButton">🎙️</button>
93
  <div class="settings">
 
108
  </div>
109
  </div>
110
  </div>
111
+
112
  <p id="status"></p>
113
 
114
+ <!-- Speaker-labeled transcript -->
115
+ <div id="linesTranscript"></div>
116
 
117
  <script>
118
  let isRecording = false;
 
120
  let recorder = null;
121
  let chunkDuration = 1000;
122
  let websocketUrl = "ws://localhost:8000/asr";
 
 
123
  let userClosing = false;
124
 
125
  const statusText = document.getElementById("status");
126
  const recordButton = document.getElementById("recordButton");
127
  const chunkSelector = document.getElementById("chunkSelector");
128
  const websocketInput = document.getElementById("websocketInput");
129
+ const linesTranscriptDiv = document.getElementById("linesTranscript");
130
 
 
 
 
131
  chunkSelector.addEventListener("change", () => {
132
  chunkDuration = parseInt(chunkSelector.value);
133
  });
134
 
 
135
  websocketInput.addEventListener("change", () => {
136
  const urlValue = websocketInput.value.trim();
 
 
137
  if (!urlValue.startsWith("ws://") && !urlValue.startsWith("wss://")) {
138
+ statusText.textContent = "Invalid WebSocket URL (must start with ws:// or wss://)";
 
139
  return;
140
  }
141
  websocketUrl = urlValue;
142
  statusText.textContent = "WebSocket URL updated. Ready to connect.";
143
  });
144
 
 
 
 
 
 
145
  function setupWebSocket() {
146
  return new Promise((resolve, reject) => {
147
  try {
148
  websocket = new WebSocket(websocketUrl);
149
  } catch (error) {
150
+ statusText.textContent = "Invalid WebSocket URL. Please check and try again.";
 
151
  reject(error);
152
  return;
153
  }
154
 
155
  websocket.onopen = () => {
156
+ statusText.textContent = "Connected to server.";
157
  resolve();
158
  };
159
 
160
+ websocket.onclose = () => {
 
161
  if (userClosing) {
162
  statusText.textContent = "WebSocket closed by user.";
163
  } else {
164
+ statusText.textContent =
165
+ "Disconnected from the WebSocket server. (Check logs if model is loading.)";
166
  }
167
  userClosing = false;
168
  };
169
 
170
  websocket.onerror = () => {
171
+ statusText.textContent = "Error connecting to WebSocket.";
172
  reject(new Error("Error connecting to WebSocket"));
173
  };
174
 
175
+ // Handle messages from server
176
  websocket.onmessage = (event) => {
177
  const data = JSON.parse(event.data);
178
+ /*
179
+ The server might send:
180
+ {
181
+ "lines": [
182
+ {"speaker": 0, "text": "Hello."},
183
+ {"speaker": 1, "text": "Bonjour."},
184
+ ...
185
+ ],
186
+ "buffer": "..."
187
+ }
188
+ */
189
+ const { lines = [], buffer = "" } = data;
190
+ renderLinesWithBuffer(lines, buffer);
191
  };
192
  });
193
  }
194
 
195
function renderLinesWithBuffer(lines, buffer) {
  // Escape server-supplied text before inserting it via innerHTML,
  // otherwise a transcription containing "<" or "&" is parsed as markup
  // (broken rendering, and an XSS vector if the server is untrusted).
  const escapeHtml = (text) =>
    String(text)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;");

  if (!Array.isArray(lines) || lines.length === 0) {
    // No confirmed lines yet: still show the pending buffer instead of
    // silently dropping it (the previous version cleared everything).
    linesTranscriptDiv.innerHTML = buffer
      ? `<p><span class="buffer">${escapeHtml(buffer)}</span></p>`
      : "";
    return;
  }

  // One <p> per confirmed line; the live (grey) buffer is appended to the
  // last line only.
  const linesHtml = lines
    .map((item, idx) => {
      let textContent = escapeHtml(item.text);
      if (idx === lines.length - 1 && buffer) {
        textContent += `<span class="buffer">${escapeHtml(buffer)}</span>`;
      }
      return `<p><strong>Speaker ${escapeHtml(item.speaker)}:</strong> ${textContent}</p>`;
    })
    .join("");

  linesTranscriptDiv.innerHTML = linesHtml;
}
213
+
214
  async function startRecording() {
215
  try {
216
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
 
224
  isRecording = true;
225
  updateUI();
226
  } catch (err) {
227
+ statusText.textContent = "Error accessing microphone. Please allow microphone access.";
 
228
  }
229
  }
230
 
231
  function stopRecording() {
232
  userClosing = true;
 
 
233
  if (recorder) {
234
  recorder.stop();
235
  recorder = null;
236
  }
237
  isRecording = false;
238
 
 
239
  if (websocket) {
240
  websocket.close();
241
  websocket = null;
 
246
 
247
  async function toggleRecording() {
248
  if (!isRecording) {
249
+ linesTranscriptDiv.innerHTML = "";
 
 
250
  try {
251
  await setupWebSocket();
252
  await startRecording();
253
  } catch (err) {
254
+ statusText.textContent = "Could not connect to WebSocket or access mic. Aborted.";
 
255
  }
256
  } else {
257
  stopRecording();
 
260
 
261
  function updateUI() {
262
  recordButton.classList.toggle("recording", isRecording);
263
+ statusText.textContent = isRecording ? "Recording..." : "Click to start transcription";
 
 
264
  }
265
 
266
  recordButton.addEventListener("click", toggleRecording);
whisper_fastapi_online_server.py CHANGED
@@ -90,6 +90,7 @@ async def start_ffmpeg_decoder():
90
  return process
91
 
92
 
 
93
  @app.websocket("/asr")
94
  async def websocket_endpoint(websocket: WebSocket):
95
  await websocket.accept()
@@ -110,6 +111,9 @@ async def websocket_endpoint(websocket: WebSocket):
110
  loop = asyncio.get_event_loop()
111
  full_transcription = ""
112
  beg = time()
 
 
 
113
  while True:
114
  try:
115
  elapsed_time = int(time() - beg)
@@ -137,8 +141,17 @@ async def websocket_endpoint(websocket: WebSocket):
137
  )
138
  pcm_buffer = bytearray()
139
  online.insert_audio_chunk(pcm_array)
140
- transcription = online.process_iter()[2]
141
- full_transcription += transcription
 
 
 
 
 
 
 
 
 
142
  if args.vac:
143
  buffer = online.online.to_flush(
144
  online.online.transcript_buffer.buffer
@@ -151,11 +164,30 @@ async def websocket_endpoint(websocket: WebSocket):
151
  buffer in full_transcription
152
  ): # With VAC, the buffer is not updated until the next chunk is processed
153
  buffer = ""
154
- response = {"transcription": transcription, "buffer": buffer}
 
 
 
 
 
 
 
155
  if args.diarization:
156
- speakers = await diarization.get_speakers(pcm_array)
157
- response["speakers"] = speakers
158
-
 
 
 
 
 
 
 
 
 
 
 
 
159
  await websocket.send_json(response)
160
 
161
  except Exception as e:
 
90
  return process
91
 
92
 
93
+
94
  @app.websocket("/asr")
95
  async def websocket_endpoint(websocket: WebSocket):
96
  await websocket.accept()
 
111
  loop = asyncio.get_event_loop()
112
  full_transcription = ""
113
  beg = time()
114
+
115
+ chunk_history = [] # Will store dicts: {beg, end, text, speaker}
116
+
117
  while True:
118
  try:
119
  elapsed_time = int(time() - beg)
 
141
  )
142
  pcm_buffer = bytearray()
143
  online.insert_audio_chunk(pcm_array)
144
+ beg_trans, end_trans, trans = online.process_iter()
145
+
146
+ if trans:
147
+ chunk_history.append({
148
+ "beg": beg_trans,
149
+ "end": end_trans,
150
+ "text": trans,
151
+ "speaker": "0"
152
+ })
153
+
154
+ full_transcription += trans
155
  if args.vac:
156
  buffer = online.online.to_flush(
157
  online.online.transcript_buffer.buffer
 
164
  buffer in full_transcription
165
  ): # With VAC, the buffer is not updated until the next chunk is processed
166
  buffer = ""
167
+
168
+ lines = [
169
+ {
170
+ "speaker": "0",
171
+ "text": "",
172
+ }
173
+ ]
174
+
175
  if args.diarization:
176
+ await diarization.diarize(pcm_array)
177
+ diarization.assign_speakers_to_chunks(chunk_history)
178
+
179
+ for ch in chunk_history:
180
+ if args.diarization and ch["speaker"] and ch["speaker"][-1] != lines[-1]["speaker"]:
181
+ lines.append(
182
+ {
183
+ "speaker": ch["speaker"][-1],
184
+ "text": ch['text'],
185
+ }
186
+ )
187
+ else:
188
+ lines[-1]["text"] += ch['text']
189
+
190
+ response = {"lines": lines, "buffer": buffer}
191
  await websocket.send_json(response)
192
 
193
  except Exception as e: