Luigi committed
Commit 7f58b81 · Parent: c4814b5

Remove raw transcript output, enable diarization by default, and disable emojis in SenseVoice output
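With this commit both pipelines diarize unconditionally: the enable_diar flag is removed from every signature, the raw-transcript branches are deleted, and each streaming generator now yields only ("", diar_html) pairs. The per-turn Faster-Whisper calls also move from beam_size=1/best_of=1 to beam_size=3/best_of=3. A minimal smoke test of the new entry points could look like the sketch below; the "app" import and the model/audio argument values are illustrative assumptions, not part of the commit.

    # Hypothetical smoke test of the post-commit streaming API; the module
    # name "app" and all argument values are assumptions for illustration.
    import app

    # Faster-Whisper path: there is no enable_diar argument anymore.
    for text, diar_html in app.transcribe_fwhisper_stream(
            "large-v3", "auto", "interview.mp3", "CPU"):
        assert text == ""   # the raw-transcript output was removed
        print(diar_html)    # accumulated per-speaker HTML

    # SenseVoice path: enable_punct stays, enable_diar is gone.
    for text, diar_html in app.transcribe_sense_steam(
            "FunAudioLLM/SenseVoiceSmall", "auto", "interview.mp3",
            True, "CPU"):
        print(diar_html)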

Files changed (1): app.py +110 -373
app.py CHANGED
@@ -116,6 +116,7 @@ def get_sense_model(model_id: str, device_str: str):
             vad_model="fsmn-vad",
             vad_kwargs={"max_single_segment_time": 300000},
             device=device_str,
+            ban_emo_unk=False,
             hub="hf",
         )
     return sense_models[key]
@@ -141,22 +142,7 @@ def get_diarization_pipe():
 
 
 # —————— Whisper Transcription ——————
-def transcribe_with_fwhisper(model: WhisperModel, audio_path: str, language: str) -> str:
-    """
-    Runs faster-whisper's .transcribe(), then concatenates all segments.
-    If language == "auto", detection is automatic.
-    """
-    lang_arg = None if language == "auto" else language
-    segments, _ = model.transcribe(
-        audio_path,
-        beam_size=1,
-        best_of=1,
-        language=lang_arg,
-        vad_filter=True,
-    )
-    return "".join(seg.text for seg in segments).strip()
-
-def _transcribe_fwhisper_cpu_stream(model_id, language, audio_path, enable_diar):
+def _transcribe_fwhisper_cpu_stream(model_id, language, audio_path):
     """
     Generator-based streaming transcription with accumulation using Faster-Whisper on CPU.
     Yields (accumulated_text, diar_html) tuples for Gradio streaming.
@@ -165,50 +151,36 @@ def _transcribe_fwhisper_cpu_stream(model_id, language, audio_path, enable_diar)
     cprint('Whisper (faster-whisper) using CPU [stream]', 'red')
 
     # Diarization branch: accumulate snippets and yield full HTML each turn
-    if enable_diar:
-        diarizer = get_diarization_pipe()
-        waveform, sample_rate = torchaudio.load(audio_path)
-        diarizer.to(torch.device('cpu'))
-        with ProgressHook() as hook:
-            diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
-        snippets = []
-        for turn, _, speaker in diary.itertracks(yield_label=True):
-            # extract segment
-            start_ms = int(turn.start * 1000)
-            end_ms = int(turn.end * 1000)
-            segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                segment.export(tmp.name, format="wav")
-            segments, _ = pipe.transcribe(
-                tmp.name,
-                beam_size=1,
-                best_of=1,
-                language=None if language == "auto" else language,
-                vad_filter=True,
-            )
-            os.unlink(tmp.name)
-            text = converter.convert("".join(s.text for s in segments).strip())
-            snippets.append(f"[{speaker}] {text}")
-            # yield accumulated diarization HTML
-            yield "", format_diarization_html(snippets)
-        return
-
-    # Raw transcription: accumulate text segments and yield full transcript
-    accumulated = []
-    lang_arg = None if language == "auto" else language
-    for seg in pipe.transcribe(
-        audio_path,
-        beam_size=1,
-        best_of=1,
-        language=lang_arg,
-        vad_filter=True,
-    ):
-        txt = converter.convert(seg.text.strip())
-        accumulated.append(txt)
-        yield "\n".join(accumulated), ""
+    diarizer = get_diarization_pipe()
+    waveform, sample_rate = torchaudio.load(audio_path)
+    diarizer.to(torch.device('cpu'))
+    with ProgressHook() as hook:
+        diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
+    snippets = []
+    for turn, _, speaker in diary.itertracks(yield_label=True):
+        # extract segment
+        start_ms = int(turn.start * 1000)
+        end_ms = int(turn.end * 1000)
+        segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            segment.export(tmp.name, format="wav")
+        segments, _ = pipe.transcribe(
+            tmp.name,
+            beam_size=3,
+            best_of=3,
+            language=None if language == "auto" else language,
+            vad_filter=True,
+        )
+        os.unlink(tmp.name)
+        text = converter.convert("".join(s.text for s in segments).strip())
+        snippets.append(f"[{speaker}] {text}")
+        # yield accumulated diarization HTML
+        yield "", format_diarization_html(snippets)
+    return
+
 
 @spaces.GPU
-def _transcribe_fwhisper_gpu_stream(model_id, language, audio_path, enable_diar):
+def _transcribe_fwhisper_gpu_stream(model_id, language, audio_path):
     """
     Generator-based streaming transcription with accumulation using Faster-Whisper on CUDA.
     Yields (accumulated_text, diar_html) tuples for Gradio streaming.
@@ -217,336 +189,109 @@ def _transcribe_fwhisper_gpu_stream(model_id, language, audio_path, enable_diar)
     cprint('Whisper (faster-whisper) using CUDA [stream]', 'green')
 
     # Diarization branch: accumulate snippets and yield full HTML each turn
-    if enable_diar:
-        diarizer = get_diarization_pipe()
-        device = torch.device('cuda')
-        diarizer.to(device)
-        waveform, sample_rate = torchaudio.load(audio_path)
-        waveform = waveform.to(device)
-        with ProgressHook() as hook:
-            diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
-        snippets = []
-        for turn, _, speaker in diary.itertracks(yield_label=True):
-            start_ms = int(turn.start * 1000)
-            end_ms = int(turn.end * 1000)
-            segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                segment.export(tmp.name, format="wav")
-            segments, _ = pipe.transcribe(
-                tmp.name,
-                beam_size=1,
-                best_of=1,
-                language=None if language == "auto" else language,
-                vad_filter=True,
-            )
-            os.unlink(tmp.name)
-            text = converter.convert("".join(s.text for s in segments).strip())
-            snippets.append(f"[{speaker}] {text}")
-            yield "", format_diarization_html(snippets)
-        return
-
-    # Raw transcription: accumulate text segments and yield full transcript
-    accumulated = []
-    lang_arg = None if language == "auto" else language
-    for seg in pipe.transcribe(
-        audio_path,
-        beam_size=1,
-        best_of=1,
-        language=lang_arg,
-        vad_filter=True,
-    ):
-        txt = converter.convert(seg.text.strip())
-        accumulated.append(txt)
-        yield "\n".join(accumulated), ""
-
-
-def _transcribe_fwhisper_cpu(model_id, language, audio_path, enable_diar):
-    model = get_fwhisper_model(model_id, "cpu")
-    cprint('Whisper (faster-whisper) using CPU', 'red')
-    # Diarization-only branch
-    if enable_diar:
-        diarizer = get_diarization_pipe()
-        # Pre-loading audio files in memory may result in faster processing
-        waveform, sample_rate = torchaudio.load(audio_path)
-        diarizer.to(torch.device('cpu'))
-        with ProgressHook() as hook:
-            diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
-        snippets = []
-        for turn, _, speaker in diary.itertracks(yield_label=True):
-            start_ms = int(turn.start * 1000)
-            end_ms = int(turn.end * 1000)
-            segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                segment.export(tmp.name, format="wav")
-            txt = transcribe_with_fwhisper(model, tmp.name, language)
-            os.unlink(tmp.name)
-            text = converter.convert(txt.strip())
-            snippets.append(f"[{speaker}] {text}")
-        return "", format_diarization_html(snippets)
-    # Raw-only branch
-    text = transcribe_with_fwhisper(model, audio_path, language)
-    transcript = converter.convert(text.strip())
-    return transcript, ""
-
-
-@spaces.GPU
-def _transcribe_fwhisper_gpu(model_id, language, audio_path, enable_diar):
-    pipe = get_fwhisper_model(model_id, "cuda")
-    cprint('Whisper (faster-whisper) using CUDA', 'green')
-    # Diarization-only branch
-    if enable_diar:
-        diarizer = get_diarization_pipe()
-        diarizer.to(torch.device('cuda'))
-        # Pre-loading audio files in memory may result in faster processing
-        waveform, sample_rate = torchaudio.load(audio_path)
-        waveform.to(torch.device('cuda'))
-        with ProgressHook() as hook:
-            diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
-        snippets = []
-        for turn, _, speaker in diary.itertracks(yield_label=True):
-            start_ms = int(turn.start * 1000)
-            end_ms = int(turn.end * 1000)
-            segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                segment.export(tmp.name, format="wav")
-            txt = transcribe_with_fwhisper(pipe, tmp.name, language)
-            os.unlink(tmp.name)
-            text = converter.convert(txt.strip())
-            snippets.append(f"[{speaker}] {text}")
-        return "", format_diarization_html(snippets)
-    # Raw-only branch
-    text = transcribe_with_fwhisper(pipe, tmp.name, language)
-    transcript = converter.convert(text.strip())
-    return transcript, ""
-
-
-def transcribe_fwhisper(model_id, language, audio_path, device_sel, enable_diar):
-    if device_sel == "GPU" and torch.cuda.is_available():
-        return _transcribe_fwhisper_gpu(model_id, language, audio_path, enable_diar)
-    return _transcribe_fwhisper_cpu(model_id, language, audio_path, enable_diar)
+    diarizer = get_diarization_pipe()
+    device = torch.device('cuda')
+    diarizer.to(device)
+    waveform, sample_rate = torchaudio.load(audio_path)
+    waveform = waveform.to(device)
+    with ProgressHook() as hook:
+        diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
+    snippets = []
+    for turn, _, speaker in diary.itertracks(yield_label=True):
+        start_ms = int(turn.start * 1000)
+        end_ms = int(turn.end * 1000)
+        segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            segment.export(tmp.name, format="wav")
+        segments, _ = pipe.transcribe(
+            tmp.name,
+            beam_size=3,
+            best_of=3,
+            language=None if language == "auto" else language,
+            vad_filter=True,
+        )
+        os.unlink(tmp.name)
+        text = converter.convert("".join(s.text for s in segments).strip())
+        snippets.append(f"[{speaker}] {text}")
+        yield "", format_diarization_html(snippets)
+    return
 
-def transcribe_fwhisper_stream(model_id, language, audio_path, device_sel, enable_diar):
+def transcribe_fwhisper_stream(model_id, language, audio_path, device_sel):
     """Dispatch to CPU or GPU streaming generators, preserving two-value yields."""
     if device_sel == "GPU" and torch.cuda.is_available():
-        yield from _transcribe_fwhisper_gpu_stream(model_id, language, audio_path, enable_diar)
+        yield from _transcribe_fwhisper_gpu_stream(model_id, language, audio_path)
     else:
-        yield from _transcribe_fwhisper_cpu_stream(model_id, language, audio_path, enable_diar)
+        yield from _transcribe_fwhisper_cpu_stream(model_id, language, audio_path)
 
 # —————— SenseVoice Transcription ——————
 def _transcribe_sense_cpu_stream(model_id: str, language: str, audio_path: str,
-                                 enable_punct: bool, enable_diar: bool):
+                                 enable_punct: bool):
     model = get_sense_model(model_id, "cpu")
     cprint('SenseVoiceSmall using CPU [stream]', 'red')
 
-    if enable_diar:
-        diarizer = get_diarization_pipe()
-        diarizer.to(torch.device('cpu'))
-        waveform, sample_rate = torchaudio.load(audio_path)
-        with ProgressHook() as hook:
-            diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
-        snippets = []
-        for turn, _, speaker in diary.itertracks(yield_label=True):
-            start_ms, end_ms = int(turn.start*1000), int(turn.end*1000)
-            segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                segment.export(tmp.name, format="wav")
-            segs = model.generate(input=tmp.name, cache={}, language=language,
-                                  use_itn=True, batch_size_s=300,
-                                  merge_vad=False, merge_length_s=0)
-            os.unlink(tmp.name)
-            txt = rich_transcription_postprocess(segs[0]['text'])
-            if not enable_punct:
-                txt = re.sub(r"[^\w\s]", "", txt)
-            txt = converter.convert(txt)
-            snippets.append(f"[{speaker}] {txt}")
-            yield "", format_diarization_html(snippets)
-        return
-
-    segs = model.generate(input=audio_path, cache={}, language=language,
-                          use_itn=True, batch_size_s=300,
-                          merge_vad=False, merge_length_s=0)
-    accumulated = []
-    for s in segs:
-        t = rich_transcription_postprocess(s['text'])
+    diarizer = get_diarization_pipe()
+    diarizer.to(torch.device('cpu'))
+    waveform, sample_rate = torchaudio.load(audio_path)
+    with ProgressHook() as hook:
+        diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
+    snippets = []
+    for turn, _, speaker in diary.itertracks(yield_label=True):
+        start_ms, end_ms = int(turn.start*1000), int(turn.end*1000)
+        segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            segment.export(tmp.name, format="wav")
+        segs = model.generate(input=tmp.name, cache={}, language=language,
+                              use_itn=True, batch_size_s=300,
+                              merge_vad=False, merge_length_s=0)
+        os.unlink(tmp.name)
+        txt = rich_transcription_postprocess(segs[0]['text'])
         if not enable_punct:
-            t = re.sub(r"[^\w\s]", "", t)
-        t = converter.convert(t)
-        accumulated.append(t)
-        yield "\n".join(accumulated), ""
+            txt = re.sub(r"[^\w\s]", "", txt)
+        txt = converter.convert(txt)
+        snippets.append(f"[{speaker}] {txt}")
+        yield "", format_diarization_html(snippets)
+    return
 
 
+@spaces.GPU
 def _transcribe_sense_gpu_stream(model_id: str, language: str, audio_path: str,
-                                 enable_punct: bool, enable_diar: bool):
+                                 enable_punct: bool):
     model = get_sense_model(model_id, "cuda:0")
     cprint('SenseVoiceSmall using CUDA [stream]', 'green')
 
-    if enable_diar:
-        diarizer = get_diarization_pipe()
-        diarizer.to(torch.device('cuda'))
-        waveform, sample_rate = torchaudio.load(audio_path)
-        waveform = waveform.to(torch.device('cuda'))
-        with ProgressHook() as hook:
-            diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
-        snippets = []
-        for turn, _, speaker in diary.itertracks(yield_label=True):
-            start_ms, end_ms = int(turn.start*1000), int(turn.end*1000)
-            segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                segment.export(tmp.name, format="wav")
-            segs = model.generate(input=tmp.name, cache={}, language=language,
-                                  use_itn=True, batch_size_s=300,
-                                  merge_vad=False, merge_length_s=0)
-            os.unlink(tmp.name)
-            txt = rich_transcription_postprocess(segs[0]['text'])
-            if not enable_punct:
-                txt = re.sub(r"[^\w\s]", "", txt)
-            txt = converter.convert(txt)
-            snippets.append(f"[{speaker}] {txt}")
-            yield "", format_diarization_html(snippets)
-        return
-
-    segs = model.generate(input=audio_path, cache={}, language=language,
-                          use_itn=True, batch_size_s=300,
-                          merge_vad=False, merge_length_s=0)
-    accumulated = []
-    for s in segs:
-        t = rich_transcription_postprocess(s['text'])
+    diarizer = get_diarization_pipe()
+    diarizer.to(torch.device('cuda'))
+    waveform, sample_rate = torchaudio.load(audio_path)
+    waveform = waveform.to(torch.device('cuda'))
+    with ProgressHook() as hook:
+        diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
+    snippets = []
+    for turn, _, speaker in diary.itertracks(yield_label=True):
+        start_ms, end_ms = int(turn.start*1000), int(turn.end*1000)
+        segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            segment.export(tmp.name, format="wav")
+        segs = model.generate(input=tmp.name, cache={}, language=language,
                               use_itn=True, batch_size_s=300,
                               merge_vad=False, merge_length_s=0)
+        os.unlink(tmp.name)
+        txt = rich_transcription_postprocess(segs[0]['text'])
         if not enable_punct:
-            t = re.sub(r"[^\w\s]", "", t)
-        t = converter.convert(t)
-        accumulated.append(t)
-        yield "\n".join(accumulated), ""
-
-def _transcribe_sense_cpu(model_id: str,
-                          language: str,
-                          audio_path: str,
-                          enable_punct: bool,
-                          enable_diar: bool):
-    model = get_sense_model(model_id, "cpu")
-    # Diarization-only branch
-    if enable_diar:
-        diarizer = get_diarization_pipe()
-        diarizer.to(torch.device('cpu'))
-        # Pre-loading audio files in memory may result in faster processing
-        waveform, sample_rate = torchaudio.load(audio_path)
-        diarizer.to(torch.device('cpu'))
-        with ProgressHook() as hook:
-            diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
-        snippets = []
-        for turn, _, speaker in diary.itertracks(yield_label=True):
-            start_ms = int(turn.start * 1000)
-            end_ms = int(turn.end * 1000)
-            segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                segment.export(tmp.name, format="wav")
-            segs = model.generate(
-                input=tmp.name,
-                cache={},
-                language=language,
-                use_itn=True,
-                batch_size_s=300,
-                merge_vad=False,
-                merge_length_s=0,
-            )
-            os.unlink(tmp.name)
-            txt = rich_transcription_postprocess(segs[0]['text'])
-            if not enable_punct:
-                txt = re.sub(r"[^\w\s]", "", txt)
-            txt = converter.convert(txt)
-            snippets.append(f"[{speaker}] {txt}")
-        return "", format_diarization_html(snippets)
-    # Raw-only branch
-    segs = model.generate(
-        input=audio_path,
-        cache={},
-        language=language,
-        use_itn=True,
-        batch_size_s=300,
-        merge_vad=True,
-        merge_length_s=15,
-    )
-    text = rich_transcription_postprocess(segs[0]['text'])
-    if not enable_punct:
-        text = re.sub(r"[^\w\s]", "", text)
-    text = converter.convert(text)
-    return text, ""
+            txt = re.sub(r"[^\w\s]", "", txt)
+        txt = converter.convert(txt)
+        snippets.append(f"[{speaker}] {txt}")
+        yield "", format_diarization_html(snippets)
+    return
 
 
-@spaces.GPU
-def _transcribe_sense_gpu(model_id: str,
-                          language: str,
-                          audio_path: str,
-                          enable_punct: bool,
-                          enable_diar: bool):
-    model = get_sense_model(model_id, "cuda:0")
-    # Diarization-only branch
-    if enable_diar:
-        diarizer = get_diarization_pipe()
-        diarizer.to(torch.device('cuda'))
-        # Pre-loading audio files in memory may result in faster processing
-        waveform, sample_rate = torchaudio.load(audio_path)
-        waveform.to(torch.device('cuda'))
-        with ProgressHook() as hook:
-            diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
-        snippets = []
-        for turn, _, speaker in diary.itertracks(yield_label=True):
-            start_ms = int(turn.start * 1000)
-            end_ms = int(turn.end * 1000)
-            segment = AudioSegment.from_file(audio_path)[start_ms:end_ms]
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                segment.export(tmp.name, format="wav")
-            segs = model.generate(
-                input=tmp.name,
-                cache={},
-                language=language,
-                use_itn=True,
-                batch_size_s=300,
-                merge_vad=False,
-                merge_length_s=0,
-            )
-            os.unlink(tmp.name)
-            txt = rich_transcription_postprocess(segs[0]['text'])
-            if not enable_punct:
-                txt = re.sub(r"[^\w\s]", "", txt)
-            txt = converter.convert(txt)
-            snippets.append(f"[{speaker}] {txt}")
-        return "", format_diarization_html(snippets)
-    # Raw-only branch
-    segs = model.generate(
-        input=audio_path,
-        cache={},
-        language=language,
-        use_itn=True,
-        batch_size_s=300,
-        merge_vad=True,
-        merge_length_s=15,
-    )
-    text = rich_transcription_postprocess(segs[0]['text'])
-    if not enable_punct:
-        text = re.sub(r"[^\w\s]", "", text)
-    text = converter.convert(text)
-    return text, ""
-
-
-def transcribe_sense(model_id: str,
-                     language: str,
-                     audio_path: str,
-                     enable_punct: bool,
-                     enable_diar: bool,
-                     device_sel: str):
-    if device_sel == "GPU" and torch.cuda.is_available():
-        return _transcribe_sense_gpu(model_id, language, audio_path, enable_punct, enable_diar)
-    return _transcribe_sense_cpu(model_id, language, audio_path, enable_punct, enable_diar)
-
 def transcribe_sense_steam(model_id: str,
                            language: str,
                            audio_path: str,
                            enable_punct: bool,
-                           enable_diar: bool,
                            device_sel: str):
     if device_sel == "GPU" and torch.cuda.is_available():
-        yield from _transcribe_sense_gpu_stream(model_id, language, audio_path, enable_punct, enable_diar)
-        yield from _transcribe_sense_cpu_stream(model_id, language, audio_path, enable_punct, enable_diar)
+        yield from _transcribe_sense_gpu_stream(model_id, language, audio_path, enable_punct)
+    yield from _transcribe_sense_cpu_stream(model_id, language, audio_path, enable_punct)
 
 # —————— Gradio UI ——————
 DEMO_CSS = """
@@ -560,7 +305,7 @@ DEMO_CSS = """
 """
 Demo = gr.Blocks(css=DEMO_CSS)
 with Demo:
-    gr.Markdown("## Whisper vs. SenseVoice (…)")
+    gr.Markdown("## Faster-Whisper vs. SenseVoice")
     audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input")
     examples = gr.Examples(
         examples=[["interview.mp3"], ["news.mp3"]],
@@ -576,7 +321,6 @@ with Demo:
         whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
         whisper_lang = gr.Dropdown(choices=WHISPER_LANGUAGES, value="auto", label="Whisper Language")
         device_radio = gr.Radio(choices=["GPU","CPU"], value="GPU", label="Device")
-        diar_check = gr.Checkbox(label="Enable Diarization", value=True)
        btn_w = gr.Button("Transcribe with Faster-Whisper")
 
     with gr.Column():
@@ -585,7 +329,6 @@ with Demo:
         sense_lang = gr.Dropdown(choices=SENSEVOICE_LANGUAGES, value="auto", label="SenseVoice Language")
         device_radio_s = gr.Radio(choices=["GPU","CPU"], value="GPU", label="Device")
         punct_chk = gr.Checkbox(label="Enable Punctuation", value=True)
-        diar_s_chk = gr.Checkbox(label="Enable Diarization", value=True)
         btn_s = gr.Button("Transcribe with SenseVoice")
 
     # ────────────────────────────────────────────────────────────────
@@ -603,22 +346,16 @@ with Demo:
 
     # ────────────────────────────────────────────────────────────────
    # 3) WIRING UP TOGGLES & BUTTONS
-    # toggle raw ↔ diarized for each system
-    diar_check.change(lambda e: gr.update(visible=not e), diar_check, out_w)
-    diar_check.change(lambda e: gr.update(visible=e), diar_check, out_w_d)
-
-    diar_s_chk.change(lambda e: gr.update(visible=not e), diar_s_chk, out_s)
-    diar_s_chk.change(lambda e: gr.update(visible=e), diar_s_chk, out_s_d)
 
     # wire the callbacks into those shared boxes
     btn_w.click(
         fn=transcribe_fwhisper_stream,
-        inputs=[whisper_dd, whisper_lang, audio_input, device_radio, diar_check],
+        inputs=[whisper_dd, whisper_lang, audio_input, device_radio],
         outputs=[out_w, out_w_d]
    )
     btn_s.click(
         fn=transcribe_sense_steam,
-        inputs=[sense_dd, sense_lang, audio_input, punct_chk, diar_s_chk, device_radio_s],
+        inputs=[sense_dd, sense_lang, audio_input, punct_chk, device_radio_s],
         outputs=[out_s, out_s_d]
     )
361