whisper-webui-translate

Build error

aadnk committed on Sep 27, 2023

Commit

aa33666

1 Parent(s): 18bb72f

Fix diarization in CLI

Files changed (2) hide show

app.py CHANGED Viewed

@@ -240,19 +240,6 @@ class WhisperTranscriber:
                     # Update progress
                     current_progress += source_audio_duration
-                    # Diarization
-                    if self.diarization and self.diarization_kwargs:
-                        print("Diarizing ", source.source_path)
-                        diarization_result = list(self.diarization.run(source.source_path, **self.diarization_kwargs))
-                        # Print result
-                        print("Diarization result: ")
-                        for entry in diarization_result:
-                            print(f"  start={entry.start:.1f}s stop={entry.end:.1f}s speaker_{entry.speaker}")
-                        # Add speakers to result
-                        result = self.diarization.mark_speakers(diarization_result, result)
                     source_download, source_text, source_vtt = self.write_result(result, filePrefix, outputDirectory, highlight_words)
                     if len(sources) > 1:
@@ -373,6 +360,19 @@ class WhisperTranscriber:
             else:
                 # Default VAD
                 result = whisperCallable.invoke(audio_path, 0, None, None, progress_listener=progressListener)
         return result

                     # Update progress
                     current_progress += source_audio_duration
                     source_download, source_text, source_vtt = self.write_result(result, filePrefix, outputDirectory, highlight_words)
                     if len(sources) > 1:
             else:
                 # Default VAD
                 result = whisperCallable.invoke(audio_path, 0, None, None, progress_listener=progressListener)
+        # Diarization
+        if self.diarization and self.diarization_kwargs:
+            print("Diarizing ", audio_path)
+            diarization_result = list(self.diarization.run(audio_path, **self.diarization_kwargs))
+            # Print result
+            print("Diarization result: ")
+            for entry in diarization_result:
+                print(f"  start={entry.start:.1f}s stop={entry.end:.1f}s speaker_{entry.speaker}")
+            # Add speakers to result
+            result = self.diarization.mark_speakers(diarization_result, result)
         return result

cli.py CHANGED Viewed

@@ -111,9 +111,9 @@ def cli():
     parser.add_argument('--auth_token', type=str, default=None, help='HuggingFace API Token (optional)')
     parser.add_argument("--diarization", type=str2bool, default=app_config.diarization, \
                         help="whether to perform speaker diarization")
-    parser.add_argument("--num_speakers", type=int, default=None, help="Number of speakers")
-    parser.add_argument("--min_speakers", type=int, default=None, help="Minimum number of speakers")
-    parser.add_argument("--max_speakers", type=int, default=None, help="Maximum number of speakers")
     args = parser.parse_args().__dict__
     model_name: str = args.pop("model")
@@ -151,11 +151,11 @@ def cli():
     compute_type = args.pop("compute_type")
     highlight_words = args.pop("highlight_words")
-    diarization = args.pop("diarization")
     auth_token = args.pop("auth_token")
-    num_speakers = args.pop("num_speakers")
-    min_speakers = args.pop("min_speakers")
-    max_speakers = args.pop("max_speakers")
     transcriber = WhisperTranscriber(delete_uploaded_files=False, vad_cpu_cores=vad_cpu_cores, app_config=app_config)
     transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))

     parser.add_argument('--auth_token', type=str, default=None, help='HuggingFace API Token (optional)')
     parser.add_argument("--diarization", type=str2bool, default=app_config.diarization, \
                         help="whether to perform speaker diarization")
+    parser.add_argument("--diarization_num_speakers", type=int, default=None, help="Number of speakers")
+    parser.add_argument("--diarization_min_speakers", type=int, default=None, help="Minimum number of speakers")
+    parser.add_argument("--diarization_max_speakers", type=int, default=None, help="Maximum number of speakers")
     args = parser.parse_args().__dict__
     model_name: str = args.pop("model")
     compute_type = args.pop("compute_type")
     highlight_words = args.pop("highlight_words")
     auth_token = args.pop("auth_token")
+    diarization = args.pop("diarization")
+    num_speakers = args.pop("diarization_num_speakers")
+    min_speakers = args.pop("diarization_min_speakers")
+    max_speakers = args.pop("diarization_max_speakers")
     transcriber = WhisperTranscriber(delete_uploaded_files=False, vad_cpu_cores=vad_cpu_cores, app_config=app_config)
     transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))