add s2t conversion, enable speaker diarization by default
app.py
CHANGED
@@ -7,6 +7,7 @@ import gradio as gr
 from transformers import pipeline
 from pydub import AudioSegment
 from pyannote.audio import Pipeline as DiarizationPipeline
+import opencc
 
 import spaces  # zeroGPU support
 from funasr import AutoModel
@@ -14,36 +15,21 @@ from funasr.utils.postprocess_utils import rich_transcription_postprocess
 
 # —————— Model Lists ——————
 WHISPER_MODELS = [
+    # Base Whisper models
     "openai/whisper-large-v3-turbo",
     "openai/whisper-large-v3",
-    "openai/whisper-tiny",
-    "openai/whisper-small",
     "openai/whisper-medium",
+    "openai/whisper-small",
     "openai/whisper-base",
+    "openai/whisper-tiny",
+    # Community fine-tuned Chinese models
     "JacobLinCool/whisper-large-v3-turbo-common_voice_19_0-zh-TW",
     "Jingmiao/whisper-small-zh_tw",
     "DDTChen/whisper-medium-zh-tw",
     "kimbochen/whisper-small-zh-tw",
-
-    "JunWorks/whisper-small-zhTW",
-    "WANGTINGTING/whisper-large-v2-zh-TW-vol2",
-    "xmzhu/whisper-tiny-zh-TW",
-    "ingrenn/whisper-small-common-voice-13-zh-TW",
-    "jun-han/whisper-small-zh-TW",
-    "xmzhu/whisper-tiny-zh-TW-baseline",
-    "JacobLinCool/whisper-large-v3-turbo-common_voice_16_1-zh-TW-2",
-    "JacobLinCool/whisper-large-v3-common_voice_19_0-zh-TW-full-1",
-    "momo103197/whisper-small-zh-TW-mix",
-    "JacobLinCool/whisper-large-v3-turbo-zh-TW-clean-1-merged",
-    "JacobLinCool/whisper-large-v2-common_voice_19_0-zh-TW-full-1",
-    "kimas1269/whisper-meduim_zhtw",
-    "JunWorks/whisper-base-zhTW",
-    "JunWorks/whisper-small-zhTW-frozenDecoder",
-    "sandy1990418/whisper-large-v3-turbo-zh-tw",
-    "JacobLinCool/whisper-large-v3-turbo-common_voice_16_1-zh-TW-pissa-merged",
-    "momo103197/whisper-small-zh-TW-16",
-    "k1nto/Belle-whisper-large-v3-zh-punct-ct2"
+    # ...etc...
 ]
+
 SENSEVOICE_MODELS = [
     "FunAudioLLM/SenseVoiceSmall",
     "AXERA-TECH/SenseVoice",
@@ -65,6 +51,7 @@ WHISPER_LANGUAGES = [
     "th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo",
     "zh","yue"
 ]
+
 SENSEVOICE_LANGUAGES = ["auto", "zh", "yue", "en", "ja", "ko", "nospeech"]
 
 # —————— Caches ——————
@@ -72,6 +59,9 @@ whisper_pipes = {}
 sense_models = {}
 dar_pipe = None
 
+# Initialize OpenCC converter for simplified to traditional Chinese
+converter = opencc.OpenCC('s2t.json')
+
 # —————— Helpers ——————
 def get_whisper_pipe(model_id: str, device: int):
     key = (model_id, device)
@@ -105,14 +95,14 @@ def get_diarization_pipe():
     if dar_pipe is None:
         # Pull token from environment (HF_TOKEN or HUGGINGFACE_TOKEN)
         token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
-        #
+        # Try loading latest 3.1 pipeline, fallback to 2.1 on gated model error
         try:
             dar_pipe = DiarizationPipeline.from_pretrained(
                 "pyannote/speaker-diarization-3.1",
                 use_auth_token=token or True
             )
         except Exception as e:
-            print(f"Failed to load pyannote/speaker-diarization-3.1: {e}")
+            print(f"Failed to load pyannote/speaker-diarization-3.1: {e}\nFalling back to pyannote/[email protected].")
             dar_pipe = DiarizationPipeline.from_pretrained(
                 "pyannote/[email protected]",
                 use_auth_token=token or True
@@ -133,6 +123,8 @@ def transcribe_whisper(model_id: str,
     result = (pipe(audio_path) if language == "auto"
               else pipe(audio_path, generate_kwargs={"language": language}))
     transcript = result.get("text", "").strip()
+    # convert simplified Chinese to traditional
+    transcript = converter.convert(transcript)
     diar_text = ""
     # optional speaker diarization
     if enable_diar:
@@ -148,6 +140,8 @@ def transcribe_whisper(model_id: str,
                        else pipe(tmp.name, generate_kwargs={"language": language}))
             os.unlink(tmp.name)
             text = seg_out.get("text", "").strip()
+            # convert simplified Chinese to traditional
+            text = converter.convert(text)
             snippets.append(f"[{speaker}] {text}")
         diar_text = "\n".join(snippets)
     return transcript, diar_text
@@ -173,6 +167,8 @@ def transcribe_sense(model_id: str,
         text = rich_transcription_postprocess(segs[0]['text'])
         if not enable_punct:
             text = re.sub(r"[^\w\s]", "", text)
+        # convert simplified Chinese to traditional
+        text = converter.convert(text)
         return text, ""
     # with diarization
     diarizer = get_diarization_pipe()
@@ -196,6 +192,8 @@ def transcribe_sense(model_id: str,
         txt = rich_transcription_postprocess(segs[0]['text'])
         if not enable_punct:
             txt = re.sub(r"[^\w\s]", "", txt)
+        # convert simplified Chinese to traditional
+        txt = converter.convert(txt)
         snippets.append(f"[{speaker}] {txt}")
     full = rich_transcription_postprocess(model.generate(
         input=audio_path,
@@ -208,20 +206,23 @@ def transcribe_sense(model_id: str,
     )[0]['text'])
     if not enable_punct:
         full = re.sub(r"[^\w\s]", "", full)
+    full = converter.convert(full)
     return full, "\n".join(snippets)
 
 # —————— Gradio UI ——————
 demo = gr.Blocks()
 with demo:
-    gr.Markdown("## Whisper vs. SenseVoice (Language, Device & Diarization)")
+    gr.Markdown("## Whisper vs. SenseVoice (Language, Device & Diarization with Simplified→Traditional Chinese)")
+
     audio_input = gr.Audio(sources=["upload","microphone"], type="filepath", label="Audio Input")
+
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Whisper ASR")
             whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
             whisper_lang = gr.Dropdown(choices=WHISPER_LANGUAGES, value="auto", label="Whisper Language")
             device_radio = gr.Radio(choices=["GPU","CPU"], value="GPU", label="Device")
-            diar_check = gr.Checkbox(label="Enable Diarization")
+            diar_check = gr.Checkbox(label="Enable Diarization", value=True)
             btn_w = gr.Button("Transcribe with Whisper")
             out_w = gr.Textbox(label="Transcript")
             out_w_d = gr.Textbox(label="Diarized Transcript")
@@ -232,13 +233,14 @@ with demo:
             gr.Markdown("### FunASR SenseVoice ASR")
             sense_dd = gr.Dropdown(choices=SENSEVOICE_MODELS, value=SENSEVOICE_MODELS[0], label="SenseVoice Model")
             sense_lang = gr.Dropdown(choices=SENSEVOICE_LANGUAGES, value="auto", label="SenseVoice Language")
-            punct_chk = gr.Checkbox(label="Enable Punctuation")
-            diar_s_chk = gr.Checkbox(label="Enable Diarization")
+            punct_chk = gr.Checkbox(label="Enable Punctuation", value=True)
+            diar_s_chk = gr.Checkbox(label="Enable Diarization", value=True)
             btn_s = gr.Button("Transcribe with SenseVoice")
             out_s = gr.Textbox(label="Transcript")
             out_s_d = gr.Textbox(label="Diarized Transcript")
             btn_s.click(fn=transcribe_sense,
-                        inputs=[sense_dd, sense_lang, audio_input,
+                        inputs=[sense_dd, sense_lang, audio_input, punct_chk, diar_s_chk],
                         outputs=[out_s, out_s_d])
+
 if __name__ == "__main__":
     demo.launch()
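For reference, a minimal standalone sketch of the simplified-to-traditional conversion this commit adds, using the same opencc API as app.py (the sample string is illustrative only):

import opencc

converter = opencc.OpenCC('s2t.json')  # simplified -> traditional config, as in app.py
print(converter.convert("汉语识别结果"))  # -> "漢語識別結果"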
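A side note on the punctuation toggle that the new converter calls sit next to: in Python 3 the re module is Unicode-aware by default, so \w matches Han characters and re.sub(r"[^\w\s]", "", text) strips both ASCII and CJK punctuation while leaving the transcript characters intact. A quick illustration (sample string is illustrative):

import re

print(re.sub(r"[^\w\s]", "", "你好，世界！Hello, world."))  # -> "你好世界Hello world"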
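With diarization now on by default, every request runs the pipeline that get_diarization_pipe() loads. For context, a minimal sketch of how a pyannote diarization result is typically iterated into per-speaker snippets like the "[SPEAKER] text" lines above; the audio path is a placeholder, and pyannote/speaker-diarization-3.1 is a gated model that requires accepting its license and supplying an HF token:

import os
from pyannote.audio import Pipeline as DiarizationPipeline

token = os.getenv("HF_TOKEN")
pipe = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=token or True,
)

diarization = pipe("sample.wav")  # placeholder audio file
for turn, _, speaker in diarization.itertracks(yield_label=True):
    # turn.start / turn.end are in seconds; speaker is a label like "SPEAKER_00"
    print(f"[{speaker}] {turn.start:.1f}s-{turn.end:.1f}s")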