Spaces: Running on Zero

initial commit

Browse files
- README.md +22 -1
- app.py +109 -0
- requirements.txt +0 -0
README.md CHANGED
@@ -11,4 +11,25 @@ license: mit
 short_description: Compare OpenAI Whisper against SenseVoice Small results
 ---
 
-
+# Whisper vs. FunASR SenseVoice Comparison
+
+This Space lets you compare OpenAI Whisper variants against FunAudioLLM’s SenseVoice models for automatic speech recognition (ASR), all via a simple Gradio 5 UI.
+
+## 🚀 Demo
+
+1. **Select Whisper model** from the dropdown.
+2. **Select SenseVoice model** from the dropdown.
+3. (Optional) **Toggle punctuation** for SenseVoice.
+4. **Upload** an audio file (wav, mp3, etc.) or **record** with your microphone.
+5. Click **Transcribe** to run both ASRs side by side.
+
+## 📁 Files
+
+- **app.py**
+  Main Gradio application. Sets up two Hugging Face ASR pipelines and displays their outputs.
+
+- **requirements.txt**
+  Python dependencies: Gradio, Transformers, Torch, Torchaudio, Accelerate, ffmpeg-python.
+
+- **README.md**
+  This documentation.
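Under the hood, clicking **Transcribe** runs each selected model through a transformers `automatic-speech-recognition` pipeline (see app.py below). A minimal standalone sketch of that call, with an assumed model ID and a placeholder file name:

```python
# Minimal sketch of the pipeline call this Space makes for the Whisper side.
# Assumes torch and transformers are installed; "sample.wav" is a placeholder.
import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1  # GPU index 0, else CPU
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",  # any ID from the dropdown works
    device=device,
)
print(asr("sample.wav")["text"].strip())
```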
app.py ADDED
@@ -0,0 +1,109 @@
+# app.py
+import spaces
+import re
+import torch
+import gradio as gr
+from transformers import pipeline
+
+# List of Whisper model IDs
+WHISPER_MODELS = [
+    "openai/whisper-large-v3-turbo",
+    "openai/whisper-large-v3",
+    "openai/whisper-tiny",
+    "openai/whisper-small",
+    "openai/whisper-medium",
+    "openai/whisper-base",
+    "JacobLinCool/whisper-large-v3-turbo-common_voice_19_0-zh-TW",
+    "Jingmiao/whisper-small-zh_tw",
+    "DDTChen/whisper-medium-zh-tw",
+    "kimbochen/whisper-small-zh-tw",
+    "ChrisTorng/whisper-large-v3-turbo-common_voice_19_0-zh-TW-ct2",
+    "JacobLinCool/whisper-large-v3-turbo-zh-TW-clean-1",
+    "JunWorks/whisper-small-zhTW",
+    "WANGTINGTING/whisper-large-v2-zh-TW-vol2",
+    "xmzhu/whisper-tiny-zh-TW",
+    "ingrenn/whisper-small-common-voice-13-zh-TW",
+    "jun-han/whisper-small-zh-TW",
+    "xmzhu/whisper-tiny-zh-TW-baseline",
+    "JacobLinCool/whisper-large-v3-turbo-common_voice_16_1-zh-TW-2",
+    "JacobLinCool/whisper-large-v3-common_voice_19_0-zh-TW-full-1",
+    "momo103197/whisper-small-zh-TW-mix",
+    "JacobLinCool/whisper-large-v3-turbo-zh-TW-clean-1-merged",
+    "JacobLinCool/whisper-large-v2-common_voice_19_0-zh-TW-full-1",
+    "kimas1269/whisper-meduim_zhtw",
+    "JunWorks/whisper-base-zhTW",
+    "JunWorks/whisper-small-zhTW-frozenDecoder",
+    "sandy1990418/whisper-large-v3-turbo-zh-tw",
+    "JacobLinCool/whisper-large-v3-turbo-common_voice_16_1-zh-TW-pissa-merged",
+    "momo103197/whisper-small-zh-TW-16",
+    "k1nto/Belle-whisper-large-v3-zh-punct-ct2"
+]
+
+# List of SenseVoice model IDs
+SENSEVOICE_MODELS = [
+    "FunAudioLLM/SenseVoiceSmall",
+    "AXERA-TECH/SenseVoice",
+    "alextomcat/SenseVoiceSmall",
+    "ChenChenyu/SenseVoiceSmall-finetuned",
+    "apinge/sensevoice-small"
+]
+
+# Cache loaded pipelines so each model is downloaded and initialized only once
+pipes = {}
+
+def get_asr_pipe(model_id):
+    if model_id not in pipes:
+        # Run on GPU if available, otherwise fall back to CPU
+        device = 0 if torch.cuda.is_available() else -1
+        pipes[model_id] = pipeline("automatic-speech-recognition", model=model_id, device=device)
+    return pipes[model_id]
+
+@spaces.GPU
+def transcribe(whisper_model, sense_model, audio_path, enable_punct):
+    # 1) Whisper
+    whisper_pipe = get_asr_pipe(whisper_model)
+    whisper_out = whisper_pipe(audio_path)
+    text_whisper = whisper_out.get("text", "").strip()
+
+    # 2) SenseVoice
+    sense_pipe = get_asr_pipe(sense_model)
+    sense_out = sense_pipe(audio_path)
+    text_sense = sense_out.get("text", "").strip()
+
+    # 3) Strip punctuation if the checkbox is disabled
+    if not enable_punct:
+        text_sense = re.sub(r"[^\w\s]", "", text_sense)
+
+    return text_whisper, text_sense
+
+with gr.Blocks() as demo:
+    gr.Markdown("## Whisper vs. FunASR SenseVoice Comparison")
+    with gr.Row():
+        whisper_dd = gr.Dropdown(
+            choices=WHISPER_MODELS,
+            value=WHISPER_MODELS[0],
+            label="Whisper Model"
+        )
+        sense_dd = gr.Dropdown(
+            choices=SENSEVOICE_MODELS,
+            value=SENSEVOICE_MODELS[0],
+            label="SenseVoice Model"
+        )
+    punct = gr.Checkbox(label="Enable Punctuation (SenseVoice)", value=True)
+    audio_in = gr.Audio(
+        sources=["upload", "microphone"],  # Gradio 4/5 API; "source" is no longer accepted
+        type="filepath",
+        label="Upload or Record Audio"
+    )
+    with gr.Row():
+        out_whisper = gr.Textbox(label="Whisper Transcript")
+        out_sense = gr.Textbox(label="SenseVoice Transcript")
+    btn = gr.Button("Transcribe")
+    btn.click(
+        fn=transcribe,
+        inputs=[whisper_dd, sense_dd, audio_in, punct],
+        outputs=[out_whisper, out_sense]
+    )
+
+if __name__ == "__main__":
+    demo.launch()
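A caveat on the SenseVoice side: the SENSEVOICE_MODELS entries are FunASR checkpoints, and it is not certain that transformers' `automatic-speech-recognition` pipeline can load them as written above. If it cannot, the path documented on the SenseVoiceSmall model card is FunASR's own `AutoModel` API. A minimal sketch, assuming the `funasr` package is installed and with a placeholder audio path:

```python
# Sketch of loading SenseVoiceSmall through FunASR instead of transformers,
# following the model card's documented usage; "sample.wav" is a placeholder.
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

model = AutoModel(
    model="FunAudioLLM/SenseVoiceSmall",
    trust_remote_code=True,
    device="cuda:0",  # or "cpu"
)
res = model.generate(
    input="sample.wav",
    language="auto",  # auto-detect language
    use_itn=True,     # inverse text normalization (punctuation, numbers)
)
print(rich_transcription_postprocess(res[0]["text"]))
```

Passing `use_itn=False` should suppress most punctuation at the source, which would make the regex stripping in `transcribe` largely redundant.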
requirements.txt ADDED
File without changes
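Although the committed requirements.txt is empty, app.py imports gradio, torch, and transformers, and the README names the intended dependency list. A plausible starting point (package set taken from the README, versions left unpinned; the `spaces` module is typically preinstalled on ZeroGPU Spaces):

```text
gradio
transformers
torch
torchaudio
accelerate
ffmpeg-python
```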