whisper-large-v3-srt

Running on Zero

App Files Files Community

datxy committed on Aug 27

Commit

261c97f

verified ·

1 Parent(s): cdddc7e

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -149

app.py CHANGED Viewed

@@ -2,32 +2,30 @@
 import spaces
 import torch
 import gradio as gr
 import tempfile
 import os
-# ====== 可选：faster-whisper，若存在则自动使用（vad_filter=True）======
-USE_FASTER_WHISPER = True
-try:
-    if USE_FASTER_WHISPER:
-        from faster_whisper import WhisperModel  # type: ignore
-    _HAS_FW = True
-except Exception:
-    _HAS_FW = False
-# ====== 可调参数 ======
-ASR_MODEL = "openai/whisper-large-v3"  # Transformers 管线备用模型
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
-MAX_SEG_DUR = 6.0            # 每行最大时长（秒）
-MAX_SEG_CHARS = 28           # 每行最大字符数（中文可适当减小）
-PAUSE_LONG = 0.9             # 认为是“句末停顿”的阈值（秒）→ 补句号
-PAUSE_SHORT = 0.45           # 认为是“轻停顿”的阈值（秒）→ 补逗号
-MIN_PIECE_DUR = 0.2          # 每小片的最小时长，避免 0
-device = "cuda" if torch.cuda.is_available() else "cpu"
-dtype = "float16" if torch.cuda.is_available() else "float32"
-# ====== 时间戳工具 ======
 def _srt_timestamp(seconds: float | None) -> str:
     if seconds is None or seconds < 0:
         seconds = 0.0
@@ -37,25 +35,9 @@ def _srt_timestamp(seconds: float | None) -> str:
     s, ms = divmod(ms, 1000)
     return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
-# ====== 简易标点：根据停顿/片段长度补标点 ======
-_STRONG = "。！？.!?"
-def _ensure_sentence_punct(s: str) -> str:
-    s = s.strip()
-    if not s:
-        return s
-    if s[-1] not in _STRONG:
-        s += "。"
-    return s
-def _maybe_add_comma(s: str) -> str:
-    s = s.rstrip()
-    if s and s[-1] not in _STRONG + "，,、；;":
-        s += "，"
-    return s
-# ====== 文本切分（标点优先，其次按长度兜底）=====
 def _split_text_units(txt: str, max_seg_chars: int) -> list[str]:
-    strong = _STRONG
     units, cur = [], []
     for ch in txt:
         cur.append(ch)
@@ -70,16 +52,22 @@ def _split_text_units(txt: str, max_seg_chars: int) -> list[str]:
         if len(u) <= max_seg_chars:
             refined.append(u)
         else:
-            # 粗切为不超过 max_seg_chars 的块
             for i in range(0, len(u), max_seg_chars):
-                refined.append(u[i:i + max_seg_chars].strip())
     return [x for x in refined if x]
-# ====== 基于 chunk 的字符比例细分 + 简易标点 ======
-def _chunks_to_srt_no_number(chunks, max_seg_dur=MAX_SEG_DUR, max_seg_chars=MAX_SEG_CHARS) -> str:
     lines = []
-    prev_end = None
     for ch in chunks or []:
         text = (ch.get("text") or "").strip()
         if not text:
@@ -91,13 +79,6 @@ def _chunks_to_srt_no_number(chunks, max_seg_dur=MAX_SEG_DUR, max_seg_chars=MAX_
         else:
             c_start, c_end = 0.0, 2.0
-        # 根据与上一段的“停顿”补逗号/句号（软提示，真正的标点在行尾处理）
-        if prev_end is not None:
-            gap = max(c_start - prev_end, 0.0)
-            # 我们不直接把标点写进上一行文本，而是作为划分参考
-            # 具体标点在每行最终输出前处理（见下方）
-        prev_end = c_end
         units = _split_text_units(text, max_seg_chars)
         if not units:
             units = [text]
@@ -107,100 +88,32 @@ def _chunks_to_srt_no_number(chunks, max_seg_dur=MAX_SEG_DUR, max_seg_chars=MAX_
         cur_t = c_start
         for u in units:
-            alloc = max(total_dur * (len(u) / total_chars), MIN_PIECE_DUR)
-            # 若超长，继续细分为不超过 max_seg_chars 的片，并均分时长
             if alloc <= max_seg_dur:
                 pieces = [u]
                 per = alloc
             else:
-                smalls = [u[i:i + max_seg_chars] for i in range(0, len(u), max_seg_chars)]
-                pieces = [s for s in smalls if s.strip()]
-                per = max(min(max_seg_dur, alloc / max(1, len(pieces))), MIN_PIECE_DUR)
-            for i, p in enumerate(pieces):
                 st = cur_t
                 en = st + per
-                frag = p.strip()
-                # 行尾自动补标点：更自然
-                # 规则：
-                # 1) 若该小片接近一句末尾（片段较长 或 达到 max_seg_chars）→ 句号
-                # 2) 否则可能是轻停顿 → 逗号
-                # 3) 若已有强标点，保持不动
-                if frag and frag[-1] not in _STRONG:
-                    # 按时长和长度推断语气
-                    if per >= PAUSE_LONG or len(frag) >= max_seg_chars * 0.9:
-                        frag = _ensure_sentence_punct(frag)
-                    elif per >= PAUSE_SHORT:
-                        frag = _maybe_add_comma(frag)
-                    # else: 很短的片段不强制加标点（避免过密）
                 lines.append(f"{_srt_timestamp(st)} --> {_srt_timestamp(en)}")
-                lines.append(frag)
                 lines.append("")
                 cur_t = en
     return "\n".join(lines).strip() + ("\n" if lines else "")
-# ====== 如果有 faster-whisper：使用 VAD 直接拿到更干净的分段 ======
-def _fw_transcribe_to_chunks(audio_path: str):
-    """
-    使用 faster-whisper + vad_filter=True 返回“类 chunk”结构：
-    [{'text': '...', 'timestamp': [start, end]}...]
-    """
-    # 模型建议设为 medium/large-v3；这里默认 large-v3
-    model = WhisperModel("large-v3", device=device, compute_type="auto")
-    # vad_filter=True：用 Silero VAD 过滤静音/噪音
-    segments, _info = model.transcribe(
-        audio_path,
-        vad_filter=True,
-        vad_parameters=dict(min_silence_duration_ms=int(PAUSE_SHORT * 1000)),
-        beam_size=5,
-        best_of=5,
-    )
-    chunks = []
-    for seg in segments:
-        chunks.append({
-            "text": seg.text.strip(),
-            "timestamp": [float(seg.start or 0.0), float(seg.end or 0.0)],
-        })
-    return chunks
-# ====== Transformers 回退方案 ======
-from transformers import pipeline as hf_pipeline
-def _hf_transcribe_to_chunks(audio_path: str):
-    pipe = hf_pipeline(
-        task="automatic-speech-recognition",
-        model=ASR_MODEL,
-        chunk_length_s=30,
-        device=0 if torch.cuda.is_available() else "cpu",
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        return_timestamps=True,
-    )
-    result = pipe(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"})
-    # 期望返回形如：{"text": "…", "chunks": [{"text": "...", "timestamp": (s,e)}, ...]}
-    chunks = result.get("chunks") or []
-    # 如无 chunks，用整段兜底
-    if not chunks:
-        total_text = (result.get("text") or "").strip()
-        if total_text:
-            chunks = [{"text": total_text, "timestamp": (0.0, max(MAX_SEG_DUR, 2.0))}]
-    # 统一结构
-    norm = []
-    for ch in chunks:
-        ts = ch.get("timestamp") or ch.get("timestamps") or [0.0, 2.0]
-        if isinstance(ts, (list, tuple)) and len(ts) == 2:
-            s, e = ts
-        else:
-            s, e = 0.0, 2.0
-        norm.append({"text": (ch.get("text") or "").strip(), "timestamp": [float(s or 0.0), float(e or 0.0)]})
-    return norm
-# ====== 主函数：上传音频 → SRT（无编号 + 简易标点 + 可选 VAD）======
 @spaces.GPU
-def transcribe_file_to_srt(audio_path: str, task: str, use_vad: bool):
     if not audio_path:
         raise gr.Error("请先上传音频文件。")
     try:
@@ -210,17 +123,14 @@ def transcribe_file_to_srt(audio_path: str, task: str, use_vad: bool):
     except OSError:
         pass
-    # 选择后端：优先 faster-whisper + VAD，其次 Transformers
-    if use_vad and _HAS_FW:
-        chunks = _fw_transcribe_to_chunks(audio_path)
-    else:
-        chunks = _hf_transcribe_to_chunks(audio_path)
-    srt_str = _chunks_to_srt_no_number(chunks, MAX_SEG_DUR, MAX_SEG_CHARS)
-    if not srt_str:
-        srt_str = "00:00:00,000 --> 00:00:02,000\n（空）\n"
-    # 输出文件
     tmpdir = tempfile.mkdtemp(prefix="srt_")
     base = os.path.splitext(os.path.basename(audio_path))[0] or "subtitle"
     srt_path = os.path.join(tmpdir, f"{base}.srt")
@@ -229,23 +139,19 @@ def transcribe_file_to_srt(audio_path: str, task: str, use_vad: bool):
     return srt_str, srt_path
-# ====== UI ======
 demo = gr.Interface(
     fn=transcribe_file_to_srt,
     inputs=[
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
-        gr.Checkbox(label="Use VAD + Auto Punctuation (simple)", value=True),
     ],
     outputs=[
-        gr.Textbox(label="SRT Preview (no numbering, auto punctuation)", lines=18),
         gr.File(label="Download SRT"),
     ],
-    title="Upload Audio → SRT (VAD + Auto Punctuation, No Numbering)",
-    description=(
-        "Optional VAD (via faster-whisper) for cleaner segments. "
-        "Adds simple punctuation by pause/length. Adjustable MAX_SEG_DUR / MAX_SEG_CHARS / PAUSE_*."
-    ),
     allow_flagging="never",
 )

 import spaces
 import torch
 import gradio as gr
+from transformers import pipeline
 import tempfile
 import os
+# ===== 参数 =====
+MODEL_NAME = "openai/whisper-large-v3"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
+MAX_SEG_DUR = 6.0
+MAX_SEG_CHARS = 28
+device = 0 if torch.cuda.is_available() else "cpu"
+dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    chunk_length_s=30,
+    device=device,
+    torch_dtype=dtype,
+    return_timestamps=True,
+)
+# ===== 时间戳格式化 =====
 def _srt_timestamp(seconds: float | None) -> str:
     if seconds is None or seconds < 0:
         seconds = 0.0
     s, ms = divmod(ms, 1000)
     return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+# ===== 文本切分 + 自动补标点 =====
 def _split_text_units(txt: str, max_seg_chars: int) -> list[str]:
+    strong = "。！？.!?"
     units, cur = [], []
     for ch in txt:
         cur.append(ch)
         if len(u) <= max_seg_chars:
             refined.append(u)
         else:
+            # 长句继续切分，并自动补句号
             for i in range(0, len(u), max_seg_chars):
+                piece = u[i:i+max_seg_chars].strip()
+                if piece and piece[-1] not in strong:
+                    piece += "。"
+                refined.append(piece)
+    # 如果最后一段没有标点，补句号
+    if refined and refined[-1][-1] not in strong:
+        refined[-1] += "。"
     return [x for x in refined if x]
+# ===== chunks 转 SRT (无编号 + 自动标点) =====
+def chunks_to_srt(chunks: list[dict],
+                  max_seg_dur: float = MAX_SEG_DUR,
+                  max_seg_chars: int = MAX_SEG_CHARS) -> str:
     lines = []
     for ch in chunks or []:
         text = (ch.get("text") or "").strip()
         if not text:
         else:
             c_start, c_end = 0.0, 2.0
         units = _split_text_units(text, max_seg_chars)
         if not units:
             units = [text]
         cur_t = c_start
         for u in units:
+            alloc = total_dur * (len(u) / total_chars)
+            alloc = max(alloc, 0.2)
             if alloc <= max_seg_dur:
                 pieces = [u]
                 per = alloc
             else:
+                # 再次切分，均匀分时长
+                smalls = [u[i:i+max_seg_chars] for i in range(0, len(u), max_seg_chars)]
+                pieces = [s.strip() + ("。" if not s.endswith("。") else "") for s in smalls if s.strip()]
+                per = min(max_seg_dur, alloc / max(1, len(pieces)))
+            for p in pieces:
+                if p and p[-1] not in "。！？.!?":
+                    p += "。"
                 st = cur_t
                 en = st + per
                 lines.append(f"{_srt_timestamp(st)} --> {_srt_timestamp(en)}")
+                lines.append(p.strip())
                 lines.append("")
                 cur_t = en
     return "\n".join(lines).strip() + ("\n" if lines else "")
+# ===== 上传音频 → SRT =====
 @spaces.GPU
+def transcribe_file_to_srt(audio_path: str, task: str):
     if not audio_path:
         raise gr.Error("请先上传音频文件。")
     try:
     except OSError:
         pass
+    result = pipe(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
+    text = result.get("text", "") or ""
+    chunks = result.get("chunks") or []
+    srt_str = chunks_to_srt(chunks)
+    if not srt_str and text.strip():
+        srt_str = "00:00:00,000 --> 00:00:02,000\n" + (text.strip() + "。") + "\n"
     tmpdir = tempfile.mkdtemp(prefix="srt_")
     base = os.path.splitext(os.path.basename(audio_path))[0] or "subtitle"
     srt_path = os.path.join(tmpdir, f"{base}.srt")
     return srt_str, srt_path
+# ===== 界面 =====
 demo = gr.Interface(
     fn=transcribe_file_to_srt,
     inputs=[
         gr.Audio(sources="upload", type="filepath", label="Audio file"),
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs=[
+        gr.Textbox(label="Transcript (SRT Preview)", lines=18),
         gr.File(label="Download SRT"),
     ],
+    title="Upload Audio → SRT Subtitle",
+    description=f"Upload an audio file to generate time-stamped SRT subtitles (auto punctuation, no numbering). Model: {MODEL_NAME}",
     allow_flagging="never",
 )