Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -1,16 +1,17 @@
|
|
|
|
1 |
import spaces
|
2 |
import torch
|
3 |
import gradio as gr
|
4 |
from transformers import pipeline
|
5 |
-
|
6 |
import tempfile
|
7 |
import os
|
8 |
-
from datetime import timedelta
|
9 |
|
10 |
-
# =====
|
11 |
MODEL_NAME = "openai/whisper-large-v3"
|
12 |
BATCH_SIZE = 8
|
13 |
-
FILE_LIMIT_MB = 1000
|
|
|
|
|
14 |
|
15 |
device = 0 if torch.cuda.is_available() else "cpu"
|
16 |
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
@@ -21,12 +22,11 @@ pipe = pipeline(
|
|
21 |
chunk_length_s=30,
|
22 |
device=device,
|
23 |
torch_dtype=dtype,
|
24 |
-
return_timestamps=
|
25 |
)
|
26 |
|
27 |
-
# =====
|
28 |
-
def _srt_timestamp(seconds):
|
29 |
-
"""秒 -> SRT 时间戳 00:00:00,000。None/负数时归零。"""
|
30 |
if seconds is None or seconds < 0:
|
31 |
seconds = 0.0
|
32 |
ms = int(float(seconds) * 1000 + 0.5)
|
@@ -35,92 +35,83 @@ def _srt_timestamp(seconds):
|
|
35 |
s, ms = divmod(ms, 1000)
|
36 |
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
if
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
if isinstance(st, (list, tuple)): st = st[0]
|
72 |
-
if isinstance(en, (list, tuple)): en = en[-1]
|
73 |
-
dur = float((en or 0.0) - (st or 0.0))
|
74 |
-
if force or strong_punct or dur >= max_seg_dur or cur_len >= max_seg_chars:
|
75 |
-
flush_seg()
|
76 |
-
|
77 |
-
# 汇总所有词
|
78 |
-
all_words = []
|
79 |
for ch in chunks or []:
|
80 |
-
|
81 |
-
if not
|
82 |
-
ts = ch.get("timestamp") or ch.get("timestamps") or [0.0, 2.0]
|
83 |
-
if isinstance(ts, (list, tuple)) and len(ts) == 2:
|
84 |
-
all_words.append({"word": ch["text"], "start": ts[0], "end": ts[1]})
|
85 |
-
else:
|
86 |
-
all_words.append({"word": ch["text"], "start": 0.0, "end": 2.0})
|
87 |
continue
|
88 |
-
for w in words:
|
89 |
-
token = (w.get("word") or "").replace("\n", " ")
|
90 |
-
start = w.get("start")
|
91 |
-
end = w.get("end")
|
92 |
-
if (start is None or end is None) and isinstance(w.get("timestamp"), (list, tuple)) and len(w["timestamp"]) == 2:
|
93 |
-
start, end = w["timestamp"]
|
94 |
-
all_words.append({"word": token, "start": start, "end": end})
|
95 |
-
|
96 |
-
# 若依旧拿不到逐词,回退整段文本
|
97 |
-
if not all_words and text_fallback.strip():
|
98 |
-
all_words = [{"word": text_fallback.strip(), "start": 0.0, "end": max_seg_dur}]
|
99 |
-
|
100 |
-
# 按规则切分
|
101 |
-
for w in all_words:
|
102 |
-
token = w.get("word", "")
|
103 |
-
if not token:
|
104 |
-
continue
|
105 |
-
if cur_start is None:
|
106 |
-
cur_start = w.get("start", 0.0)
|
107 |
-
cur_words.append(w)
|
108 |
-
cur_len += len(token)
|
109 |
-
strong = token.endswith(("。", "!", "?", ".", "!", "?"))
|
110 |
-
maybe_flush(force=False, strong_punct=strong)
|
111 |
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
-
# 生成 SRT
|
115 |
-
lines = []
|
116 |
-
for i, (st, en, txt) in enumerate(segs, 1):
|
117 |
-
lines.append(str(i))
|
118 |
-
lines.append(f"{_srt_timestamp(st)} --> {_srt_timestamp(en)}")
|
119 |
-
lines.append(txt)
|
120 |
-
lines.append("")
|
121 |
return "\n".join(lines).strip() + ("\n" if lines else "")
|
122 |
|
123 |
-
# ===== 上传音频
|
124 |
@spaces.GPU
|
125 |
def transcribe_file_to_srt(audio_path: str, task: str):
|
126 |
if not audio_path:
|
@@ -136,20 +127,19 @@ def transcribe_file_to_srt(audio_path: str, task: str):
|
|
136 |
text = result.get("text", "") or ""
|
137 |
chunks = result.get("chunks") or []
|
138 |
|
139 |
-
|
140 |
-
srt_str
|
|
|
141 |
|
142 |
-
# 写入临时文件供下载
|
143 |
tmpdir = tempfile.mkdtemp(prefix="srt_")
|
144 |
base = os.path.splitext(os.path.basename(audio_path))[0] or "subtitle"
|
145 |
srt_path = os.path.join(tmpdir, f"{base}.srt")
|
146 |
with open(srt_path, "w", encoding="utf-8") as f:
|
147 |
f.write(srt_str)
|
148 |
|
149 |
-
# 第一个输出显示 SRT 字符串,第二个输出提供下载
|
150 |
return srt_str, srt_path
|
151 |
|
152 |
-
# =====
|
153 |
demo = gr.Interface(
|
154 |
fn=transcribe_file_to_srt,
|
155 |
inputs=[
|
@@ -161,10 +151,7 @@ demo = gr.Interface(
|
|
161 |
gr.File(label="Download SRT"),
|
162 |
],
|
163 |
title="Upload Audio → SRT Subtitle",
|
164 |
-
description=(
|
165 |
-
"Upload an audio file to generate time-stamped SRT subtitles. "
|
166 |
-
f"Backed by [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME})."
|
167 |
-
),
|
168 |
allow_flagging="never",
|
169 |
)
|
170 |
|
|
|
1 |
+
# app.py
|
2 |
import spaces
|
3 |
import torch
|
4 |
import gradio as gr
|
5 |
from transformers import pipeline
|
|
|
6 |
import tempfile
|
7 |
import os
|
|
|
8 |
|
9 |
+
# ===== 参数 =====
|
10 |
MODEL_NAME = "openai/whisper-large-v3"
|
11 |
BATCH_SIZE = 8
|
12 |
+
FILE_LIMIT_MB = 1000
|
13 |
+
MAX_SEG_DUR = 6.0
|
14 |
+
MAX_SEG_CHARS = 28
|
15 |
|
16 |
device = 0 if torch.cuda.is_available() else "cpu"
|
17 |
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
|
|
22 |
chunk_length_s=30,
|
23 |
device=device,
|
24 |
torch_dtype=dtype,
|
25 |
+
return_timestamps=True,
|
26 |
)
|
27 |
|
28 |
+
# ===== 时间戳格式化 =====
|
29 |
+
def _srt_timestamp(seconds: float | None) -> str:
|
|
|
30 |
if seconds is None or seconds < 0:
|
31 |
seconds = 0.0
|
32 |
ms = int(float(seconds) * 1000 + 0.5)
|
|
|
35 |
s, ms = divmod(ms, 1000)
|
36 |
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
|
37 |
|
38 |
+
# ===== 文本切分 + 自动补标点 =====
|
39 |
+
def _split_text_units(txt: str, max_seg_chars: int) -> list[str]:
|
40 |
+
strong = "。!?.!?"
|
41 |
+
units, cur = [], []
|
42 |
+
for ch in txt:
|
43 |
+
cur.append(ch)
|
44 |
+
if ch in strong:
|
45 |
+
units.append("".join(cur).strip())
|
46 |
+
cur = []
|
47 |
+
if cur:
|
48 |
+
units.append("".join(cur).strip())
|
49 |
+
|
50 |
+
refined = []
|
51 |
+
for u in units:
|
52 |
+
if len(u) <= max_seg_chars:
|
53 |
+
refined.append(u)
|
54 |
+
else:
|
55 |
+
# 长句继续切分,并自动补句号
|
56 |
+
for i in range(0, len(u), max_seg_chars):
|
57 |
+
piece = u[i:i+max_seg_chars].strip()
|
58 |
+
if piece and piece[-1] not in strong:
|
59 |
+
piece += "。"
|
60 |
+
refined.append(piece)
|
61 |
+
# 如果最后一段没有标点,补句号
|
62 |
+
if refined and refined[-1][-1] not in strong:
|
63 |
+
refined[-1] += "。"
|
64 |
+
return [x for x in refined if x]
|
65 |
+
|
66 |
+
# ===== chunks 转 SRT (无编号 + 自动标点) =====
|
67 |
+
def chunks_to_srt(chunks: list[dict],
                  max_seg_dur: float = MAX_SEG_DUR,
                  max_seg_chars: int = MAX_SEG_CHARS) -> str:
    """Render Whisper pipeline chunks as SRT cue text (no cue numbers).

    Each chunk's time span is shared among its sentence units in
    proportion to character count (with a 0.2 s floor per unit); an
    allocation exceeding *max_seg_dur* is re-split into fixed-width
    pieces that divide the time evenly.  Returns the joined cue text
    terminated by a newline, or "" when there is nothing to emit.
    """
    out: list[str] = []
    for chunk in chunks or []:
        body = (chunk.get("text") or "").strip()
        if not body:
            continue

        # Chunk-level timestamps; fall back to a 2-second window.
        span = chunk.get("timestamp") or chunk.get("timestamps") or [0.0, 2.0]
        if isinstance(span, (list, tuple)) and len(span) == 2:
            t0, t1 = float(span[0] or 0.0), float(span[1] or 0.0)
        else:
            t0, t1 = 0.0, 2.0

        units = _split_text_units(body, max_seg_chars) or [body]

        char_total = sum(len(u) for u in units) or 1
        span_dur = max(t1 - t0, 0.0)

        clock = t0
        for unit in units:
            # Proportional share of the chunk duration, floored at 0.2 s.
            share = max(span_dur * (len(unit) / char_total), 0.2)
            if share <= max_seg_dur:
                parts = [unit]
                each = share
            else:
                # Too long for a single cue: re-slice, split time evenly.
                raw = [unit[i:i+max_seg_chars] for i in range(0, len(unit), max_seg_chars)]
                parts = [s.strip() + ("。" if not s.endswith("。") else "") for s in raw if s.strip()]
                each = min(max_seg_dur, share / max(1, len(parts)))

            for part in parts:
                if part and part[-1] not in "。!?.!?":
                    part += "。"
                begin = clock
                finish = begin + each
                out.append(f"{_srt_timestamp(begin)} --> {_srt_timestamp(finish)}")
                out.append(part.strip())
                out.append("")
                clock = finish
    return "\n".join(out).strip() + ("\n" if out else "")
|
113 |
|
114 |
+
# ===== 上传音频 → SRT =====
|
115 |
@spaces.GPU
|
116 |
def transcribe_file_to_srt(audio_path: str, task: str):
|
117 |
if not audio_path:
|
|
|
127 |
text = result.get("text", "") or ""
|
128 |
chunks = result.get("chunks") or []
|
129 |
|
130 |
+
srt_str = chunks_to_srt(chunks)
|
131 |
+
if not srt_str and text.strip():
|
132 |
+
srt_str = "00:00:00,000 --> 00:00:02,000\n" + (text.strip() + "。") + "\n"
|
133 |
|
|
|
134 |
tmpdir = tempfile.mkdtemp(prefix="srt_")
|
135 |
base = os.path.splitext(os.path.basename(audio_path))[0] or "subtitle"
|
136 |
srt_path = os.path.join(tmpdir, f"{base}.srt")
|
137 |
with open(srt_path, "w", encoding="utf-8") as f:
|
138 |
f.write(srt_str)
|
139 |
|
|
|
140 |
return srt_str, srt_path
|
141 |
|
142 |
+
# ===== 界面 =====
|
143 |
demo = gr.Interface(
|
144 |
fn=transcribe_file_to_srt,
|
145 |
inputs=[
|
|
|
151 |
gr.File(label="Download SRT"),
|
152 |
],
|
153 |
title="Upload Audio → SRT Subtitle",
|
154 |
+
description=f"Upload an audio file to generate time-stamped SRT subtitles (auto punctuation, no numbering). Model: {MODEL_NAME}",
|
|
|
|
|
|
|
155 |
allow_flagging="never",
|
156 |
)
|
157 |
|