whisper-large-v3-srt

Running on Zero

App Files Files Community

datxy committed 21 days ago

Commit

5085deb

verified ·

1 Parent(s): cf205ff

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -133

app.py CHANGED Viewed

@@ -6,30 +6,32 @@ from transformers import pipeline
 import tempfile
 import os
-# ================== 可调参数 ==================
 MODEL_NAME = "openai/whisper-large-v3"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
-TARGET_SENT_CHARS = 12     # 目标每句字数（中文场景）
-COMMA_EVERY = 0            # 如需更细粒度短停顿，可设 6/8（表示每 N 字加一个“，”并收句）；0 表示关闭
-MAX_SEG_DUR = 6.0          # 每句最长时长（秒）
-MIN_PIECE_DUR = 0.30       # 每句最小时长（秒），避免闪烁
-STRONG_PUNCT = "。！？.!?"
 device = 0 if torch.cuda.is_available() else "cpu"
 dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 asr = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
     chunk_length_s=30,
     device=device,
     torch_dtype=dtype,
-    return_timestamps=True,   # 仅需 chunk 级 (start,end)
 )
 def _ts(t: float | None) -> str:
     if t is None or t < 0:
         t = 0.0
     ms = int(float(t) * 1000 + 0.5)
@@ -39,17 +41,20 @@ def _ts(t: float | None) -> str:
     return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
 def _norm_chunks(chunks: list[dict]) -> list[dict]:
-    """规范化 chunks: [{'text': str, 'start': float, 'end': float}]"""
     out = []
     for ch in chunks or []:
         text = (ch.get("text") or "").strip()
-        ts = ch.get("timestamp") or ch.get("timestamps") or [0.0, 2.0]
         if not text:
             continue
         if isinstance(ts, (list, tuple)) and len(ts) == 2:
             s, e = float(ts[0] or 0.0), float(ts[1] or 0.0)
         else:
-            s, e = 0.0, 2.0
         if e < s:
             e = s
         out.append({"text": text, "start": s, "end": e})
@@ -57,7 +62,7 @@ def _norm_chunks(chunks: list[dict]) -> list[dict]:
 def _char_timeline(chunk: dict) -> list[tuple[str, float, float]]:
     """
-    把一个 chunk 的文本按字符建立时间轴：
     返回 [(char, char_start, char_end), ...]
     """
     text = chunk["text"]
@@ -73,147 +78,65 @@ def _char_timeline(chunk: dict) -> list[tuple[str, float, float]]:
         cur = nxt
     return timeline
-def _segment_short_sentences(char_stream: list[tuple[str, float, float]],
-                             target_chars: int = TARGET_SENT_CHARS,
-                             comma_every: int = COMMA_EVERY,
-                             max_seg_dur: float = MAX_SEG_DUR) -> list[tuple[float, float, str]]:
     """
-    核心切分：
-    - 累积字符直到遇到强标点 或 达到 target_chars
-    - 可选：每 comma_every 个字符插入逗号并收句
-    - 强标点永远并入本句，绝不产生“单独标点句”
-    - 超长句再按时长 <= max_seg_dur 均匀切
     """
     segments = []
     buf_chars: list[str] = []
-    buf_start = None
-    last_char_end = None
-    since_last_comma = 0
-    def flush_sentence(force_punct=False):
-        nonlocal buf_chars, buf_start, last_char_end, since_last_comma
         if not buf_chars:
             return
         text = "".join(buf_chars).strip()
-        if not text:
-            buf_chars = []
-            buf_start = None
-            since_last_comma = 0
-            return
-        # 保证句末有强标点
-        if force_punct and text[-1] not in STRONG_PUNCT:
-            text += "。"
-        elif text[-1] not in STRONG_PUNCT:
-            text += "。"
         st = buf_start if buf_start is not None else 0.0
-        en = last_char_end if last_char_end is not None else st
-        # 时长保护
-        if en - st < MIN_PIECE_DUR:
-            en = st + MIN_PIECE_DUR
         segments.append((st, en, text))
-        buf_chars = []
-        buf_start = None
-        since_last_comma = 0
-    def try_hard_wrap_long(st: float, en: float, text: str):
-        """
-        如果单句太长（> max_seg_dur），按时长把文本均匀切成多块，每块 <= max_seg_dur，句末补句号。
-        使用 [st,en] 线性映射。
-        """
-        out = []
-        dur = max(en - st, 0.0)
-        if dur <= max_seg_dur:
-            return [(st, en, text if text[-1] in STRONG_PUNCT else (text + "。"))]
-        # 需要切成 k 块
-        k = int(dur // max_seg_dur) + 1
-        # 按字符再均匀切
-        L = len(text)
-        piece_len = max(L // k, 1)
-        pos = 0
-        for i in range(k):
-            sub = text[pos:pos + piece_len]
-            if not sub:
-                continue
-            sub_st = st + (i / k) * dur
-            sub_en = st + ((i + 1) / k) * dur
-            if sub[-1] not in STRONG_PUNCT:
-                sub += "。"
-            if sub_en - sub_st < MIN_PIECE_DUR:
-                sub_en = sub_st + MIN_PIECE_DUR
-            out.append((sub_st, sub_en, sub))
-            pos += piece_len
-        # 余数
-        if pos < L:
-            sub = text[pos:]
-            sub_st = st + (len("".join([t for _,_,t in out])) / max(L,1)) * dur
-            sub_en = en
-            if sub and sub[-1] not in STRONG_PUNCT:
-                sub += "。"
-            if sub_en - sub_st < MIN_PIECE_DUR:
-                sub_en = sub_st + MIN_PIECE_DUR
-            out.append((sub_st, sub_en, sub))
-        return out
-    # 遍历逐字时间线
-    for ch, ch_st, ch_en in char_stream:
         if buf_start is None:
-            buf_start = ch_st
-        buf_chars.append(ch)
-        last_char_end = ch_en
-        since_last_comma += 1
-        # 强标点：直接并入当前句并收句
-        if ch in STRONG_PUNCT:
-            flush_sentence(force_punct=False)
-            continue
-        # 逗号式短停顿（可选）
-        if comma_every and since_last_comma >= comma_every:
-            # 只在当前累积达到目标一半以上时才加逗号，避免太碎
-            if len(buf_chars) >= max(6, target_chars // 2):
-                if buf_chars and buf_chars[-1] not in "，,、；;" and buf_chars[-1] not in STRONG_PUNCT:
-                    buf_chars.append("，")
-                flush_sentence(force_punct=False)
-                continue
-            else:
-                since_last_comma = 0  # 重置计数，继续攒
-        # 达到目标长度：收句并补句号
-        if len(buf_chars) >= target_chars:
-            flush_sentence(force_punct=True)
     # 收尾
-    flush_sentence(force_punct=True)
-    # 二次处理：把任何超时长的句子再按时长切块（<= MAX_SEG_DUR）
-    final_segments = []
-    for st, en, tx in segments:
-        if en - st > max_seg_dur:
-            final_segments.extend(try_hard_wrap_long(st, en, tx))
-        else:
-            final_segments.append((st, en, tx if tx[-1] in STRONG_PUNCT else (tx + "。")))
-    return final_segments
 def chunks_to_srt_no_number(chunks: list[dict]) -> str:
     """
-    外层封装：逐 chunk 建立字符时间线 → 合并 → 切分 → 输出无编号 SRT。
     """
     norm = _norm_chunks(chunks)
-    # 构建全局逐字时间线（按 chunk 顺序拼接）
     char_stream = []
     for ch in norm:
         char_stream.extend(_char_timeline(ch))
-    # 切分为短句片段
-    segs = _segment_short_sentences(
-        char_stream,
-        target_chars=TARGET_SENT_CHARS,
-        comma_every=COMMA_EVERY,
-        max_seg_dur=MAX_SEG_DUR,
-    )
     # 输出（无编号）
     lines = []
@@ -223,7 +146,7 @@ def chunks_to_srt_no_number(chunks: list[dict]) -> str:
         lines.append("")
     return "\n".join(lines).strip() + ("\n" if lines else "")
-# ================== 推理与UI ==================
 @spaces.GPU
 def transcribe_file_to_srt(audio_path: str, task: str):
     if not audio_path:
@@ -233,16 +156,22 @@ def transcribe_file_to_srt(audio_path: str, task: str):
         if size_mb > FILE_LIMIT_MB:
             raise gr.Error(f"文件过大：{size_mb:.1f} MB，超过限制 {FILE_LIMIT_MB} MB。")
     except OSError:
         pass
     result = asr(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
     text = (result.get("text") or "").strip()
     chunks = result.get("chunks") or []
     srt_str = chunks_to_srt_no_number(chunks)
     if not srt_str and text:
-        srt_str = "00:00:00,000 --> 00:00:02,000\n" + (text + ("。" if text[-1] not in STRONG_PUNCT else "")) + "\n"
     tmpdir = tempfile.mkdtemp(prefix="srt_")
     base = os.path.splitext(os.path.basename(audio_path))[0] or "subtitle"
     srt_path = os.path.join(tmpdir, f"{base}.srt")
@@ -258,12 +187,12 @@ demo = gr.Interface(
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs=[
-        gr.Textbox(label="Transcript (SRT Preview — short sentences, no numbering)", lines=18),
         gr.File(label="Download SRT"),
     ],
-    title="Upload Audio → SRT (Short Sentences, No Numbering)",
-    description=f"Character-timeline resegmentation. Natural short sentences like “他跟三国志他不一样。/ 他也是在那个基础上。/ …”. Model: {MODEL_NAME}",
     allow_flagging="never",
 )
-demo.queue().launch()

 import tempfile
 import os
+# ================== 配置与可调参数 ==================
 MODEL_NAME = "openai/whisper-large-v3"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
+# —— 静音切分策略（本次需求）——
+SILENCE_GAP = 0.2   # split when a character starts >= 0.2 s after the previous character ended
+MIN_SEG_DUR = 0.30  # 每段最小时长，避免字幕闪烁
+# 设备/精度
 device = 0 if torch.cuda.is_available() else "cpu"
 dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# ASR 推理器：开启 chunk 级时间戳
 asr = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
     chunk_length_s=30,
     device=device,
     torch_dtype=dtype,
+    return_timestamps=True,   # 返回 chunk 级别时间戳（start, end）
 )
+# ================== 工具函数 ==================
 def _ts(t: float | None) -> str:
+    """将秒数转为 SRT 时间戳 00:00:00,000"""
     if t is None or t < 0:
         t = 0.0
     ms = int(float(t) * 1000 + 0.5)
     return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
 def _norm_chunks(chunks: list[dict]) -> list[dict]:
+    """
+    规范化 chunks: [{'text': str, 'start': float, 'end': float}]
+    兼容不同字段名：'timestamp' 或 'timestamps'
+    """
     out = []
     for ch in chunks or []:
         text = (ch.get("text") or "").strip()
+        ts = ch.get("timestamp") or ch.get("timestamps") or [0.0, 0.0]
         if not text:
             continue
         if isinstance(ts, (list, tuple)) and len(ts) == 2:
             s, e = float(ts[0] or 0.0), float(ts[1] or 0.0)
         else:
+            s, e = 0.0, 0.0
         if e < s:
             e = s
         out.append({"text": text, "start": s, "end": e})
 def _char_timeline(chunk: dict) -> list[tuple[str, float, float]]:
     """
+    对单个 chunk 逐字符线性插值时间：
     返回 [(char, char_start, char_end), ...]
     """
     text = chunk["text"]
         cur = nxt
     return timeline
+# ================== 按静音间隔切分（不依赖标点） ==================
def _segment_by_silence(char_stream: list[tuple[str, float, float]],
                        silence_gap: float = SILENCE_GAP,
                        min_seg_dur: float = MIN_SEG_DUR) -> list[tuple[float, float, str]]:
    """
    Split a per-character timeline into segments at silent gaps.

    A new segment begins whenever a character starts at least
    ``silence_gap`` seconds after the previous character ended. Punctuation
    plays no role: characters are concatenated exactly as they arrive.

    Args:
        char_stream: ``(char, char_start, char_end)`` tuples in playback order.
        silence_gap: Minimum gap, in seconds, between the previous character's
            end and the current character's start that triggers a split.
        min_seg_dur: Minimum duration of an emitted segment; shorter segments
            have their end time pushed out to ``start + min_seg_dur``.

    Returns:
        ``(start, end, text)`` tuples in input order. ``text`` is stripped of
        surrounding whitespace, and segments whose text is empty after
        stripping are dropped so no blank cue is ever produced.
    """
    segments: list[tuple[float, float, str]] = []
    buf_chars: list[str] = []
    buf_start: float | None = None
    prev_end: float | None = None  # end time of the most recently consumed character

    def flush() -> None:
        """Emit the buffered characters as one segment and reset the buffer."""
        nonlocal buf_chars, buf_start
        text = "".join(buf_chars).strip()
        # Skip whitespace-only buffers: an empty cue carries no subtitle text.
        if text and buf_start is not None and prev_end is not None:
            seg_start, seg_end = buf_start, prev_end
            # NOTE: this padding can make the segment overlap the next one when
            # the following gap is shorter than the padding that was added.
            if seg_end - seg_start < min_seg_dur:
                seg_end = seg_start + min_seg_dur
            segments.append((seg_start, seg_end, text))
        buf_chars, buf_start = [], None

    for ch, st, en in char_stream:
        # Close the running segment before consuming a character that follows
        # a long enough pause.
        if prev_end is not None and (st - prev_end) >= silence_gap:
            flush()
        if buf_start is None:
            buf_start = st  # first character of a new segment fixes its start
        buf_chars.append(ch)
        prev_end = en

    # Emit whatever is left after the last character.
    flush()
    return segments
 def chunks_to_srt_no_number(chunks: list[dict]) -> str:
     """
+    流程：规范化 chunks → 全局逐字符时间线 → 按静音切分 → 输出无编号 SRT
     """
     norm = _norm_chunks(chunks)
+    # 拼接全局字符流
     char_stream = []
     for ch in norm:
         char_stream.extend(_char_timeline(ch))
+    # 静音切分
+    segs = _segment_by_silence(char_stream)
     # 输出（无编号）
     lines = []
         lines.append("")
     return "\n".join(lines).strip() + ("\n" if lines else "")
+# ================== 推理与 UI ==================
 @spaces.GPU
 def transcribe_file_to_srt(audio_path: str, task: str):
     if not audio_path:
         if size_mb > FILE_LIMIT_MB:
             raise gr.Error(f"文件过大：{size_mb:.1f} MB，超过限制 {FILE_LIMIT_MB} MB。")
     except OSError:
+        # 某些远端路径可能拿不到大小，忽略即可
         pass
+    # 运行 ASR
     result = asr(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
     text = (result.get("text") or "").strip()
     chunks = result.get("chunks") or []
+    # 基于静音切分生成 SRT（无编号）
     srt_str = chunks_to_srt_no_number(chunks)
+    # 兜底：若无 chunks，则给出整段兜底字幕
     if not srt_str and text:
+        srt_str = "00:00:00,000 --> 00:00:02,000\n" + text + "\n"
+    # 写入临时文件，供下载
     tmpdir = tempfile.mkdtemp(prefix="srt_")
     base = os.path.splitext(os.path.basename(audio_path))[0] or "subtitle"
     srt_path = os.path.join(tmpdir, f"{base}.srt")
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs=[
+        gr.Textbox(label="Transcript (SRT Preview — silence-based, no numbering)", lines=18),
         gr.File(label="Download SRT"),
     ],
+    title="Upload Audio → SRT (Silence-Based Segmentation, No Numbering)",
+    description=f"切分规则：相邻字符静音间隔 ≥ {SILENCE_GAP}s 则切段；不依赖标点。模型：{MODEL_NAME}",
     allow_flagging="never",
 )
+demo.queue().launch()