whisper-large-v3-srt

Running on Zero

App Files Files Community

datxy committed 20 days ago

Commit

cf205ff

verified ·

1 Parent(s): 261c97f

Update app.py

Browse files

Files changed (1) hide show

app.py +198 -87

app.py CHANGED Viewed

@@ -6,112 +6,224 @@ from transformers import pipeline
 import tempfile
 import os
-# ===== Parameters =====
 MODEL_NAME = "openai/whisper-large-v3"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
-MAX_SEG_DUR = 6.0  # default cap (seconds) on the duration of one subtitle piece in chunks_to_srt
-MAX_SEG_CHARS = 28  # default cap on the number of characters in one subtitle piece
 device = 0 if torch.cuda.is_available() else "cpu"
 dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
     chunk_length_s=30,
     device=device,
     torch_dtype=dtype,
-    return_timestamps=True,
 )
-# ===== Timestamp formatting =====
-def _srt_timestamp(seconds: float | None) -> str:
-    if seconds is None or seconds < 0:
-        seconds = 0.0
-    ms = int(float(seconds) * 1000 + 0.5)
     h, ms = divmod(ms, 3600000)
     m, ms = divmod(ms, 60000)
     s, ms = divmod(ms, 1000)
     return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
-# ===== Text splitting + automatic punctuation =====
-def _split_text_units(txt: str, max_seg_chars: int) -> list[str]:
-    strong = "。！？.!?"
-    units, cur = [], []
-    for ch in txt:
-        cur.append(ch)
-        if ch in strong:
-            units.append("".join(cur).strip())
-            cur = []
-    if cur:
-        units.append("".join(cur).strip())
-    refined = []
-    for u in units:
-        if len(u) <= max_seg_chars:
-            refined.append(u)
-        else:
-            # 长句继续切分，并自动补句号
-            for i in range(0, len(u), max_seg_chars):
-                piece = u[i:i+max_seg_chars].strip()
-                if piece and piece[-1] not in strong:
-                    piece += "。"
-                refined.append(piece)
-    # 如果最后一段没有标点，补句号
-    if refined and refined[-1][-1] not in strong:
-        refined[-1] += "。"
-    return [x for x in refined if x]
-# ===== chunks to SRT (no numbering + automatic punctuation) =====
def chunks_to_srt(chunks: list[dict],
                  max_seg_dur: float = MAX_SEG_DUR,
                  max_seg_chars: int = MAX_SEG_CHARS) -> str:
    """Render ASR chunks as un-numbered SRT text.

    Each chunk's text is split with ``_split_text_units``; the chunk's time
    span is shared between the units in proportion to their length (at least
    0.2 s each). A unit whose share exceeds *max_seg_dur* is sliced again
    into *max_seg_chars*-wide pieces that split the share evenly, capped at
    *max_seg_dur* per piece.

    Args:
        chunks: Items with a ``"text"`` entry and a ``"timestamp"`` (or
            ``"timestamps"``) pair; a missing or malformed pair falls back to
            ``(0.0, 2.0)``.
        max_seg_dur: Maximum duration of one cue in seconds.
        max_seg_chars: Maximum characters per cue before appended punctuation.

    Returns:
        The cue blocks joined by newlines with one trailing newline, or an
        empty string when no cue was produced.
    """
    terminal_marks = "。！？.!?"
    lines: list[str] = []

    for chunk in chunks or []:
        text = (chunk.get("text") or "").strip()
        if not text:
            continue

        stamp = chunk.get("timestamp") or chunk.get("timestamps") or [0.0, 2.0]
        c_start, c_end = 0.0, 2.0
        if isinstance(stamp, (list, tuple)) and len(stamp) == 2:
            c_start = float(stamp[0] or 0.0)
            c_end = float(stamp[1] or 0.0)

        units = _split_text_units(text, max_seg_chars) or [text]
        total_chars = sum(map(len, units)) or 1
        total_dur = max(c_end - c_start, 0.0)

        cursor = c_start
        for unit in units:
            share = max(total_dur * (len(unit) / total_chars), 0.2)
            if share <= max_seg_dur:
                pieces = [unit]
                per = share
            else:
                # Slice again and spread the share evenly over the slices.
                pieces = []
                for offset in range(0, len(unit), max_seg_chars):
                    raw = unit[offset:offset + max_seg_chars]
                    if raw.strip():
                        pieces.append(raw.strip() + ("" if raw.endswith("。") else "。"))
                per = min(max_seg_dur, share / max(1, len(pieces)))

            for piece in pieces:
                if piece and piece[-1] not in terminal_marks:
                    piece += "。"
                cue_end = cursor + per
                lines.extend((
                    f"{_srt_timestamp(cursor)} --> {_srt_timestamp(cue_end)}",
                    piece.strip(),
                    "",
                ))
                cursor = cue_end

    return "\n".join(lines).strip() + ("\n" if lines else "")
-# ===== Uploaded audio → SRT =====
 @spaces.GPU
 def transcribe_file_to_srt(audio_path: str, task: str):
     if not audio_path:
@@ -123,13 +235,13 @@ def transcribe_file_to_srt(audio_path: str, task: str):
     except OSError:
         pass
-    result = pipe(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
-    text = result.get("text", "") or ""
     chunks = result.get("chunks") or []
-    srt_str = chunks_to_srt(chunks)
-    if not srt_str and text.strip():
-        srt_str = "00:00:00,000 --> 00:00:02,000\n" + (text.strip() + "。") + "\n"
     tmpdir = tempfile.mkdtemp(prefix="srt_")
     base = os.path.splitext(os.path.basename(audio_path))[0] or "subtitle"
@@ -139,7 +251,6 @@ def transcribe_file_to_srt(audio_path: str, task: str):
     return srt_str, srt_path
-# ===== UI =====
 demo = gr.Interface(
     fn=transcribe_file_to_srt,
     inputs=[
@@ -147,12 +258,12 @@ demo = gr.Interface(
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs=[
-        gr.Textbox(label="Transcript (SRT Preview)", lines=18),
         gr.File(label="Download SRT"),
     ],
-    title="Upload Audio → SRT Subtitle",
-    description=f"Upload an audio file to generate time-stamped SRT subtitles (auto punctuation, no numbering). Model: {MODEL_NAME}",
     allow_flagging="never",
 )
-demo.queue().launch()

 import tempfile
 import os
+# ================== Tunable parameters ==================
 MODEL_NAME = "openai/whisper-large-v3"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
+TARGET_SENT_CHARS = 12     # target number of characters per sentence (tuned for Chinese text)
+COMMA_EVERY = 0            # for finer-grained short pauses set e.g. 6 or 8 (insert a "，" every N characters and close the sentence); 0 disables this
+MAX_SEG_DUR = 6.0          # maximum duration of one sentence (seconds)
+MIN_PIECE_DUR = 0.30       # minimum duration of one sentence (seconds), to avoid flicker
+STRONG_PUNCT = "。！？.!?"
 device = 0 if torch.cuda.is_available() else "cpu"
 dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+asr = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
     chunk_length_s=30,
     device=device,
     torch_dtype=dtype,
+    return_timestamps=True,   # only chunk-level (start, end) timestamps are needed
 )
+def _ts(t: float | None) -> str:
+    if t is None or t < 0:
+        t = 0.0
+    ms = int(float(t) * 1000 + 0.5)
     h, ms = divmod(ms, 3600000)
     m, ms = divmod(ms, 60000)
     s, ms = divmod(ms, 1000)
     return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+def _norm_chunks(chunks: list[dict]) -> list[dict]:
+    """规范化 chunks: [{'text': str, 'start': float, 'end': float}]"""
+    out = []
     for ch in chunks or []:
         text = (ch.get("text") or "").strip()
+        ts = ch.get("timestamp") or ch.get("timestamps") or [0.0, 2.0]
         if not text:
             continue
         if isinstance(ts, (list, tuple)) and len(ts) == 2:
+            s, e = float(ts[0] or 0.0), float(ts[1] or 0.0)
         else:
+            s, e = 0.0, 2.0
+        if e < s:
+            e = s
+        out.append({"text": text, "start": s, "end": e})
+    return out
+def _char_timeline(chunk: dict) -> list[tuple[str, float, float]]:
+    """
+    把一个 chunk 的文本按字符建立时间轴：
+    返回 [(char, char_start, char_end), ...]
+    """
+    text = chunk["text"]
+    s, e = chunk["start"], chunk["end"]
+    dur = max(e - s, 0.0)
+    n = max(len(text), 1)
+    step = dur / n if n > 0 else 0.0
+    timeline = []
+    cur = s
+    for i, ch in enumerate(text):
+        nxt = s + (i + 1) * step
+        timeline.append((ch, cur, nxt))
+        cur = nxt
+    return timeline
# Pause marks that may legitimately end a cue without an extra full stop.
_SOFT_PAUSE = "，,、；;"


def _close_sentence(text: str) -> str:
    """Return *text* with "。" appended unless it already ends in punctuation."""
    if not text or text[-1] in STRONG_PUNCT or text[-1] in _SOFT_PAUSE:
        return text
    return text + "。"


def _hard_wrap_segment(st: float, en: float, text: str,
                       max_seg_dur: float) -> list[tuple[float, float, str]]:
    """Split one over-long cue into consecutive cues of at most *max_seg_dur* seconds."""
    dur = max(en - st, 0.0)
    # Keep the trailing sentence mark(s) aside so the split can never produce
    # a cue that consists of punctuation only.
    core = text.rstrip(STRONG_PUNCT)
    tail = text[len(core):]
    if dur <= max_seg_dur or max_seg_dur <= 0 or not core:
        return [(st, en, _close_sentence(text))]

    wanted = min(int(dur // max_seg_dur) + 1, len(core))
    # Balanced character split: piece sizes differ by at most one, so there
    # is no left-over remainder needing its own (overlapping) time slot.
    bounds = [(i * len(core)) // wanted for i in range(wanted + 1)]
    pieces = [core[a:b].strip() for a, b in zip(bounds, bounds[1:])]
    pieces = [p for p in pieces if p]
    if not pieces:
        return [(st, en, _close_sentence(text))]
    pieces[-1] += tail

    slot = dur / len(pieces)
    out = []
    for i, piece in enumerate(pieces):
        sub_st = st + i * slot
        # Fewer pieces than wanted (very short text) makes slots longer than
        # the cap; clip the display time rather than exceed max_seg_dur.
        sub_en = min(st + (i + 1) * slot, sub_st + max_seg_dur)
        if sub_en - sub_st < MIN_PIECE_DUR:
            sub_en = sub_st + MIN_PIECE_DUR
        out.append((sub_st, sub_en, _close_sentence(piece)))
    return out


def _segment_short_sentences(char_stream: list[tuple[str, float, float]],
                             target_chars: int = TARGET_SENT_CHARS,
                             comma_every: int = COMMA_EVERY,
                             max_seg_dur: float = MAX_SEG_DUR) -> list[tuple[float, float, str]]:
    """Cut a per-character timeline into short, punctuated subtitle cues.

    Characters are accumulated until one of the following closes the cue:

    * a strong punctuation mark (``STRONG_PUNCT``), which stays attached to
      the cue it ends;
    * optionally, every *comma_every* characters once the cue holds at least
      ``max(6, target_chars // 2)`` characters; a "，" is inserted and the
      cue ends on that comma;
    * the cue reaching *target_chars* characters, in which case "。" is
      appended (unless the cue already ends in a pause mark).

    A cue is never closed while the next character is a strong punctuation
    mark, so punctuation cannot become a cue of its own. Cues shorter than
    ``MIN_PIECE_DUR`` are stretched to that length, and cues longer than
    *max_seg_dur* are split again into evenly timed pieces.

    Args:
        char_stream: ``(char, start, end)`` tuples in reading order.
        target_chars: Character count at which a cue is closed.
        comma_every: Pause interval in characters; ``0`` disables it.
        max_seg_dur: Maximum cue duration in seconds.

    Returns:
        ``(start, end, text)`` tuples in reading order.
    """
    stream = list(char_stream)
    segments: list[tuple[float, float, str]] = []
    buf: list[str] = []
    buf_start = None
    buf_end = None
    since_last_comma = 0

    def flush() -> None:
        """Emit the buffered characters as one cue and reset the buffer."""
        nonlocal buf, buf_start, since_last_comma
        text = "".join(buf).strip()
        st = buf_start if buf_start is not None else 0.0
        en = buf_end if buf_end is not None else st
        buf = []
        buf_start = None
        since_last_comma = 0
        if not text:
            return
        # Duration floor so very short cues do not flicker.
        if en - st < MIN_PIECE_DUR:
            en = st + MIN_PIECE_DUR
        segments.append((st, en, _close_sentence(text)))

    last_index = len(stream) - 1
    for idx, (ch, ch_st, ch_en) in enumerate(stream):
        if buf_start is None:
            buf_start = ch_st
        buf.append(ch)
        buf_end = ch_en
        since_last_comma += 1

        # Look ahead: if a sentence mark follows, let it join this cue
        # before anything closes the cue.
        if idx < last_index and stream[idx + 1][0] in STRONG_PUNCT:
            continue

        if ch in STRONG_PUNCT:
            flush()
            continue

        # Optional comma-style short pause.
        if comma_every and since_last_comma >= comma_every:
            # Only pause once the cue is reasonably long, to avoid fragments.
            if len(buf) >= max(6, target_chars // 2):
                if ch not in _SOFT_PAUSE:
                    buf.append("，")
                flush()
                continue
            since_last_comma = 0  # too short: restart the count, keep filling

        if len(buf) >= target_chars:
            flush()

    flush()

    # Second pass: re-split anything that still exceeds the duration cap.
    final_segments: list[tuple[float, float, str]] = []
    for st, en, tx in segments:
        if en - st > max_seg_dur:
            final_segments.extend(_hard_wrap_segment(st, en, tx, max_seg_dur))
        else:
            final_segments.append((st, en, tx))
    return final_segments
def chunks_to_srt_no_number(chunks: list[dict]) -> str:
    """Convert ASR chunks to SRT text without cue numbers.

    The chunks are normalised, expanded into one per-character timeline in
    chunk order, re-segmented into short sentences with the module-level
    tunables, and written as ``start --> end`` / text / blank-line triples.

    Returns:
        The SRT text ending in a single newline, or an empty string when
        there is nothing to write.
    """
    char_stream = [
        entry
        for chunk in _norm_chunks(chunks)
        for entry in _char_timeline(chunk)
    ]
    segments = _segment_short_sentences(
        char_stream,
        target_chars=TARGET_SENT_CHARS,
        comma_every=COMMA_EVERY,
        max_seg_dur=MAX_SEG_DUR,
    )

    lines: list[str] = []
    for st, en, tx in segments:
        lines += [f"{_ts(st)} --> {_ts(en)}", tx.strip(), ""]
    return "\n".join(lines).strip() + ("\n" if lines else "")
+# ================== Inference and UI ==================
 @spaces.GPU
 def transcribe_file_to_srt(audio_path: str, task: str):
     if not audio_path:
     except OSError:
         pass
+    result = asr(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
+    text = (result.get("text") or "").strip()
     chunks = result.get("chunks") or []
+    srt_str = chunks_to_srt_no_number(chunks)
+    if not srt_str and text:
+        srt_str = "00:00:00,000 --> 00:00:02,000\n" + (text + ("。" if text[-1] not in STRONG_PUNCT else "")) + "\n"
     tmpdir = tempfile.mkdtemp(prefix="srt_")
     base = os.path.splitext(os.path.basename(audio_path))[0] or "subtitle"
     return srt_str, srt_path
 demo = gr.Interface(
     fn=transcribe_file_to_srt,
     inputs=[
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs=[
+        gr.Textbox(label="Transcript (SRT Preview — short sentences, no numbering)", lines=18),
         gr.File(label="Download SRT"),
     ],
+    title="Upload Audio → SRT (Short Sentences, No Numbering)",
+    description=f"Character-timeline resegmentation. Natural short sentences like “他跟三国志他不一样。/ 他也是在那个基础上。/ …”. Model: {MODEL_NAME}",
     allow_flagging="never",
 )
+demo.queue().launch()