datxy committed on
Commit d36ea49 · verified · 1 Parent(s): 38c2998

Update app.py

Files changed (1):
  app.py +138 -54
app.py CHANGED
@@ -1,37 +1,40 @@
  # app.py
  import os
  import spaces
  import torch
  import gradio as gr
  from transformers import pipeline
- import tempfile

  # ================== Configuration and tunable parameters ==================
  MODEL_NAME = "openai/whisper-large-v3"
  BATCH_SIZE = 8
  FILE_LIMIT_MB = 1000

- # —— Silence-splitting strategy (as requested) ——
- SILENCE_GAP = 0.2    # a gap of >= 0.2 s between adjacent characters triggers a cut
- MIN_SEG_DUR = 0.30   # minimum duration per segment, to avoid subtitle flicker

  # ================== Device / precision (auto-downgrade to avoid AcceleratorError) ==================
  def _pick_device_and_dtype():
-     """
-     - GPU Space: CUDA:0, prefer FP16 (sm >= 7)
-     - ZeroGPU / CPU: force CPU (-1) + FP32
-     - Avoids the AcceleratorError caused by @spaces.GPU and float16 on CPU/ZeroGPU
-     """
      is_zero_gpu = (
          os.environ.get("SYSTEM") == "spaces"
          and os.environ.get("SPACE_ACCELERATOR", "").lower() == "zero-gpu"
      )
-
      if torch.cuda.is_available() and not is_zero_gpu:
          try:
-             major, minor = torch.cuda.get_device_capability(0)
          except Exception:
-             major = 7  # safe fallback
          use_fp16 = major >= 7
          return 0, (torch.float16 if use_fp16 else torch.float32)
      else:
@@ -49,7 +52,7 @@ asr = pipeline(
      return_timestamps=True,  # have the result include (start, end) for each chunk
  )

- # ================== Utility functions ==================
  def _ts(t: float | None) -> str:
      """Seconds -> SRT timestamp 00:00:00,000"""
      if t is None or t < 0:
@@ -99,62 +102,142 @@ def _char_timeline(chunk: dict) -> list[tuple[str, float, float]]:
          cur = nxt
      return timeline

- # ================== Split on silence gaps (no reliance on punctuation) ==================
- def _segment_by_silence(char_stream: list[tuple[str, float, float]],
-                         silence_gap: float = SILENCE_GAP,
-                         min_seg_dur: float = MIN_SEG_DUR) -> list[tuple[float, float, str]]:
      """
-     Split on the time gap between adjacent characters:
-     - if (current character's start time - previous character's end time) >= silence_gap, cut a segment immediately
-     - no punctuation handling; all characters are concatenated as-is
-     - minimum-duration protection for each segment
      """
      segments = []
-     buf_chars: list[str] = []
-     buf_start: float | None = None
-     last_end: float | None = None
-     prev_end: float | None = None

-     def flush():
-         nonlocal buf_chars, buf_start, last_end
          if not buf_chars:
-             return
-         text = "".join(buf_chars).strip()
-         st = buf_start if buf_start is not None else 0.0
-         en = last_end if last_end is not None else st
-         if en - st < min_seg_dur:
-             en = st + min_seg_dur
-         segments.append((st, en, text))
-         buf_chars, buf_start, last_end = [], None, None

      for ch, st, en in char_stream:
-         if buf_start is None:
-             buf_start = st
-         if prev_end is not None and (st - prev_end) >= silence_gap:
-             flush()
-             buf_start = st
          buf_chars.append(ch)
-         last_end = en
-         prev_end = en

-     flush()
      return segments

- def chunks_to_srt_no_number(chunks: list[dict]) -> str:
      """
-     Normalize chunks → global per-character timeline → split on silence → emit SRT without numbering
      """
      norm = _norm_chunks(chunks)
      char_stream = []
      for ch in norm:
          char_stream.extend(_char_timeline(ch))

-     segs = _segment_by_silence(char_stream)

      lines = []
      for st, en, tx in segs:
          lines.append(f"{_ts(st)} --> {_ts(en)}")
-         lines.append(tx.strip())
          lines.append("")
      return "\n".join(lines).strip() + ("\n" if lines else "")
@@ -177,13 +260,12 @@ def transcribe_file_to_srt(audio_path: str, task: str):

      # Run ASR
      result = asr(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
-     # Accommodate the transformers return format
      text = (result.get("text") or "").strip() if isinstance(result, dict) else ""
      chunks = result.get("chunks") if isinstance(result, dict) else None
      chunks = chunks or []

-     # Generate the SRT via silence-based splitting (no numbering)
-     srt_str = chunks_to_srt_no_number(chunks)

      # Fallback: if there are no chunks, emit a single whole-recording subtitle
      if not srt_str and text:
@@ -205,12 +287,14 @@ demo = gr.Interface(
          gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
      ],
      outputs=[
-         gr.Textbox(label="Transcript (SRT Preview — silence-based, no numbering)", lines=18),
          gr.File(label="Download SRT"),
      ],
-     title="Upload Audio → SRT (Silence-Based Segmentation, No Numbering)",
-     description=f"Splitting rule: silence gap between adjacent characters ≥ {SILENCE_GAP}s; no reliance on punctuation. Model: {MODEL_NAME}",
      allow_flagging="never",
  )

- demo.queue().launch()
 
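For reference, the removed rule cut a new subtitle whenever the pause between two adjacent character timestamps reached SILENCE_GAP. A minimal standalone sketch of that rule (illustrative only; the character timings are made up, and the buffering is simplified compared with the removed _segment_by_silence):

SILENCE_GAP = 0.2   # seconds of pause that triggers a cut (value from the removed code)
MIN_SEG_DUR = 0.30  # minimum segment duration (value from the removed code)

# (char, start, end) triples such as a per-character Whisper timeline might yield
chars = [("h", 0.00, 0.10), ("i", 0.10, 0.22), ("y", 0.55, 0.70), ("o", 0.72, 0.95)]

segments, buf, seg_start, prev_end = [], [], None, None
for ch, st, en in chars:
    # a long enough pause before this character closes the current segment
    if prev_end is not None and (st - prev_end) >= SILENCE_GAP:
        segments.append((seg_start, max(prev_end, seg_start + MIN_SEG_DUR), "".join(buf)))
        buf, seg_start = [], None
    if seg_start is None:
        seg_start = st
    buf.append(ch)
    prev_end = en
if buf:
    segments.append((seg_start, max(prev_end, seg_start + MIN_SEG_DUR), "".join(buf)))

print(segments)  # [(0.0, 0.3, 'hi'), (0.55, 0.95, 'yo')]

The updated file below replaces this per-character gap rule with silence boundaries detected from the audio signal itself.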
@@ -1,37 +1,40 @@
  # app.py
  import os
+ import tempfile
+ import numpy as np
+
  import spaces
  import torch
  import gradio as gr
  from transformers import pipeline
+
+ # used for audio energy analysis
+ import librosa

  # ================== Configuration and tunable parameters ==================
  MODEL_NAME = "openai/whisper-large-v3"
  BATCH_SIZE = 8
  FILE_LIMIT_MB = 1000

+ # —— Silence-splitting strategy (energy/VAD style) ——
+ SILENCE_MIN_LEN = 0.20   # minimum duration (s) of a silent stretch; only silence lasting this long becomes a cut point
+ FRAME_LEN_MS = 25        # frame length for energy analysis (ms)
+ HOP_LEN_MS = 10          # hop length (ms)
+ DB_DROP = 25.0           # drop relative to the peak (below max energy - 25 dB counts as a silence candidate)
+ PCTL_FLOOR = 20.0        # also consult the 20th energy percentile, so a very low noise floor does not cause over-splitting
+ MIN_SEG_DUR = 0.30       # minimum duration per segment, to avoid subtitle flicker

  # ================== Device / precision (auto-downgrade to avoid AcceleratorError) ==================
  def _pick_device_and_dtype():
      is_zero_gpu = (
          os.environ.get("SYSTEM") == "spaces"
          and os.environ.get("SPACE_ACCELERATOR", "").lower() == "zero-gpu"
      )
      if torch.cuda.is_available() and not is_zero_gpu:
          try:
+             major, _ = torch.cuda.get_device_capability(0)
          except Exception:
+             major = 7
          use_fp16 = major >= 7
          return 0, (torch.float16 if use_fp16 else torch.float32)
      else:
@@ -49,7 +52,7 @@ asr = pipeline(
      return_timestamps=True,  # have the result include (start, end) for each chunk
  )

+ # ================== Basic utilities ==================
  def _ts(t: float | None) -> str:
      """Seconds -> SRT timestamp 00:00:00,000"""
      if t is None or t < 0:
@@ -99,62 +102,142 @@ def _char_timeline(chunk: dict) -> list[tuple[str, float, float]]:
          cur = nxt
      return timeline

+ # ================== Audio energy analysis (librosa) ==================
+ def _load_audio_mono(audio_path: str, sr: int = 16000):
      """
+     Load with librosa as mono at the target sample rate.
+     Returns: (y: np.ndarray[float32], sr: int)
      """
+     y, ysr = librosa.load(audio_path, sr=sr, mono=True)
+     if y.size == 0:
+         return np.zeros(1, dtype=np.float32), sr
+     return y.astype(np.float32), sr
+
+ def _detect_silence_cuts(audio_path: str,
+                          min_silence_len: float = SILENCE_MIN_LEN,
+                          frame_len_ms: float = FRAME_LEN_MS,
+                          hop_len_ms: float = HOP_LEN_MS,
+                          db_drop: float = DB_DROP,
+                          pctl_floor: float = PCTL_FLOOR) -> tuple[list[float], float]:
+     """
+     Return the cut-boundary times in seconds (at the centre of each silent stretch) and the total audio duration.
+     - The threshold is the higher of (max_db - db_drop) and the pctl_floor-th percentile (more conservative).
+     - A stretch that stays below the threshold for >= min_silence_len is recorded as silence.
+     """
+     y, sr = _load_audio_mono(audio_path, sr=16000)
+     dur = len(y) / float(sr)
+     if dur <= 0:
+         return [], 0.0
+
+     frame_len = int(sr * frame_len_ms / 1000.0)  # e.g., 25 ms
+     hop_len = int(sr * hop_len_ms / 1000.0)      # e.g., 10 ms
+     frame_len = max(frame_len, 1)
+     hop_len = max(hop_len, 1)
+
+     # RMS energy (librosa returns shape=(1, n_frames))
+     rms = librosa.feature.rms(y=y, frame_length=frame_len, hop_length=hop_len, center=True)[0]
+     # Convert to dB, avoiding log(0)
+     rms_db = 20.0 * np.log10(np.maximum(rms, 1e-9))
+
+     max_db = float(np.max(rms_db))
+     floor_db = float(np.percentile(rms_db, pctl_floor))
+     thr = max(max_db - db_drop, floor_db)  # adaptive threshold
+
+     below = rms_db < thr
+     # Find runs of consecutive True values
+     cuts = []
+     i = 0
+     n = len(below)
+     min_frames = int(np.ceil(min_silence_len * sr / hop_len))  # minimum number of consecutive frames, in hops
+
+     while i < n:
+         if below[i]:
+             j = i + 1
+             while j < n and below[j]:
+                 j += 1
+             # [i, j) is a continuous silent stretch
+             span_frames = j - i
+             if span_frames * hop_len / sr >= min_silence_len:
+                 # Use the centre of the silent stretch as the cut point (more stable)
+                 mid_frame = i + span_frames // 2
+                 cut_time = mid_frame * hop_len / sr
+                 # Ignore cut points too close to 0 or to the end
+                 if 0.05 < cut_time < (dur - 0.05):
+                     cuts.append(float(cut_time))
+             i = j
+         else:
+             i += 1
+
+     return cuts, dur
+
+ # ================== Merge the character timeline at the silence cut boundaries ==================
+ def _segment_by_energy(char_stream: list[tuple[str, float, float]],
+                        cut_times: list[float],
+                        min_seg_dur: float = MIN_SEG_DUR) -> list[tuple[float, float, str]]:
+     """
+     Split the global per-character timeline at cut_times (silence centre points).
+     """
+     if not char_stream:
+         return []
+
+     start_time = char_stream[0][1]
+     end_time = char_stream[-1][2]
+     boundaries = [t for t in cut_times if start_time < t < end_time]
+     boundaries = sorted(set(boundaries))
+
      segments = []
+     buf_chars = []
+     seg_start = start_time
+     boundary_idx = 0

+     def flush(seg_end):
+         nonlocal buf_chars, seg_start
          if not buf_chars:
+             # Even with no characters (edge case), still protect the duration
+             st = seg_start
+             en = max(seg_start + min_seg_dur, seg_end)
+             segments.append((st, en, ""))
+         else:
+             st = seg_start
+             en = max(seg_start + min_seg_dur, seg_end)
+             text = "".join(buf_chars).strip()
+             segments.append((st, en, text))
+         buf_chars = []

+     # Advance character by character; flush at each boundary
      for ch, st, en in char_stream:
+         # Handle a character that may span several boundaries (rare)
+         while boundary_idx < len(boundaries) and boundaries[boundary_idx] <= st:
+             cut_t = boundaries[boundary_idx]
+             flush(seg_end=cut_t)
+             seg_start = cut_t
+             boundary_idx += 1
+
          buf_chars.append(ch)

+     # Finish up
+     flush(seg_end=end_time)
      return segments

+ def chunks_to_srt_no_number(chunks: list[dict], audio_path: str) -> str:
      """
+     Normalize chunks → global per-character timeline → split at silence boundaries from energy analysis → emit SRT without numbering
      """
      norm = _norm_chunks(chunks)
      char_stream = []
      for ch in norm:
          char_stream.extend(_char_timeline(ch))

+     # Detect silence cut points from the audio
+     cut_times, _dur = _detect_silence_cuts(audio_path)
+
+     # Merge characters using the energy-based cut boundaries
+     segs = _segment_by_energy(char_stream, cut_times)

      lines = []
      for st, en, tx in segs:
          lines.append(f"{_ts(st)} --> {_ts(en)}")
+         lines.append(tx)
          lines.append("")
      return "\n".join(lines).strip() + ("\n" if lines else "")
@@ -177,13 +260,12 @@ def transcribe_file_to_srt(audio_path: str, task: str):

      # Run ASR
      result = asr(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
      text = (result.get("text") or "").strip() if isinstance(result, dict) else ""
      chunks = result.get("chunks") if isinstance(result, dict) else None
      chunks = chunks or []

+     # Generate the SRT from "energy silence" detection (no numbering)
+     srt_str = chunks_to_srt_no_number(chunks, audio_path)

      # Fallback: if there are no chunks, emit a single whole-recording subtitle
      if not srt_str and text:
@@ -205,12 +287,14 @@ demo = gr.Interface(
          gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
      ],
      outputs=[
+         gr.Textbox(label="Transcript (SRT Preview — energy-based silence, no numbering)", lines=18),
          gr.File(label="Download SRT"),
      ],
+     title="Upload Audio → SRT (Energy/VAD-Based Segmentation, No Numbering)",
+     description=(
+         f"Splitting rule: measure per-frame audio energy (RMS → dB) and cut at the centre of any silence lasting ≥ {SILENCE_MIN_LEN}s; no reliance on punctuation. Model: {MODEL_NAME}"
+     ),
      allow_flagging="never",
  )

+ demo.queue().launch()
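A standalone sketch of the energy-based cut detection added in this commit, mirroring the thresholding in _detect_silence_cuts (25 ms frames, 10 ms hop, 25 dB drop, 20th percentile floor, 0.20 s minimum silence, all values from the diff). It assumes numpy and librosa are installed; the two-tone test signal is made up for illustration:

import numpy as np
import librosa

sr = 16000
tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(int(0.8 * sr)) / sr)
gap = np.zeros(int(0.6 * sr))                      # 0.6 s of silence between two 0.8 s tones
y = np.concatenate([tone, gap, tone]).astype(np.float32)

frame_len, hop_len = int(0.025 * sr), int(0.010 * sr)   # 25 ms frames, 10 ms hop
rms = librosa.feature.rms(y=y, frame_length=frame_len, hop_length=hop_len, center=True)[0]
rms_db = 20.0 * np.log10(np.maximum(rms, 1e-9))

# Adaptive threshold: 25 dB below the peak, but never below the 20th percentile
thr = max(rms_db.max() - 25.0, np.percentile(rms_db, 20.0))

below = rms_db < thr
cuts, i = [], 0
while i < len(below):
    if below[i]:
        j = i
        while j < len(below) and below[j]:
            j += 1
        if (j - i) * hop_len / sr >= 0.20:                    # silence lasted at least SILENCE_MIN_LEN
            cuts.append((i + (j - i) // 2) * hop_len / sr)    # midpoint of the silent run
        i = j
    else:
        i += 1

print(cuts)  # roughly [1.1], the centre of the silent gap between 0.8 s and 1.4 s

The resulting cut times are what _segment_by_energy uses as boundaries when it groups the per-character Whisper timeline into subtitle lines.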