whisper-large-v3-srt

Running on Zero

App Files Files Community

datxy committed 18 days ago

Commit

1a6e7e3

verified ·

1 Parent(s): fef4a21

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -2

app.py CHANGED Viewed

@@ -15,12 +15,13 @@ BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
 # 默认值（UI 初始值）
-DEF_SILENCE_MIN_LEN = 0.54   # 停顿（静音段）最短持续秒数
 DEF_DB_DROP         = 25.0   # 相对峰值下落阈值（max_db - DB_DROP）
 DEF_PCTL_FLOOR      = 20.0   # 能量分位（dB）下限（越大越保守）
 DEF_MIN_SEG_DUR     = 0.30   # 每段最短显示时长
 DEF_FRAME_LEN_MS    = 25     # 能量分析帧长
 DEF_HOP_LEN_MS      = 10     # 帧移
 # ================== 设备/精度（自动兼容 GPU/CPU/ZeroGPU） ==================
 def _pick_device_and_dtype():
@@ -152,6 +153,46 @@ def _detect_silence_cuts(
     return cuts, dur
 # ================== 用切分边界合并逐字符时间线 ==================
 def _segment_by_energy(
     char_stream: list[tuple[str, float, float]],
@@ -203,6 +244,7 @@ def _srt_from_chunks_with_energy(
     min_seg_dur: float,
     frame_len_ms: float,
     hop_len_ms: float,
 ) -> str:
     norm = _norm_chunks(chunks)
     char_stream = []
@@ -218,6 +260,15 @@ def _srt_from_chunks_with_energy(
         pctl_floor=pctl_floor,
     )
     segs = _segment_by_energy(char_stream, cut_times, min_seg_dur=min_seg_dur)
     lines = []
@@ -243,6 +294,7 @@ def transcribe_file_to_srt(
     min_seg_dur: float,
     frame_len_ms: float,
     hop_len_ms: float,
 ):
     if not audio_path:
         raise gr.Error("请先上传音频文件。")
@@ -267,6 +319,7 @@ def transcribe_file_to_srt(
         min_seg_dur=min_seg_dur,
         frame_len_ms=frame_len_ms,
         hop_len_ms=hop_len_ms,
     )
     if not srt_str and text:
@@ -300,6 +353,10 @@ with gr.Blocks(title="Upload Audio → SRT (Energy/VAD-Based, Tunable)") as demo
                                 label="相对峰值下落阈值 (dB) — 越大越容易视为静音")
             pctl_floor = gr.Slider(0, 60, value=DEF_PCTL_FLOOR, step=1,
                                    label="能量分位下限 (dB) — 提高可在噪声环境下更保守")
     with gr.Accordion("Advanced (Frame Settings)", open=False):
         with gr.Row():
             frame_len_ms = gr.Slider(10, 50, value=DEF_FRAME_LEN_MS, step=1, label="帧长 (ms)")
@@ -311,7 +368,11 @@ with gr.Blocks(title="Upload Audio → SRT (Energy/VAD-Based, Tunable)") as demo
     run_btn.click(
         fn=transcribe_file_to_srt,
-        inputs=[audio_in, task, silence_min_len, db_drop, pctl_floor, min_seg_dur, frame_len_ms, hop_len_ms],
         outputs=[srt_preview, srt_file],
     )

 FILE_LIMIT_MB = 1000
 # 默认值（UI 初始值）
+DEF_SILENCE_MIN_LEN = 0.45   # 停顿（静音段）最短持续秒数
 DEF_DB_DROP         = 25.0   # 相对峰值下落阈值（max_db - DB_DROP）
 DEF_PCTL_FLOOR      = 20.0   # 能量分位（dB）下限（越大越保守）
 DEF_MIN_SEG_DUR     = 0.30   # 每段最短显示时长
 DEF_FRAME_LEN_MS    = 25     # 能量分析帧长
 DEF_HOP_LEN_MS      = 10     # 帧移
+DEF_CUT_OFFSET_SEC  = 0.00   # 切分偏移：负数=左移，正数=右移
 # ================== 设备/精度（自动兼容 GPU/CPU/ZeroGPU） ==================
 def _pick_device_and_dtype():
     return cuts, dur
+# =============== 切点偏移（左移=负，右移=正） ===============
+def _offset_cut_positions(
+    cut_times: list[float],
+    char_stream: list[tuple[str, float, float]],
+    offset_sec: float = 0.0,
+    min_gap: float = 0.08,
+    guard: float = 0.03
+) -> list[float]:
+    """
+    将每个切点整体平移 offset_sec 秒：
+      - 负值：向左（提前）
+      - 正值：向右（延后）
+    并确保切点递增，距离上一个切点 >= min_gap，且不越过全局起止边界（guard保护）。
+    """
+    if not cut_times or not char_stream or offset_sec == 0.0:
+        return cut_times
+    starts = [st for _, st, _ in char_stream]
+    ends   = [en for _, _, en in char_stream]
+    global_start = starts[0] if starts else 0.0
+    global_end   = ends[-1] if ends else (cut_times[-1] if cut_times else 0.0)
+    shifted = []
+    for t in cut_times:
+        new_t = t + offset_sec
+        new_t = max(new_t, global_start + guard)
+        new_t = min(new_t, global_end - guard)
+        shifted.append(float(new_t))
+    # 排序、去重、并施加最小间隔
+    shifted = sorted(shifted)
+    filtered = []
+    for t in shifted:
+        if not filtered or t - filtered[-1] >= min_gap:
+            filtered.append(t)
+        else:
+            # 如果间隔太小，略过该切点（也可以选择推后到 min_gap）
+            continue
+    return filtered
 # ================== 用切分边界合并逐字符时间线 ==================
 def _segment_by_energy(
     char_stream: list[tuple[str, float, float]],
     min_seg_dur: float,
     frame_len_ms: float,
     hop_len_ms: float,
+    cut_offset_sec: float,   # ★ 新增：切分偏移（负=左移，正=右移）
 ) -> str:
     norm = _norm_chunks(chunks)
     char_stream = []
         pctl_floor=pctl_floor,
     )
+    # ★ 应用偏移（例如左移 0.10：-0.10；右移 0.20：+0.20）
+    cut_times = _offset_cut_positions(
+        cut_times,
+        char_stream,
+        offset_sec=cut_offset_sec,
+        min_gap=0.08,
+        guard=0.03,
+    )
     segs = _segment_by_energy(char_stream, cut_times, min_seg_dur=min_seg_dur)
     lines = []
     min_seg_dur: float,
     frame_len_ms: float,
     hop_len_ms: float,
+    cut_offset_sec: float,     # ★ 新增：UI 传入偏移
 ):
     if not audio_path:
         raise gr.Error("请先上传音频文件。")
         min_seg_dur=min_seg_dur,
         frame_len_ms=frame_len_ms,
         hop_len_ms=hop_len_ms,
+        cut_offset_sec=cut_offset_sec,   # ★ 传入偏移
     )
     if not srt_str and text:
                                 label="相对峰值下落阈值 (dB) — 越大越容易视为静音")
             pctl_floor = gr.Slider(0, 60, value=DEF_PCTL_FLOOR, step=1,
                                    label="能量分位下限 (dB) — 提高可在噪声环境下更保守")
+        with gr.Row():
+            cut_offset_sec = gr.Slider(-0.50, 0.50, value=DEF_CUT_OFFSET_SEC, step=0.01,
+                                       label="切分偏移 (s) — 负数=左移(提前)，正数=右移(延后)")
     with gr.Accordion("Advanced (Frame Settings)", open=False):
         with gr.Row():
             frame_len_ms = gr.Slider(10, 50, value=DEF_FRAME_LEN_MS, step=1, label="帧长 (ms)")
     run_btn.click(
         fn=transcribe_file_to_srt,
+        inputs=[
+            audio_in, task,
+            silence_min_len, db_drop, pctl_floor, min_seg_dur,
+            frame_len_ms, hop_len_ms, cut_offset_sec  # ★ 新增偏移输入
+        ],
         outputs=[srt_preview, srt_file],
     )