datxy committed on
Commit
1a6e7e3
·
verified ·
1 Parent(s): fef4a21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -2
app.py CHANGED
@@ -15,12 +15,13 @@ BATCH_SIZE = 8
15
  FILE_LIMIT_MB = 1000
16
 
17
  # 默认值(UI 初始值)
18
- DEF_SILENCE_MIN_LEN = 0.54 # 停顿(静音段)最短持续秒数
19
  DEF_DB_DROP = 25.0 # 相对峰值下落阈值(max_db - DB_DROP)
20
  DEF_PCTL_FLOOR = 20.0 # 能量分位(dB)下限(越大越保守)
21
  DEF_MIN_SEG_DUR = 0.30 # 每段最短显示时长
22
  DEF_FRAME_LEN_MS = 25 # 能量分析帧长
23
  DEF_HOP_LEN_MS = 10 # 帧移
 
24
 
25
  # ================== 设备/精度(自动兼容 GPU/CPU/ZeroGPU) ==================
26
  def _pick_device_and_dtype():
@@ -152,6 +153,46 @@ def _detect_silence_cuts(
152
 
153
  return cuts, dur
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  # ================== 用切分边界合并逐字符时间线 ==================
156
  def _segment_by_energy(
157
  char_stream: list[tuple[str, float, float]],
@@ -203,6 +244,7 @@ def _srt_from_chunks_with_energy(
203
  min_seg_dur: float,
204
  frame_len_ms: float,
205
  hop_len_ms: float,
 
206
  ) -> str:
207
  norm = _norm_chunks(chunks)
208
  char_stream = []
@@ -218,6 +260,15 @@ def _srt_from_chunks_with_energy(
218
  pctl_floor=pctl_floor,
219
  )
220
 
 
 
 
 
 
 
 
 
 
221
  segs = _segment_by_energy(char_stream, cut_times, min_seg_dur=min_seg_dur)
222
 
223
  lines = []
@@ -243,6 +294,7 @@ def transcribe_file_to_srt(
243
  min_seg_dur: float,
244
  frame_len_ms: float,
245
  hop_len_ms: float,
 
246
  ):
247
  if not audio_path:
248
  raise gr.Error("请先上传音频文件。")
@@ -267,6 +319,7 @@ def transcribe_file_to_srt(
267
  min_seg_dur=min_seg_dur,
268
  frame_len_ms=frame_len_ms,
269
  hop_len_ms=hop_len_ms,
 
270
  )
271
 
272
  if not srt_str and text:
@@ -300,6 +353,10 @@ with gr.Blocks(title="Upload Audio → SRT (Energy/VAD-Based, Tunable)") as demo
300
  label="相对峰值下落阈值 (dB) — 越大越容易视为静音")
301
  pctl_floor = gr.Slider(0, 60, value=DEF_PCTL_FLOOR, step=1,
302
  label="能量分位下限 (dB) — 提高可在噪声环境下更保守")
 
 
 
 
303
  with gr.Accordion("Advanced (Frame Settings)", open=False):
304
  with gr.Row():
305
  frame_len_ms = gr.Slider(10, 50, value=DEF_FRAME_LEN_MS, step=1, label="帧长 (ms)")
@@ -311,7 +368,11 @@ with gr.Blocks(title="Upload Audio → SRT (Energy/VAD-Based, Tunable)") as demo
311
 
312
  run_btn.click(
313
  fn=transcribe_file_to_srt,
314
- inputs=[audio_in, task, silence_min_len, db_drop, pctl_floor, min_seg_dur, frame_len_ms, hop_len_ms],
 
 
 
 
315
  outputs=[srt_preview, srt_file],
316
  )
317
 
 
15
  FILE_LIMIT_MB = 1000
16
 
17
  # 默认值(UI 初始值)
18
+ DEF_SILENCE_MIN_LEN = 0.45 # 停顿(静音段)最短持续秒数
19
  DEF_DB_DROP = 25.0 # 相对峰值下落阈值(max_db - DB_DROP)
20
  DEF_PCTL_FLOOR = 20.0 # 能量分位(dB)下限(越大越保守)
21
  DEF_MIN_SEG_DUR = 0.30 # 每段最短显示时长
22
  DEF_FRAME_LEN_MS = 25 # 能量分析帧长
23
  DEF_HOP_LEN_MS = 10 # 帧移
24
+ DEF_CUT_OFFSET_SEC = 0.00 # 切分偏移:负数=左移,正数=右移
25
 
26
  # ================== 设备/精度(自动兼容 GPU/CPU/ZeroGPU) ==================
27
  def _pick_device_and_dtype():
 
153
 
154
  return cuts, dur
155
 
156
+ # =============== 切点偏移(左移=负,右移=正) ===============
157
+ def _offset_cut_positions(
158
+ cut_times: list[float],
159
+ char_stream: list[tuple[str, float, float]],
160
+ offset_sec: float = 0.0,
161
+ min_gap: float = 0.08,
162
+ guard: float = 0.03
163
+ ) -> list[float]:
164
+ """
165
+ 将每个切点整体平移 offset_sec 秒:
166
+ - 负值:向左(提前)
167
+ - 正值:向右(延后)
168
+ 并确保切点递增,距离上一个切点 >= min_gap,且不越过全局起止边界(guard保护)。
169
+ """
170
+ if not cut_times or not char_stream or offset_sec == 0.0:
171
+ return cut_times
172
+
173
+ starts = [st for _, st, _ in char_stream]
174
+ ends = [en for _, _, en in char_stream]
175
+ global_start = starts[0] if starts else 0.0
176
+ global_end = ends[-1] if ends else (cut_times[-1] if cut_times else 0.0)
177
+
178
+ shifted = []
179
+ for t in cut_times:
180
+ new_t = t + offset_sec
181
+ new_t = max(new_t, global_start + guard)
182
+ new_t = min(new_t, global_end - guard)
183
+ shifted.append(float(new_t))
184
+
185
+ # 排序、去重、并施加最小间隔
186
+ shifted = sorted(shifted)
187
+ filtered = []
188
+ for t in shifted:
189
+ if not filtered or t - filtered[-1] >= min_gap:
190
+ filtered.append(t)
191
+ else:
192
+ # 如果间隔太小,略过该切点(也可以选择推后到 min_gap)
193
+ continue
194
+ return filtered
195
+
196
  # ================== 用切分边界合并逐字符时间线 ==================
197
  def _segment_by_energy(
198
  char_stream: list[tuple[str, float, float]],
 
244
  min_seg_dur: float,
245
  frame_len_ms: float,
246
  hop_len_ms: float,
247
+ cut_offset_sec: float, # ★ 新增:切分偏移(负=左移,正=右移)
248
  ) -> str:
249
  norm = _norm_chunks(chunks)
250
  char_stream = []
 
260
  pctl_floor=pctl_floor,
261
  )
262
 
263
+ # ★ 应用偏移(例如左移 0.10:-0.10;右移 0.20:+0.20)
264
+ cut_times = _offset_cut_positions(
265
+ cut_times,
266
+ char_stream,
267
+ offset_sec=cut_offset_sec,
268
+ min_gap=0.08,
269
+ guard=0.03,
270
+ )
271
+
272
  segs = _segment_by_energy(char_stream, cut_times, min_seg_dur=min_seg_dur)
273
 
274
  lines = []
 
294
  min_seg_dur: float,
295
  frame_len_ms: float,
296
  hop_len_ms: float,
297
+ cut_offset_sec: float, # ★ 新增:UI 传入偏移
298
  ):
299
  if not audio_path:
300
  raise gr.Error("请先上传音频文件。")
 
319
  min_seg_dur=min_seg_dur,
320
  frame_len_ms=frame_len_ms,
321
  hop_len_ms=hop_len_ms,
322
+ cut_offset_sec=cut_offset_sec, # ★ 传入偏移
323
  )
324
 
325
  if not srt_str and text:
 
353
  label="相对峰值下落阈值 (dB) — 越大越容易视为静音")
354
  pctl_floor = gr.Slider(0, 60, value=DEF_PCTL_FLOOR, step=1,
355
  label="能量分位下限 (dB) — 提高可在噪声环境下更保守")
356
+ with gr.Row():
357
+ cut_offset_sec = gr.Slider(-0.50, 0.50, value=DEF_CUT_OFFSET_SEC, step=0.01,
358
+ label="切分偏移 (s) — 负数=左移(提前),正数=右移(延后)")
359
+
360
  with gr.Accordion("Advanced (Frame Settings)", open=False):
361
  with gr.Row():
362
  frame_len_ms = gr.Slider(10, 50, value=DEF_FRAME_LEN_MS, step=1, label="帧长 (ms)")
 
368
 
369
  run_btn.click(
370
  fn=transcribe_file_to_srt,
371
+ inputs=[
372
+ audio_in, task,
373
+ silence_min_len, db_drop, pctl_floor, min_seg_dur,
374
+ frame_len_ms, hop_len_ms, cut_offset_sec # ★ 新增偏移输入
375
+ ],
376
  outputs=[srt_preview, srt_file],
377
  )
378