Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -15,12 +15,13 @@ BATCH_SIZE = 8
|
|
15 |
FILE_LIMIT_MB = 1000
|
16 |
|
17 |
# 默认值(UI 初始值)
|
18 |
-
DEF_SILENCE_MIN_LEN = 0.
|
19 |
DEF_DB_DROP = 25.0 # 相对峰值下落阈值(max_db - DB_DROP)
|
20 |
DEF_PCTL_FLOOR = 20.0 # 能量分位(dB)下限(越大越保守)
|
21 |
DEF_MIN_SEG_DUR = 0.30 # 每段最短显示时长
|
22 |
DEF_FRAME_LEN_MS = 25 # 能量分析帧长
|
23 |
DEF_HOP_LEN_MS = 10 # 帧移
|
|
|
24 |
|
25 |
# ================== 设备/精度(自动兼容 GPU/CPU/ZeroGPU) ==================
|
26 |
def _pick_device_and_dtype():
|
@@ -152,6 +153,46 @@ def _detect_silence_cuts(
|
|
152 |
|
153 |
return cuts, dur
|
154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
# ================== 用切分边界合并逐字符时间线 ==================
|
156 |
def _segment_by_energy(
|
157 |
char_stream: list[tuple[str, float, float]],
|
@@ -203,6 +244,7 @@ def _srt_from_chunks_with_energy(
|
|
203 |
min_seg_dur: float,
|
204 |
frame_len_ms: float,
|
205 |
hop_len_ms: float,
|
|
|
206 |
) -> str:
|
207 |
norm = _norm_chunks(chunks)
|
208 |
char_stream = []
|
@@ -218,6 +260,15 @@ def _srt_from_chunks_with_energy(
|
|
218 |
pctl_floor=pctl_floor,
|
219 |
)
|
220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
segs = _segment_by_energy(char_stream, cut_times, min_seg_dur=min_seg_dur)
|
222 |
|
223 |
lines = []
|
@@ -243,6 +294,7 @@ def transcribe_file_to_srt(
|
|
243 |
min_seg_dur: float,
|
244 |
frame_len_ms: float,
|
245 |
hop_len_ms: float,
|
|
|
246 |
):
|
247 |
if not audio_path:
|
248 |
raise gr.Error("请先上传音频文件。")
|
@@ -267,6 +319,7 @@ def transcribe_file_to_srt(
|
|
267 |
min_seg_dur=min_seg_dur,
|
268 |
frame_len_ms=frame_len_ms,
|
269 |
hop_len_ms=hop_len_ms,
|
|
|
270 |
)
|
271 |
|
272 |
if not srt_str and text:
|
@@ -300,6 +353,10 @@ with gr.Blocks(title="Upload Audio → SRT (Energy/VAD-Based, Tunable)") as demo
|
|
300 |
label="相对峰值下落阈值 (dB) — 越大越容易视为静音")
|
301 |
pctl_floor = gr.Slider(0, 60, value=DEF_PCTL_FLOOR, step=1,
|
302 |
label="能量分位下限 (dB) — 提高可在噪声环境下更保守")
|
|
|
|
|
|
|
|
|
303 |
with gr.Accordion("Advanced (Frame Settings)", open=False):
|
304 |
with gr.Row():
|
305 |
frame_len_ms = gr.Slider(10, 50, value=DEF_FRAME_LEN_MS, step=1, label="帧长 (ms)")
|
@@ -311,7 +368,11 @@ with gr.Blocks(title="Upload Audio → SRT (Energy/VAD-Based, Tunable)") as demo
|
|
311 |
|
312 |
run_btn.click(
|
313 |
fn=transcribe_file_to_srt,
|
314 |
-
inputs=[
|
|
|
|
|
|
|
|
|
315 |
outputs=[srt_preview, srt_file],
|
316 |
)
|
317 |
|
|
|
15 |
FILE_LIMIT_MB = 1000
|
16 |
|
17 |
# 默认值(UI 初始值)
|
18 |
+
DEF_SILENCE_MIN_LEN = 0.45 # 停顿(静音段)最短持续秒数
|
19 |
DEF_DB_DROP = 25.0 # 相对峰值下落阈值(max_db - DB_DROP)
|
20 |
DEF_PCTL_FLOOR = 20.0 # 能量分位(dB)下限(越大越保守)
|
21 |
DEF_MIN_SEG_DUR = 0.30 # 每段最短显示时长
|
22 |
DEF_FRAME_LEN_MS = 25 # 能量分析帧长
|
23 |
DEF_HOP_LEN_MS = 10 # 帧移
|
24 |
+
DEF_CUT_OFFSET_SEC = 0.00 # 切分偏移:负数=左移,正数=右移
|
25 |
|
26 |
# ================== 设备/精度(自动兼容 GPU/CPU/ZeroGPU) ==================
|
27 |
def _pick_device_and_dtype():
|
|
|
153 |
|
154 |
return cuts, dur
|
155 |
|
156 |
+
# =============== 切点偏移(左移=负,右移=正) ===============
|
157 |
+
def _offset_cut_positions(
|
158 |
+
cut_times: list[float],
|
159 |
+
char_stream: list[tuple[str, float, float]],
|
160 |
+
offset_sec: float = 0.0,
|
161 |
+
min_gap: float = 0.08,
|
162 |
+
guard: float = 0.03
|
163 |
+
) -> list[float]:
|
164 |
+
"""
|
165 |
+
将每个切点整体平移 offset_sec 秒:
|
166 |
+
- 负值:向左(提前)
|
167 |
+
- 正值:向右(延后)
|
168 |
+
并确保切点递增,距离上一个切点 >= min_gap,且不越过全局起止边界(guard保护)。
|
169 |
+
"""
|
170 |
+
if not cut_times or not char_stream or offset_sec == 0.0:
|
171 |
+
return cut_times
|
172 |
+
|
173 |
+
starts = [st for _, st, _ in char_stream]
|
174 |
+
ends = [en for _, _, en in char_stream]
|
175 |
+
global_start = starts[0] if starts else 0.0
|
176 |
+
global_end = ends[-1] if ends else (cut_times[-1] if cut_times else 0.0)
|
177 |
+
|
178 |
+
shifted = []
|
179 |
+
for t in cut_times:
|
180 |
+
new_t = t + offset_sec
|
181 |
+
new_t = max(new_t, global_start + guard)
|
182 |
+
new_t = min(new_t, global_end - guard)
|
183 |
+
shifted.append(float(new_t))
|
184 |
+
|
185 |
+
# 排序、去重、并施加最小间隔
|
186 |
+
shifted = sorted(shifted)
|
187 |
+
filtered = []
|
188 |
+
for t in shifted:
|
189 |
+
if not filtered or t - filtered[-1] >= min_gap:
|
190 |
+
filtered.append(t)
|
191 |
+
else:
|
192 |
+
# 如果间隔太小,略过该切点(也可以选择推后到 min_gap)
|
193 |
+
continue
|
194 |
+
return filtered
|
195 |
+
|
196 |
# ================== 用切分边界合并逐字符时间线 ==================
|
197 |
def _segment_by_energy(
|
198 |
char_stream: list[tuple[str, float, float]],
|
|
|
244 |
min_seg_dur: float,
|
245 |
frame_len_ms: float,
|
246 |
hop_len_ms: float,
|
247 |
+
cut_offset_sec: float, # ★ 新增:切分偏移(负=左移,正=右移)
|
248 |
) -> str:
|
249 |
norm = _norm_chunks(chunks)
|
250 |
char_stream = []
|
|
|
260 |
pctl_floor=pctl_floor,
|
261 |
)
|
262 |
|
263 |
+
# ★ 应用偏移(例如左移 0.10:-0.10;右移 0.20:+0.20)
|
264 |
+
cut_times = _offset_cut_positions(
|
265 |
+
cut_times,
|
266 |
+
char_stream,
|
267 |
+
offset_sec=cut_offset_sec,
|
268 |
+
min_gap=0.08,
|
269 |
+
guard=0.03,
|
270 |
+
)
|
271 |
+
|
272 |
segs = _segment_by_energy(char_stream, cut_times, min_seg_dur=min_seg_dur)
|
273 |
|
274 |
lines = []
|
|
|
294 |
min_seg_dur: float,
|
295 |
frame_len_ms: float,
|
296 |
hop_len_ms: float,
|
297 |
+
cut_offset_sec: float, # ★ 新增:UI 传入偏移
|
298 |
):
|
299 |
if not audio_path:
|
300 |
raise gr.Error("请先上传音频文件。")
|
|
|
319 |
min_seg_dur=min_seg_dur,
|
320 |
frame_len_ms=frame_len_ms,
|
321 |
hop_len_ms=hop_len_ms,
|
322 |
+
cut_offset_sec=cut_offset_sec, # ★ 传入偏移
|
323 |
)
|
324 |
|
325 |
if not srt_str and text:
|
|
|
353 |
label="相对峰值下落阈值 (dB) — 越大越容易视为静音")
|
354 |
pctl_floor = gr.Slider(0, 60, value=DEF_PCTL_FLOOR, step=1,
|
355 |
label="能量分位下限 (dB) — 提高可在噪声环境下更保守")
|
356 |
+
with gr.Row():
|
357 |
+
cut_offset_sec = gr.Slider(-0.50, 0.50, value=DEF_CUT_OFFSET_SEC, step=0.01,
|
358 |
+
label="切分偏移 (s) — 负数=左移(提前),正数=右移(延后)")
|
359 |
+
|
360 |
with gr.Accordion("Advanced (Frame Settings)", open=False):
|
361 |
with gr.Row():
|
362 |
frame_len_ms = gr.Slider(10, 50, value=DEF_FRAME_LEN_MS, step=1, label="帧长 (ms)")
|
|
|
368 |
|
369 |
run_btn.click(
|
370 |
fn=transcribe_file_to_srt,
|
371 |
+
inputs=[
|
372 |
+
audio_in, task,
|
373 |
+
silence_min_len, db_drop, pctl_floor, min_seg_dur,
|
374 |
+
frame_len_ms, hop_len_ms, cut_offset_sec # ★ 新增偏移输入
|
375 |
+
],
|
376 |
outputs=[srt_preview, srt_file],
|
377 |
)
|
378 |
|