Update app.py
app.py
CHANGED
@@ -178,6 +178,12 @@ def split_audio_by_pause(audio, sr, pause_threshold, top_db=30, energy_threshold
         filtered_intervals.append((start, end))
     return filtered_intervals
 
+def seconds_to_srt_time(seconds):
+    msec_total = int(round(seconds * 1000))
+    hours, msec_remainder = divmod(msec_total, 3600 * 1000)
+    minutes, msec_remainder = divmod(msec_remainder, 60 * 1000)
+    sec, msec = divmod(msec_remainder, 1000)
+    return f"{hours:02d}:{minutes:02d}:{sec:02d},{msec:03d}"
 
 # -------------------------------
 # Main Transcription Function
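The new seconds_to_srt_time helper converts a float second count into the HH:MM:SS,mmm timestamp format that SRT requires. A minimal standalone sanity check (the function is copied from the hunk above; the test values are illustrative, not part of the commit):

# Copied from the diff above so the check runs standalone.
def seconds_to_srt_time(seconds):
    msec_total = int(round(seconds * 1000))
    hours, msec_remainder = divmod(msec_total, 3600 * 1000)
    minutes, msec_remainder = divmod(msec_remainder, 60 * 1000)
    sec, msec = divmod(msec_remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{sec:02d},{msec:03d}"

# Illustrative conversions (values chosen for this check, not from the commit):
assert seconds_to_srt_time(0) == "00:00:00,000"
assert seconds_to_srt_time(3.5) == "00:00:03,500"
assert seconds_to_srt_time(3661.25) == "01:01:01,250"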
@@ -186,6 +192,7 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
     start_time = time.time()
     final_result = ""
     debug_log = []
+    srt_entries = []
 
     try:
         # If vocal extraction is enabled, process the file first
@@ -240,7 +247,13 @@
                 for word in segment["words"]:
                     adjusted_start = word['start'] + seg_start/sr
                     adjusted_end = word['end'] + seg_start/sr
-                    final_result += f"[{adjusted_start:5.2f}s-{adjusted_end:5.2f}s] {word['word']}\n"
+
+                    srt_entries.append({
+                        'start': adjusted_start,
+                        'end': adjusted_end,
+                        'word': word['word'].strip()
+                    })
+                    #final_result += f"[{adjusted_start:5.2f}s-{adjusted_end:5.2f}s] {word['word']}\n"
         else:
             # Process the entire audio without splitting
             transcript = model.transcribe(audio, batch_size=batch_size, language=language)
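In the pause-splitting branch above, word timestamps are relative to their audio segment, so the commit offsets them by seg_start/sr (segment start in samples divided by the sample rate) before storing them. A sketch with hypothetical numbers (sr, seg_start, and the word dict are invented for illustration):

# Hypothetical values; only the offset arithmetic mirrors the diff above.
sr = 16000                                             # sample rate in Hz
seg_start = 48000                                      # segment start in samples -> 3.0 s
word = {'start': 0.50, 'end': 0.82, 'word': ' hello'}  # segment-relative times

adjusted_start = word['start'] + seg_start / sr  # 0.50 + 3.0 = 3.5 s absolute
adjusted_end = word['end'] + seg_start / sr      # 0.82 + 3.0 = 3.82 s absolute
print(adjusted_start, adjusted_end, word['word'].strip())  # 3.5 3.82 hello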
@@ -249,7 +262,24 @@
             )
             for segment in aligned["segments"]:
                 for word in segment["words"]:
-                    final_result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
+                    #final_result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
+                    srt_entries.append({
+                        'start': word['start'],
+                        'end': word['end'],
+                        'word': word['word'].strip()
+                    })
+
+        srt_content = []
+        for idx, entry in enumerate(srt_entries, start=1):
+            start_time_srt = seconds_to_srt_time(entry['start'])
+            end_time_srt = seconds_to_srt_time(entry['end'])
+            srt_content.append(
+                f"{idx}\n"
+                f"{start_time_srt} --> {end_time_srt}\n"
+                f"{entry['word']}\n"
+            )
+
+        final_result = "\n".join(srt_content)
 
         debug_log.append(f"Language used: {language}")
         debug_log.append(f"Batch size: {batch_size}")
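Net effect of the commit: final_result is no longer a list of plain [start-end] word lines but a complete SRT document, one numbered cue per word, with a blank line between cues. A standalone sketch of the assembly step with dummy data (the two entries are invented stand-ins for the aligned word output; the loop mirrors the one added in the last hunk):

def seconds_to_srt_time(seconds):
    msec_total = int(round(seconds * 1000))
    hours, msec_remainder = divmod(msec_total, 3600 * 1000)
    minutes, msec_remainder = divmod(msec_remainder, 60 * 1000)
    sec, msec = divmod(msec_remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{sec:02d},{msec:03d}"

# Dummy word-level entries in the shape the commit collects into srt_entries.
srt_entries = [
    {'start': 0.32, 'end': 0.61, 'word': 'Hello'},
    {'start': 0.61, 'end': 1.05, 'word': 'world'},
]

srt_content = []
for idx, entry in enumerate(srt_entries, start=1):
    srt_content.append(
        f"{idx}\n"
        f"{seconds_to_srt_time(entry['start'])} --> {seconds_to_srt_time(entry['end'])}\n"
        f"{entry['word']}\n"
    )
print("\n".join(srt_content))
# 1
# 00:00:00,320 --> 00:00:00,610
# Hello
#
# 2
# 00:00:00,610 --> 00:00:01,050
# world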