Update app.py
app.py
CHANGED
@@ -178,6 +178,12 @@ def split_audio_by_pause(audio, sr, pause_threshold, top_db=30, energy_threshold
         filtered_intervals.append((start, end))
     return filtered_intervals
 
+def seconds_to_srt_time(seconds):
+    msec_total = int(round(seconds * 1000))
+    hours, msec_remainder = divmod(msec_total, 3600 * 1000)
+    minutes, msec_remainder = divmod(msec_remainder, 60 * 1000)
+    sec, msec = divmod(msec_remainder, 1000)
+    return f"{hours:02d}:{minutes:02d}:{sec:02d},{msec:03d}"
 
 # -------------------------------
 # Main Transcription Function
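The new seconds_to_srt_time helper converts a float second count into the HH:MM:SS,mmm timestamp format that SRT requires. A minimal standalone sanity check (the function is copied from the hunk above; the test values are illustrative, not part of the commit):

# Copied from the diff above so the check runs standalone.
def seconds_to_srt_time(seconds):
    msec_total = int(round(seconds * 1000))
    hours, msec_remainder = divmod(msec_total, 3600 * 1000)
    minutes, msec_remainder = divmod(msec_remainder, 60 * 1000)
    sec, msec = divmod(msec_remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{sec:02d},{msec:03d}"

# Illustrative conversions (values chosen for this check, not from the commit):
assert seconds_to_srt_time(0) == "00:00:00,000"
assert seconds_to_srt_time(3.5) == "00:00:03,500"
assert seconds_to_srt_time(3661.25) == "01:01:01,250"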
@@ -186,6 +192,7 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
     start_time = time.time()
     final_result = ""
     debug_log = []
+    srt_entries = []
 
     try:
         # If vocal extraction is enabled, process the file first
@@ -240,7 +247,13 @@
                 for word in segment["words"]:
                     adjusted_start = word['start'] + seg_start/sr
                     adjusted_end = word['end'] + seg_start/sr
-                    final_result += f"[{adjusted_start:5.2f}s-{adjusted_end:5.2f}s] {word['word']}\n"
+
+                    srt_entries.append({
+                        'start': adjusted_start,
+                        'end': adjusted_end,
+                        'word': word['word'].strip()
+                    })
+                    #final_result += f"[{adjusted_start:5.2f}s-{adjusted_end:5.2f}s] {word['word']}\n"
         else:
             # Process the entire audio without splitting
             transcript = model.transcribe(audio, batch_size=batch_size, language=language)
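In the pause-splitting branch above, word timestamps are relative to their audio segment, so the commit offsets them by seg_start/sr (segment start in samples divided by the sample rate) before storing them. A sketch with hypothetical numbers (sr, seg_start, and the word dict are invented for illustration):

# Hypothetical values; only the offset arithmetic mirrors the diff above.
sr = 16000                                             # sample rate in Hz
seg_start = 48000                                      # segment start in samples -> 3.0 s
word = {'start': 0.50, 'end': 0.82, 'word': ' hello'}  # segment-relative times

adjusted_start = word['start'] + seg_start / sr  # 0.50 + 3.0 = 3.5 s absolute
adjusted_end = word['end'] + seg_start / sr      # 0.82 + 3.0 = 3.82 s absolute
print(adjusted_start, adjusted_end, word['word'].strip())  # 3.5 3.82 hello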
@@ -249,7 +262,24 @@
             )
             for segment in aligned["segments"]:
                 for word in segment["words"]:
-                    final_result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
+                    #final_result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
+                    srt_entries.append({
+                        'start': word['start'],
+                        'end': word['end'],
+                        'word': word['word'].strip()
+                    })
+
+        srt_content = []
+        for idx, entry in enumerate(srt_entries, start=1):
+            start_time_srt = seconds_to_srt_time(entry['start'])
+            end_time_srt = seconds_to_srt_time(entry['end'])
+            srt_content.append(
+                f"{idx}\n"
+                f"{start_time_srt} --> {end_time_srt}\n"
+                f"{entry['word']}\n"
+            )
+
+        final_result = "\n".join(srt_content)
 
         debug_log.append(f"Language used: {language}")
         debug_log.append(f"Batch size: {batch_size}")
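Net effect of the commit: final_result is no longer a list of plain [start-end] word lines but a complete SRT document, one numbered cue per word, with a blank line between cues. A standalone sketch of the assembly step with dummy data (the two entries are invented stand-ins for the aligned word output; the loop mirrors the one added in the last hunk):

def seconds_to_srt_time(seconds):
    msec_total = int(round(seconds * 1000))
    hours, msec_remainder = divmod(msec_total, 3600 * 1000)
    minutes, msec_remainder = divmod(msec_remainder, 60 * 1000)
    sec, msec = divmod(msec_remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{sec:02d},{msec:03d}"

# Dummy word-level entries in the shape the commit collects into srt_entries.
srt_entries = [
    {'start': 0.32, 'end': 0.61, 'word': 'Hello'},
    {'start': 0.61, 'end': 1.05, 'word': 'world'},
]

srt_content = []
for idx, entry in enumerate(srt_entries, start=1):
    srt_content.append(
        f"{idx}\n"
        f"{seconds_to_srt_time(entry['start'])} --> {seconds_to_srt_time(entry['end'])}\n"
        f"{entry['word']}\n"
    )
print("\n".join(srt_content))
# 1
# 00:00:00,320 --> 00:00:00,610
# Hello
#
# 2
# 00:00:00,610 --> 00:00:01,050
# world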