Spaces:

reab5555
/

WhisperCap

Sleeping

App Files Files Community

reab5555 committed on Oct 29, 2024

Commit

34eebab

verified ·

1 Parent(s): f8366cb

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -13

app.py CHANGED Viewed

@@ -21,7 +21,7 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
         max_new_tokens=128,
-        chunk_length_s=10,  # Shorter chunk length to prevent overly long captions
         batch_size=2,
         return_timestamps=True,
         torch_dtype=torch_dtype,
@@ -41,7 +41,7 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
     audio = video.audio
     duration = video.duration
-    n_chunks = math.ceil(duration / 10)  # Split into 10-second chunks
     transcription_txt = ""
     transcription_srt = []
@@ -82,7 +82,7 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
             srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
             srt_content += srt_entry
-        # Remove duplicate captions
         cleaned_srt_content = clean_srt_duplicates(srt_content)
         # Save SRT content to a file
@@ -99,27 +99,35 @@ def format_time(seconds):
     h, m = divmod(m, 60)
     return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
-def clean_srt_duplicates(srt_content):
     """
-    Function to remove consecutive duplicate captions in SRT format.
     """
     cleaned_srt = []
-    last_text = None
     # Pattern to match each SRT block
     srt_pattern = re.compile(r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.+)", re.DOTALL)
     for match in srt_pattern.finditer(srt_content):
         index, start_time, end_time, text = match.groups()
         text = text.strip()
-        # If this caption is identical to the last one, skip it
-        if text == last_text:
-            continue
-        # Add cleaned entry to the list
-        cleaned_srt.append(f"{index}\n{start_time} --> {end_time}\n{text}\n\n")
-        last_text = text
     return ''.join(cleaned_srt)

         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
         max_new_tokens=128,
+        chunk_length_s=10,
         batch_size=2,
         return_timestamps=True,
         torch_dtype=torch_dtype,
     audio = video.audio
     duration = video.duration
+    n_chunks = math.ceil(duration / 10)
     transcription_txt = ""
     transcription_srt = []
             srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
             srt_content += srt_entry
+        # Remove duplicate captions and keep only the last occurrence
         cleaned_srt_content = clean_srt_duplicates(srt_content)
         # Save SRT content to a file
     h, m = divmod(m, 60)
     return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
def clean_srt_duplicates(srt_content, time_threshold=30):
    """
    Remove duplicate captions that repeat within a time window in SRT text,
    keeping only the last occurrence, and renumber the remaining blocks.

    Args:
        srt_content: SRT-formatted text; each block is an index line, a
            "HH:MM:SS,mmm --> HH:MM:SS,mmm" line, the caption text and a
            blank separator line.
        time_threshold: Window in seconds. A caption whose text equals an
            earlier caption that started less than this many seconds before
            it replaces that earlier caption.

    Returns:
        The cleaned SRT text with blocks renumbered from 1. An empty string
        is returned when no SRT block is found.
    """
    # Pattern to match a single SRT block. The caption text is matched lazily
    # and stops at the blank separator line (or end of input); a greedy match
    # under DOTALL would swallow every following block into the first one.
    srt_pattern = re.compile(
        r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n"
        r"(.*?)(?=\n\s*\n|\s*\Z)",
        re.DOTALL,
    )

    # Parsed blocks in input order; a superseded duplicate is replaced by None
    # so that the positions stored in last_seen stay valid.
    blocks = []
    # Caption text -> (start time in seconds, position in blocks) of its most
    # recent occurrence.
    last_seen = {}

    for match in srt_pattern.finditer(srt_content):
        _, start_time, end_time, text = match.groups()
        text = text.strip()

        # Convert "HH:MM:SS,mmm" to seconds. The milliseconds are split off
        # first because "SS,mmm" is not a valid integer literal.
        clock, millis = start_time.split(",")
        hours, minutes, seconds = (int(part) for part in clock.split(":"))
        start_seconds = hours * 3600 + minutes * 60 + seconds + int(millis) / 1000

        # Drop the earlier occurrence of this exact text (not merely the most
        # recently appended block) when it falls inside the time threshold.
        previous = last_seen.get(text)
        if previous is not None and (start_seconds - previous[0]) < time_threshold:
            blocks[previous[1]] = None

        last_seen[text] = (start_seconds, len(blocks))
        blocks.append((start_time, end_time, text))

    # Build cleaned SRT content, renumbering the surviving blocks from 1.
    kept = [block for block in blocks if block is not None]
    return ''.join(
        f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
        for i, (start_time, end_time, text) in enumerate(kept, 1)
    )