reab5555 committed on
Commit
34eebab
·
verified ·
1 Parent(s): f8366cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -13
app.py CHANGED
@@ -21,7 +21,7 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
21
  tokenizer=processor.tokenizer,
22
  feature_extractor=processor.feature_extractor,
23
  max_new_tokens=128,
24
- chunk_length_s=10, # Shorter chunk length to prevent overly long captions
25
  batch_size=2,
26
  return_timestamps=True,
27
  torch_dtype=torch_dtype,
@@ -41,7 +41,7 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
41
 
42
  audio = video.audio
43
  duration = video.duration
44
- n_chunks = math.ceil(duration / 10) # Split into 10-second chunks
45
  transcription_txt = ""
46
  transcription_srt = []
47
 
@@ -82,7 +82,7 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
82
  srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
83
  srt_content += srt_entry
84
 
85
- # Remove duplicate captions
86
  cleaned_srt_content = clean_srt_duplicates(srt_content)
87
 
88
  # Save SRT content to a file
@@ -99,27 +99,35 @@ def format_time(seconds):
99
  h, m = divmod(m, 60)
100
  return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
101
 
102
def clean_srt_duplicates(srt_content):
    """
    Remove consecutive duplicate captions from SRT-formatted text.

    A caption is dropped when its text (after stripping surrounding
    whitespace) is identical to the text of the caption kept immediately
    before it. The first caption of each run of duplicates is kept.

    Args:
        srt_content: SRT text made of blocks of the form
            ``index\\nHH:MM:SS,mmm --> HH:MM:SS,mmm\\ntext`` separated by
            blank lines.

    Returns:
        The SRT text with consecutive duplicates removed. Kept blocks retain
        their original index, so the numbering may contain gaps. Input that
        contains no recognizable block yields an empty string.
    """
    # The caption text is matched lazily up to the next blank line (or the
    # end of the input). A greedy ``.+`` combined with DOTALL would swallow
    # every following block into the first caption, leaving a single match.
    srt_pattern = re.compile(
        r"^(\d+)\n"
        r"(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n"
        r"(.*?)(?=\n[ \t]*\n|\Z)",
        re.DOTALL | re.MULTILINE,
    )

    cleaned_srt = []
    last_text = None

    for match in srt_pattern.finditer(srt_content):
        index, start_time, end_time, text = match.groups()
        text = text.strip()

        # Skip a caption that merely repeats the one kept just before it.
        if text == last_text:
            continue

        cleaned_srt.append(f"{index}\n{start_time} --> {end_time}\n{text}\n\n")
        last_text = text

    return ''.join(cleaned_srt)
125
 
 
21
  tokenizer=processor.tokenizer,
22
  feature_extractor=processor.feature_extractor,
23
  max_new_tokens=128,
24
+ chunk_length_s=10,
25
  batch_size=2,
26
  return_timestamps=True,
27
  torch_dtype=torch_dtype,
 
41
 
42
  audio = video.audio
43
  duration = video.duration
44
+ n_chunks = math.ceil(duration / 10)
45
  transcription_txt = ""
46
  transcription_srt = []
47
 
 
82
  srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
83
  srt_content += srt_entry
84
 
85
+ # Remove duplicate captions and keep only the last occurrence
86
  cleaned_srt_content = clean_srt_duplicates(srt_content)
87
 
88
  # Save SRT content to a file
 
99
  h, m = divmod(m, 60)
100
  return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
101
 
102
def clean_srt_duplicates(srt_content, time_threshold=30):
    """
    Remove repeated captions from SRT-formatted text, keeping the last one.

    A caption is a duplicate when its text (after stripping surrounding
    whitespace) equals the text of an earlier caption whose start time is
    less than ``time_threshold`` seconds before it. The earlier caption is
    dropped and the later one kept. Repeats chain: each new repeat is
    compared against the most recent occurrence of the same text.

    Args:
        srt_content: SRT text made of blocks of the form
            ``index\\nHH:MM:SS,mmm --> HH:MM:SS,mmm\\ntext`` separated by
            blank lines.
        time_threshold: Window in seconds within which a repeated caption
            replaces its previous occurrence.

    Returns:
        The cleaned SRT text with blocks renumbered consecutively from 1.
        Input that contains no recognizable block yields an empty string.
    """
    # The caption text is matched lazily up to the next blank line (or the
    # end of the input). A greedy ``.+`` combined with DOTALL would swallow
    # every following block into the first caption, leaving a single match.
    srt_pattern = re.compile(
        r"^(\d+)\n"
        r"(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n"
        r"(.*?)(?=\n[ \t]*\n|\Z)",
        re.DOTALL | re.MULTILINE,
    )

    # Kept blocks in input order; a superseded block is replaced by None so
    # the positions stored in last_seen stay valid.
    blocks = []
    # Caption text -> (start time in seconds, position in blocks) of its
    # most recent occurrence.
    last_seen = {}

    for match in srt_pattern.finditer(srt_content):
        _, start_time, end_time, text = match.groups()
        text = text.strip()

        # "HH:MM:SS,mmm" -> seconds; the last field carries the milliseconds
        # after a comma, so it cannot be parsed with int().
        hours, minutes, seconds = start_time.split(":")
        start_seconds = (
            int(hours) * 3600
            + int(minutes) * 60
            + float(seconds.replace(",", "."))
        )

        previous = last_seen.get(text)
        if previous is not None and start_seconds - previous[0] < time_threshold:
            # Drop the earlier occurrence of this text, wherever it sits,
            # rather than whichever block happened to be appended last.
            blocks[previous[1]] = None

        last_seen[text] = (start_seconds, len(blocks))
        blocks.append((start_time, end_time, text))

    kept = [block for block in blocks if block is not None]
    return ''.join(
        f"{i}\n{start_time} --> {end_time}\n{text}\n\n"
        for i, (start_time, end_time, text) in enumerate(kept, 1)
    )
133