Update app.py
app.py
CHANGED
@@ -21,7 +21,7 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
         max_new_tokens=128,
-        chunk_length_s=10,
+        chunk_length_s=10,
         batch_size=2,
         return_timestamps=True,
         torch_dtype=torch_dtype,
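These keyword arguments look like the standard transformers automatic-speech-recognition pipeline setup. For context, a minimal sketch of how such a pipeline is typically assembled; the checkpoint name, device handling, and variable names below are assumptions, not taken from app.py:

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Assumed setup; the actual checkpoint and device logic live elsewhere in app.py.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-large-v3"  # placeholder checkpoint

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=10,   # the value touched by this commit
    batch_size=2,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)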
@@ -41,7 +41,7 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
 
     audio = video.audio
     duration = video.duration
-    n_chunks = math.ceil(duration / 10)
+    n_chunks = math.ceil(duration / 10)
     transcription_txt = ""
     transcription_srt = []
 
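The chunk count mirrors the 10-second chunk_length_s above: the clip's duration is split into ceil(duration / 10) pieces, with the final piece covering whatever remainder is left. A small worked example; the MoviePy-style VideoFileClip call, the file name, and the chunk-boundary list are illustrative assumptions about how the transcription loop slices the audio:

import math
from moviepy.editor import VideoFileClip  # assumed MoviePy 1.x import path

video = VideoFileClip("input.mp4")         # illustrative file name
audio = video.audio
duration = video.duration                  # e.g. 95.4 seconds

n_chunks = math.ceil(duration / 10)        # 95.4 s -> ceil(9.54) = 10 chunks
# Presumed chunk boundaries: ten 10 s windows, the last one only 5.4 s long.
bounds = [(i * 10, min((i + 1) * 10, duration)) for i in range(n_chunks)]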
@@ -82,7 +82,7 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
         srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
         srt_content += srt_entry
 
-    # Remove duplicate captions
+    # Remove duplicate captions and keep only the last occurrence
     cleaned_srt_content = clean_srt_duplicates(srt_content)
 
     # Save SRT content to a file
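format_time, used in the srt_entry f-string above, only shows its last two lines in the next hunk. A plausible reconstruction for readers following along; the initial divmod is inferred, not shown in this commit:

def format_time(seconds):
    # Inferred first step: split total seconds into minutes and seconds.
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    # SRT uses a comma as the decimal separator: HH:MM:SS,mmm
    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')

format_time(3723.456)   # -> '01:02:03,456'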
@@ -99,27 +99,35 @@ def format_time(seconds):
     h, m = divmod(m, 60)
     return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
 
-def clean_srt_duplicates(srt_content):
+def clean_srt_duplicates(srt_content, time_threshold=30):
     """
-    Function to remove
+    Function to remove duplicate captions within a specified time range in SRT format,
+    keeping only the last occurrence.
     """
     cleaned_srt = []
-
+    last_seen = {}
 
     # Pattern to match each SRT block
     srt_pattern = re.compile(r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.+)", re.DOTALL)
 
+    # Store blocks temporarily to determine duplicates
+    blocks = []
     for match in srt_pattern.finditer(srt_content):
         index, start_time, end_time, text = match.groups()
         text = text.strip()
 
-        #
-
-
-
-
-
-
+        # Convert start time to seconds
+        start_seconds = sum(int(x) * 60 ** i for i, x in enumerate(reversed(start_time.split(":"))))
+
+        # Only keep the last instance within the time threshold
+        if text in last_seen and (start_seconds - last_seen[text]) < time_threshold:
+            blocks.pop()  # Remove the previous occurrence
+        blocks.append((index, start_time, end_time, text))
+        last_seen[text] = start_seconds  # Update last occurrence time
+
+    # Build cleaned SRT content
+    for i, (index, start_time, end_time, text) in enumerate(blocks, 1):
+        cleaned_srt.append(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
 
     return ''.join(cleaned_srt)
 
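To try the new dedup logic outside the Space, here is a self-contained sketch of the same idea on a toy SRT string. Everything below (parse_srt_time, dedupe_srt, the sample captions) is illustrative rather than copied from app.py. Note that start_time.split(":") leaves the ",mmm" millisecond suffix attached to the seconds field, so this sketch parses timestamps with a small helper, and it drops the earlier copy of the matching caption explicitly rather than popping the most recent block:

import re

def parse_srt_time(ts):
    """Convert an 'HH:MM:SS,mmm' SRT timestamp to seconds."""
    hms, ms = ts.split(",")
    h, m, s = (int(x) for x in hms.split(":"))
    return h * 3600 + m * 60 + s + int(ms) / 1000.0

def dedupe_srt(srt_content, time_threshold=30):
    """Keep only the last copy of any caption repeated within time_threshold seconds."""
    block_re = re.compile(
        r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.+?)\n\n",
        re.DOTALL,
    )
    blocks = []      # surviving (start, end, text) tuples, in order
    last_seen = {}   # caption text -> (its surviving block, start time in seconds)
    for _, start, end, text in (m.groups() for m in block_re.finditer(srt_content)):
        text = text.strip()
        start_s = parse_srt_time(start)
        if text in last_seen and start_s - last_seen[text][1] < time_threshold:
            blocks.remove(last_seen[text][0])   # drop the earlier copy of this caption
        entry = (start, end, text)
        blocks.append(entry)
        last_seen[text] = (entry, start_s)
    # Re-number the surviving blocks 1..N and rebuild the SRT text.
    return "".join(
        f"{i}\n{start} --> {end}\n{text}\n\n"
        for i, (start, end, text) in enumerate(blocks, 1)
    )

sample = (
    "1\n00:00:01,000 --> 00:00:03,000\nHello world\n\n"
    "2\n00:00:04,000 --> 00:00:06,000\nHello world\n\n"
    "3\n00:00:07,000 --> 00:00:09,000\nGoodbye\n\n"
)
print(dedupe_srt(sample))   # keeps the second "Hello world" and "Goodbye", renumbered 1 and 2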