reab5555 committed on
Commit
f8366cb
·
verified ·
1 Parent(s): 85ce446

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -3
app.py CHANGED
@@ -1,9 +1,10 @@
1
  import os
2
  import math
 
3
  import gradio as gr
4
  import torch
5
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
6
- from moviepy.editor import VideoFileClip, concatenate_audioclips
7
 
8
  def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
9
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -79,12 +80,15 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
79
  srt_content = ""
80
  for i, sub in enumerate(transcription_srt, 1):
81
  srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
82
- output += srt_entry
83
  srt_content += srt_entry
84
 
 
 
 
 
85
  srt_file_path = "transcription.srt"
86
  with open(srt_file_path, "w", encoding="utf-8") as srt_file:
87
- srt_file.write(srt_content)
88
 
89
  output += f"\nSRT file saved as: {srt_file_path}"
90
 
@@ -95,6 +99,30 @@ def format_time(seconds):
95
  h, m = divmod(m, 60)
96
  return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  iface = gr.Interface(
99
  fn=transcribe,
100
  inputs=[
 
1
  import os
2
  import math
3
+ import re
4
  import gradio as gr
5
  import torch
6
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
7
+ from moviepy.editor import VideoFileClip
8
 
9
  def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
10
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
80
  srt_content = ""
81
  for i, sub in enumerate(transcription_srt, 1):
82
  srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
 
83
  srt_content += srt_entry
84
 
85
+ # Remove duplicate captions
86
+ cleaned_srt_content = clean_srt_duplicates(srt_content)
87
+
88
+ # Save SRT content to a file
89
  srt_file_path = "transcription.srt"
90
  with open(srt_file_path, "w", encoding="utf-8") as srt_file:
91
+ srt_file.write(cleaned_srt_content)
92
 
93
  output += f"\nSRT file saved as: {srt_file_path}"
94
 
 
99
  h, m = divmod(m, 60)
100
  return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
101
 
102
def clean_srt_duplicates(srt_content):
    """Remove consecutive duplicate captions from SRT-formatted text.

    The input is parsed into blocks of the form::

        <index>
        HH:MM:SS,mmm --> HH:MM:SS,mmm
        <caption text>
        <blank line>

    A block whose stripped caption text equals the text of the previously
    kept block is dropped. The remaining blocks are renumbered from 1 so the
    output contains no gaps in its sequence numbers.

    Args:
        srt_content: SRT text as a single string.

    Returns:
        The cleaned SRT text. Each kept block ends with a blank line. An
        input with no recognizable blocks yields an empty string.
    """
    timestamp = r"\d{2}:\d{2}:\d{2},\d{3}"
    # The caption body is matched lazily and bounded by a lookahead for the
    # next block header (or end of input). A greedy ``.+`` with DOTALL would
    # swallow every following block into the first match, so no duplicate
    # could ever be detected.
    srt_pattern = re.compile(
        rf"(\d+)\n({timestamp}) --> ({timestamp})\n"
        rf"(.*?)(?=\n\n\d+\n{timestamp} --> |\Z)",
        re.DOTALL,
    )

    cleaned_srt = []
    last_text = None

    for match in srt_pattern.finditer(srt_content):
        _, start_time, end_time, text = match.groups()
        text = text.strip()

        # Skip a caption that merely repeats the one just kept.
        if text == last_text:
            continue

        # Renumber so indices stay sequential after removals.
        new_index = len(cleaned_srt) + 1
        cleaned_srt.append(
            f"{new_index}\n{start_time} --> {end_time}\n{text}\n\n"
        )
        last_text = text

    return "".join(cleaned_srt)
125
+
126
  iface = gr.Interface(
127
  fn=transcribe,
128
  inputs=[