Spaces:
Sleeping
Sleeping
Eason Lu
commited on
Commit
·
6113bd9
1
Parent(s):
cf5f1c9
merge segments
Browse filesFormer-commit-id: 3b73651a94d5dac62b0c7577f59b3d59509839f9
- SRT.py +60 -15
- pipeline.py +13 -7
SRT.py
CHANGED
|
@@ -8,26 +8,31 @@ class SRT_segment(object):
|
|
| 8 |
segment = args[0]
|
| 9 |
self.start_time_str = str(0)+str(timedelta(seconds=int(segment['start'])))+',000'
|
| 10 |
self.end_time_str = str(0)+str(timedelta(seconds=int(segment['end'])))+',000'
|
| 11 |
-
self.segment_id = segment['id']+1
|
| 12 |
self.source_text = segment['text']
|
| 13 |
self.duration = f"{self.start_time_str} --> {self.end_time_str}"
|
| 14 |
self.translation = ""
|
| 15 |
elif isinstance(args[0], list):
|
| 16 |
-
self.segment_id = args[0][0]
|
| 17 |
self.source_text = args[0][2]
|
| 18 |
self.duration = args[0][1]
|
| 19 |
-
self.start_time_str = self.duration.split("-->")[0]
|
| 20 |
-
self.end_time_str = self.duration.split("-->")[1]
|
| 21 |
self.translation = ""
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def __str__(self) -> str:
|
| 24 |
-
return f'{self.
|
| 25 |
|
| 26 |
def get_trans_str(self) -> str:
|
| 27 |
-
return f'{self.
|
| 28 |
|
| 29 |
def get_bilingual_str(self) -> str:
|
| 30 |
-
return f'{self.
|
| 31 |
|
| 32 |
class SRT_script():
|
| 33 |
def __init__(self, segments) -> None:
|
|
@@ -48,42 +53,82 @@ class SRT_script():
|
|
| 48 |
|
| 49 |
return cls(segments)
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
def set_translation(self, translate:str, id_range:tuple):
|
| 52 |
start_seg_id = id_range[0]
|
| 53 |
end_seg_id = id_range[1]
|
| 54 |
|
| 55 |
lines = translate.split('\n\n')
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
| 59 |
|
| 60 |
for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id]):
|
| 61 |
seg.translation = lines[i]
|
| 62 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
def get_source_only(self):
|
| 65 |
# return a string with pure source text
|
| 66 |
result = ""
|
| 67 |
-
for seg in self.segments:
|
| 68 |
-
result+=f'{seg.source_text}\n\n'
|
| 69 |
|
| 70 |
return result
|
| 71 |
|
| 72 |
def reform_src_str(self):
|
| 73 |
result = ""
|
| 74 |
-
for seg in self.segments:
|
|
|
|
| 75 |
result += str(seg)
|
| 76 |
return result
|
| 77 |
|
| 78 |
def reform_trans_str(self):
|
| 79 |
result = ""
|
| 80 |
-
for seg in self.segments:
|
|
|
|
| 81 |
result += seg.get_trans_str()
|
| 82 |
return result
|
| 83 |
|
| 84 |
def form_bilingual_str(self):
|
| 85 |
result = ""
|
| 86 |
-
for seg in self.segments:
|
|
|
|
| 87 |
result += seg.get_bilingual_str()
|
| 88 |
return result
|
| 89 |
|
|
|
|
| 8 |
segment = args[0]
|
| 9 |
self.start_time_str = str(0)+str(timedelta(seconds=int(segment['start'])))+',000'
|
| 10 |
self.end_time_str = str(0)+str(timedelta(seconds=int(segment['end'])))+',000'
|
|
|
|
| 11 |
self.source_text = segment['text']
|
| 12 |
self.duration = f"{self.start_time_str} --> {self.end_time_str}"
|
| 13 |
self.translation = ""
|
| 14 |
elif isinstance(args[0], list):
|
|
|
|
| 15 |
self.source_text = args[0][2]
|
| 16 |
self.duration = args[0][1]
|
| 17 |
+
self.start_time_str = self.duration.split(" --> ")[0]
|
| 18 |
+
self.end_time_str = self.duration.split(" --> ")[1]
|
| 19 |
self.translation = ""
|
| 20 |
|
| 21 |
+
def merge_seg(self, seg):
|
| 22 |
+
self.source_text += seg.source_text
|
| 23 |
+
self.translation += seg.translation
|
| 24 |
+
self.end_time_str = seg.end_time_str
|
| 25 |
+
self.duration = f"{self.start_time_str} --> {self.end_time_str}"
|
| 26 |
+
pass
|
| 27 |
+
|
| 28 |
def __str__(self) -> str:
|
| 29 |
+
return f'{self.duration}\n{self.source_text}\n\n'
|
| 30 |
|
| 31 |
def get_trans_str(self) -> str:
|
| 32 |
+
return f'{self.duration}\n{self.translation}\n\n'
|
| 33 |
|
| 34 |
def get_bilingual_str(self) -> str:
|
| 35 |
+
return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'
|
| 36 |
|
| 37 |
class SRT_script():
|
| 38 |
def __init__(self, segments) -> None:
|
|
|
|
| 53 |
|
| 54 |
return cls(segments)
|
| 55 |
|
| 56 |
+
def merge_segs(self, idx_list) -> SRT_segment:
|
| 57 |
+
final_seg = self.segments[idx_list[0]]
|
| 58 |
+
if len(idx_list) == 1:
|
| 59 |
+
return final_seg
|
| 60 |
+
|
| 61 |
+
for idx in range(1, len(idx_list)):
|
| 62 |
+
final_seg.merge_seg(self.segments[idx_list[idx]])
|
| 63 |
+
|
| 64 |
+
return final_seg
|
| 65 |
+
|
| 66 |
+
def form_whole_sentence(self):
|
| 67 |
+
merge_list = [] # a list of indices that should be merged e.g. [[0], [2, 3, 4], [5, 6], [7]]
|
| 68 |
+
sentence = []
|
| 69 |
+
for i, seg in enumerate(self.segments):
|
| 70 |
+
if seg.source_text[-1] == '.':
|
| 71 |
+
sentence.append(i)
|
| 72 |
+
merge_list.append(sentence)
|
| 73 |
+
sentence = []
|
| 74 |
+
else:
|
| 75 |
+
sentence.append(i)
|
| 76 |
+
|
| 77 |
+
segments = []
|
| 78 |
+
for idx_list in merge_list:
|
| 79 |
+
segments.append(self.merge_segs(idx_list))
|
| 80 |
+
|
| 81 |
+
self.segments = segments # need memory release?
|
| 82 |
+
|
| 83 |
def set_translation(self, translate:str, id_range:tuple):
|
| 84 |
start_seg_id = id_range[0]
|
| 85 |
end_seg_id = id_range[1]
|
| 86 |
|
| 87 |
lines = translate.split('\n\n')
|
| 88 |
+
if len(lines) != (end_seg_id - start_seg_id + 1):
|
| 89 |
+
print(id_range)
|
| 90 |
+
for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id]):
|
| 91 |
+
print(seg.source_text)
|
| 92 |
+
print(translate)
|
| 93 |
|
| 94 |
for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id]):
|
| 95 |
seg.translation = lines[i]
|
| 96 |
pass
|
| 97 |
+
|
| 98 |
+
def split_seg(self, seg_id):
|
| 99 |
+
# TODO: evenly split seg to 2 parts and add new seg into self.segments
|
| 100 |
+
pass
|
| 101 |
+
|
| 102 |
+
def check_len_and_split(self, threshold):
|
| 103 |
+
# TODO: if sentence length >= threshold, split this segments to two
|
| 104 |
+
pass
|
| 105 |
|
| 106 |
def get_source_only(self):
|
| 107 |
# return a string with pure source text
|
| 108 |
result = ""
|
| 109 |
+
for i, seg in enumerate(self.segments):
|
| 110 |
+
result+=f'SENTENCE {i+1}: {seg.source_text}\n\n\n'
|
| 111 |
|
| 112 |
return result
|
| 113 |
|
| 114 |
def reform_src_str(self):
|
| 115 |
result = ""
|
| 116 |
+
for i, seg in enumerate(self.segments):
|
| 117 |
+
result += f'{i+1}\n'
|
| 118 |
result += str(seg)
|
| 119 |
return result
|
| 120 |
|
| 121 |
def reform_trans_str(self):
|
| 122 |
result = ""
|
| 123 |
+
for i, seg in enumerate(self.segments):
|
| 124 |
+
result += f'{i+1}\n'
|
| 125 |
result += seg.get_trans_str()
|
| 126 |
return result
|
| 127 |
|
| 128 |
def form_bilingual_str(self):
|
| 129 |
result = ""
|
| 130 |
+
for i, seg in enumerate(self.segments):
|
| 131 |
+
result += f'{i+1}\n'
|
| 132 |
result += seg.get_bilingual_str()
|
| 133 |
return result
|
| 134 |
|
pipeline.py
CHANGED
|
@@ -88,8 +88,6 @@ if not os.path.exists(f'{RESULT_PATH}/{VIDEO_NAME}'):
|
|
| 88 |
srt_file_en = args.srt_file
|
| 89 |
|
| 90 |
if srt_file_en is not None:
|
| 91 |
-
# with open(srt_file_en, 'r', encoding='utf-8') as f:
|
| 92 |
-
# script_input = f.read()
|
| 93 |
srt = SRT_script.parse_from_srt_file(srt_file_en)
|
| 94 |
script_input = srt.get_source_only()
|
| 95 |
else:
|
|
@@ -106,12 +104,20 @@ else:
|
|
| 106 |
|
| 107 |
# use stable-whisper
|
| 108 |
model = stable_whisper.load_model('base')
|
| 109 |
-
transcript = model.transcribe(audio_path)
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
transcript = transcript.to_dict()
|
| 112 |
srt = SRT_script(transcript['segments']) # read segments to SRT class
|
|
|
|
| 113 |
script_input = srt.get_source_only()
|
| 114 |
-
|
| 115 |
#Write SRT file
|
| 116 |
|
| 117 |
# from whisper.utils import WriteSRT
|
|
@@ -168,7 +174,7 @@ if not args.only_srt:
|
|
| 168 |
|
| 169 |
# script_input_withForceTerm = re.sub('\n ', '\n', "".join(ready_words))
|
| 170 |
|
| 171 |
-
srt.correct_with_force_term()
|
| 172 |
|
| 173 |
# Split the video script by sentences and create chunks within the token limit
|
| 174 |
def script_split(script_in, chunk_size = 1000):
|
|
@@ -199,8 +205,8 @@ script_arr, range_arr = script_split(script_input)
|
|
| 199 |
|
| 200 |
# Translate and save
|
| 201 |
for s, range in tqdm(zip(script_arr, range_arr)):
|
| 202 |
-
print(s)
|
| 203 |
# using chatgpt model
|
|
|
|
| 204 |
if model_name == "gpt-3.5-turbo":
|
| 205 |
# print(s + "\n")
|
| 206 |
response = openai.ChatCompletion.create(
|
|
|
|
| 88 |
srt_file_en = args.srt_file
|
| 89 |
|
| 90 |
if srt_file_en is not None:
|
|
|
|
|
|
|
| 91 |
srt = SRT_script.parse_from_srt_file(srt_file_en)
|
| 92 |
script_input = srt.get_source_only()
|
| 93 |
else:
|
|
|
|
| 104 |
|
| 105 |
# use stable-whisper
|
| 106 |
model = stable_whisper.load_model('base')
|
| 107 |
+
transcript = model.transcribe(audio_path, regroup = False)
|
| 108 |
+
(
|
| 109 |
+
transcript
|
| 110 |
+
.split_by_punctuation(['.', '。', '?'])
|
| 111 |
+
.merge_by_gap(.15, max_words=3)
|
| 112 |
+
.merge_by_punctuation([' '])
|
| 113 |
+
.split_by_punctuation(['.', '。', '?'])
|
| 114 |
+
)
|
| 115 |
+
# transcript.to_srt_vtt(srt_file_en)
|
| 116 |
transcript = transcript.to_dict()
|
| 117 |
srt = SRT_script(transcript['segments']) # read segments to SRT class
|
| 118 |
+
srt.form_whole_sentence()
|
| 119 |
script_input = srt.get_source_only()
|
| 120 |
+
srt.write_srt_file_src(srt_file_en)
|
| 121 |
#Write SRT file
|
| 122 |
|
| 123 |
# from whisper.utils import WriteSRT
|
|
|
|
| 174 |
|
| 175 |
# script_input_withForceTerm = re.sub('\n ', '\n', "".join(ready_words))
|
| 176 |
|
| 177 |
+
# srt.correct_with_force_term()
|
| 178 |
|
| 179 |
# Split the video script by sentences and create chunks within the token limit
|
| 180 |
def script_split(script_in, chunk_size = 1000):
|
|
|
|
| 205 |
|
| 206 |
# Translate and save
|
| 207 |
for s, range in tqdm(zip(script_arr, range_arr)):
|
|
|
|
| 208 |
# using chatgpt model
|
| 209 |
+
print(f"now translating sentences {range}")
|
| 210 |
if model_name == "gpt-3.5-turbo":
|
| 211 |
# print(s + "\n")
|
| 212 |
response = openai.ChatCompletion.create(
|