Spaces:
Sleeping
Sleeping
DWizard
committed on
Commit
·
5a7c441
1
Parent(s):
6d9ba90
Add forced replacement of terms with their Chinese translations before translating
Browse files
Former-commit-id: 2b6ec94ed31ea352361c1591a6db9a4b3e775fb3
- finetune_data/dict.csv +173 -0
- pipeline.py +45 -3
finetune_data/dict.csv
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
barracks,兵营
|
| 2 |
+
engineering bay,工程站
|
| 3 |
+
forge,锻炉
|
| 4 |
+
blink,闪现
|
| 5 |
+
evolution chamber,进化腔
|
| 6 |
+
cybernetics core,控制芯核
|
| 7 |
+
enhanced shockwaves,EMP范围
|
| 8 |
+
gravitic boosters,ob速度
|
| 9 |
+
armory,军械库
|
| 10 |
+
robotics bay,机械研究所
|
| 11 |
+
twilight council,光影议会
|
| 12 |
+
fusion core,聚变芯体
|
| 13 |
+
fleet beacon,舰队航标
|
| 14 |
+
factory,重工厂
|
| 15 |
+
ghost academy,幽灵军校
|
| 16 |
+
infestation pit,感染深渊
|
| 17 |
+
robotics facility,机械台
|
| 18 |
+
stargate,星门
|
| 19 |
+
starport,星港
|
| 20 |
+
archon,白球
|
| 21 |
+
smart servos,变形加速
|
| 22 |
+
gateway,兵营
|
| 23 |
+
warpgate,兵营
|
| 24 |
+
immortal,不朽
|
| 25 |
+
zealot,叉叉
|
| 26 |
+
nydus network,虫洞
|
| 27 |
+
nydus worm,虫洞
|
| 28 |
+
hydralisk,刺蛇
|
| 29 |
+
grooved spines,刺蛇射程
|
| 30 |
+
muscular augments,刺蛇速度
|
| 31 |
+
hydralisk den,刺蛇塔
|
| 32 |
+
planetary fortress,大地堡
|
| 33 |
+
battle cruiser,大和
|
| 34 |
+
weapon refit,大和炮
|
| 35 |
+
brood lord,大龙
|
| 36 |
+
greater spire,大龙塔
|
| 37 |
+
anabolic synthesis,大牛速度
|
| 38 |
+
cyclone,导弹车
|
| 39 |
+
bunker,地堡
|
| 40 |
+
lurker,地刺
|
| 41 |
+
seismic spines,地刺射程
|
| 42 |
+
adaptive talons,地刺速埋
|
| 43 |
+
lurker den,地刺塔
|
| 44 |
+
widow mine,地雷
|
| 45 |
+
ground carapace,地面单位甲壳等级
|
| 46 |
+
high templar,电兵
|
| 47 |
+
shield battery,电池
|
| 48 |
+
observer,叮当
|
| 49 |
+
baneling,毒爆
|
| 50 |
+
centrifugal hooks,毒爆速度
|
| 51 |
+
baneling nest,毒爆塔
|
| 52 |
+
raven,渡鸦
|
| 53 |
+
combat shield,盾
|
| 54 |
+
shield,盾
|
| 55 |
+
lair,二本
|
| 56 |
+
missile turret,防空
|
| 57 |
+
spore crawler,防空
|
| 58 |
+
supply depot,房子
|
| 59 |
+
overlord,房子
|
| 60 |
+
pneumatized carapace,房子速度
|
| 61 |
+
mutalisk,飞龙
|
| 62 |
+
spire,飞龙塔
|
| 63 |
+
viper,飞蛇
|
| 64 |
+
flyer attacks,飞行生物攻击等级
|
| 65 |
+
flyer carapace,飞行生物甲壳等级
|
| 66 |
+
tempest,风暴
|
| 67 |
+
tectonic destabilizers,风暴伤害
|
| 68 |
+
phoenix,凤凰
|
| 69 |
+
anion pulse-crystals,凤凰射程
|
| 70 |
+
corruptor,腐化
|
| 71 |
+
infestor,感染虫
|
| 72 |
+
pathogen glands,感染能量
|
| 73 |
+
zergling,狗
|
| 74 |
+
spawning pool,狗池
|
| 75 |
+
metabolic boost,狗速
|
| 76 |
+
spine crawler,管子
|
| 77 |
+
marauder,光头
|
| 78 |
+
ghost,鬼兵
|
| 79 |
+
arm silo with nuke,核弹
|
| 80 |
+
carrier,黄金舰队
|
| 81 |
+
hellion,火车
|
| 82 |
+
hellbat,火车侠
|
| 83 |
+
ravager,火蟑螂
|
| 84 |
+
nexus,基地
|
| 85 |
+
hatchery,基地
|
| 86 |
+
command center,基地
|
| 87 |
+
neosteel armor,建筑护甲
|
| 88 |
+
hi-sec auto tracking,建筑射程
|
| 89 |
+
ship weapons,舰船武器等级
|
| 90 |
+
charge,脚速
|
| 91 |
+
liberator,解放
|
| 92 |
+
advanced ballistics,解放射程
|
| 93 |
+
melee attacks,近战攻击等级
|
| 94 |
+
colossus,巨像
|
| 95 |
+
extended thermal lance,巨像射程
|
| 96 |
+
creep tumor,菌毯
|
| 97 |
+
tech lab,科技挂件
|
| 98 |
+
air armor,空中单位护甲等级
|
| 99 |
+
air weapons,空中单位武器等级
|
| 100 |
+
adrenal glands,狂狗
|
| 101 |
+
mule,矿螺
|
| 102 |
+
infernal pre-igniter,蓝火
|
| 103 |
+
thor,雷神
|
| 104 |
+
warp prism,棱镜
|
| 105 |
+
gravitic drive,棱镜速度
|
| 106 |
+
dragoon,龙骑士
|
| 107 |
+
cocoon,卵
|
| 108 |
+
larva,卵
|
| 109 |
+
mothership,妈妈船
|
| 110 |
+
burrow,埋地
|
| 111 |
+
changeling,拟态虫
|
| 112 |
+
ultralisk,牛
|
| 113 |
+
chitinous plating,牛甲
|
| 114 |
+
ultralisk cavern,牛塔
|
| 115 |
+
drone,农民
|
| 116 |
+
scv,农民
|
| 117 |
+
queen,女王
|
| 118 |
+
banshee,女妖
|
| 119 |
+
hyperflight rotors,女妖提速
|
| 120 |
+
photon cannon,炮台
|
| 121 |
+
missile attacks,喷射攻击等级
|
| 122 |
+
assimilator,气矿
|
| 123 |
+
extractor,气矿
|
| 124 |
+
refinery,气矿
|
| 125 |
+
roach,钱赞企
|
| 126 |
+
marine,枪兵
|
| 127 |
+
sensor tower,圈
|
| 128 |
+
infantry armor,人族防
|
| 129 |
+
infantry weapons,人族攻
|
| 130 |
+
hive,三本
|
| 131 |
+
psionic storm,闪电
|
| 132 |
+
templar archives,闪电塔
|
| 133 |
+
sentry,哨兵
|
| 134 |
+
ground armor,神族防
|
| 135 |
+
ground weapons,神族攻
|
| 136 |
+
adept,使徒
|
| 137 |
+
resonating glaives,使徒攻速
|
| 138 |
+
reactor,双倍挂件
|
| 139 |
+
pylon,水晶
|
| 140 |
+
reaper,死神
|
| 141 |
+
drilling claws,速埋
|
| 142 |
+
swarm host,宿主
|
| 143 |
+
mag-field accelerator,锁定增伤
|
| 144 |
+
siege tank,坦克
|
| 145 |
+
probe,探机
|
| 146 |
+
corvid reactor,铁鸦能量
|
| 147 |
+
neural parasite,同化完成
|
| 148 |
+
viking,维京
|
| 149 |
+
oracle,先知
|
| 150 |
+
broodling,小虫子
|
| 151 |
+
locust,小虫子
|
| 152 |
+
mothership core,小妈妈船
|
| 153 |
+
orbital command,星轨
|
| 154 |
+
stimpack,兴奋剂
|
| 155 |
+
void ray,虚空
|
| 156 |
+
flux vanes,虚空速度
|
| 157 |
+
overseer,眼虫
|
| 158 |
+
ignite afterburners,医疗机速度
|
| 159 |
+
dark templar,隐刀
|
| 160 |
+
shadow stride,隐刀闪现
|
| 161 |
+
dark shrine,隐刀塔
|
| 162 |
+
cloaking field,隐形
|
| 163 |
+
personal cloaking,隐形
|
| 164 |
+
medivac dropship,运输机
|
| 165 |
+
vehicle and ship plating,战车及舰船钢板等级
|
| 166 |
+
vehicle weapons,战车武器等级
|
| 167 |
+
war hound,战狼
|
| 168 |
+
roach warren,蟑螂巢
|
| 169 |
+
tunneling claws,蟑螂埋地
|
| 170 |
+
glial reconstitution,蟑螂速度
|
| 171 |
+
concussive shells,震撼弹
|
| 172 |
+
stalker,追猎
|
| 173 |
+
disruptor,自爆球
|
pipeline.py
CHANGED
|
@@ -89,7 +89,7 @@ if not os.path.exists(f'{RESULT_PATH}/{VIDEO_NAME}'):
|
|
| 89 |
# Instead of using the script_en variable directly, we'll use script_input
|
| 90 |
srt_file_en = args.srt_file
|
| 91 |
if srt_file_en is not None:
|
| 92 |
-
with open(srt_file_en, 'r') as f:
|
| 93 |
script_input = f.read()
|
| 94 |
else:
|
| 95 |
# using whisper to perform speech-to-text and save it in <video name>_en.txt under RESULT PATH.
|
|
@@ -110,7 +110,7 @@ else:
|
|
| 110 |
writer.write_result(transcript, srt)
|
| 111 |
|
| 112 |
# split the video script(open ai prompt limit: about 5000)
|
| 113 |
-
with open(srt_file_en, 'r') as f:
|
| 114 |
script_en = f.read()
|
| 115 |
script_input = script_en
|
| 116 |
|
|
@@ -119,9 +119,51 @@ if not args.only_srt:
|
|
| 119 |
assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
|
| 120 |
print('ASS subtitle saved as: ' + assSub_en)
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
# Split the video script by sentences and create chunks within the token limit
|
| 123 |
n_threshold = 1500 # Token limit for the GPT-3 model
|
| 124 |
-
script_split =
|
| 125 |
|
| 126 |
script_arr = []
|
| 127 |
script = ""
|
|
|
|
| 89 |
# Instead of using the script_en variable directly, we'll use script_input
|
| 90 |
srt_file_en = args.srt_file
|
| 91 |
if srt_file_en is not None:
|
| 92 |
+
with open(srt_file_en, 'r', encoding='utf-8') as f:
|
| 93 |
script_input = f.read()
|
| 94 |
else:
|
| 95 |
# using whisper to perform speech-to-text and save it in <video name>_en.txt under RESULT PATH.
|
|
|
|
| 110 |
writer.write_result(transcript, srt)
|
| 111 |
|
| 112 |
# split the video script(open ai prompt limit: about 5000)
|
| 113 |
+
with open(srt_file_en, 'r', encoding='utf-8') as f:
|
| 114 |
script_en = f.read()
|
| 115 |
script_input = script_en
|
| 116 |
|
|
|
|
| 119 |
assSub_en = srt2ass(srt_file_en, "default", "No", "Modest")
|
| 120 |
print('ASS subtitle saved as: ' + assSub_en)
|
| 121 |
|
| 122 |
+
# Force-translate StarCraft II terms into Chinese according to the dictionary
|
| 123 |
+
# TODO: translate abbreviations as well, e.g. VA, ob
|
| 124 |
+
# TODO: variety of translation
|
| 125 |
+
from csv import reader
|
| 126 |
+
import re
|
| 127 |
+
|
| 128 |
+
# read dict
|
| 129 |
+
# Load the English -> Chinese glossary used for forced term translation.
# 'utf-8-sig' reads the file with or without a BOM, so a BOM can never end up
# inside the first key; newline='' is what the csv module expects for files
# passed to csv.reader.
with open("finetune_data/dict.csv", 'r', encoding='utf-8-sig', newline='') as f:
    csv_reader = reader(f)
    term_dict = {}
    for rows in csv_reader:
        # Blank lines yield [] and malformed lines may have a single column;
        # skip them instead of failing with IndexError.
        if len(rows) < 2:
            continue
        term = rows[0].strip().lower()
        if not term:
            continue
        # Keys are stored lowercase because lookups lowercase the script text.
        # As before, a later duplicate overrides an earlier one.
        term_dict[term] = rows[1]
|
| 132 |
+
|
| 133 |
+
# Matches a run of digits, a newline, any 25 non-newline characters, a comma
# and three digits -- the shape of an SRT cue header such as
# "12\n00:00:01,000 --> 00:00:02,500". Written as a raw string so the regex
# escapes need no extra backslashes, and compiled once instead of per call.
_SRT_CUE_HEADER = re.compile(r'[0-9]+\n.{25},[0-9]{3}')


def clean_timestamp(lines):
    """Replace every cue header in *lines* with the marker ``_-_``.

    Args:
        lines: Subtitle text as one string (despite the name, not a list).

    Returns:
        The text with each "index line + timing line" pair replaced by
        ``_-_``; text without such headers is returned unchanged.
    """
    new_lines = _SRT_CUE_HEADER.sub('_-_', lines)
    # The original implementation echoes the cleaned text; kept so the
    # observable output of existing callers does not change.
    print(new_lines)
    return new_lines
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def _force_translate_terms(text, terms):
    """Return *text* with every glossary term replaced by its translation.

    Terms are matched case-insensitively and only as whole terms (not inside
    a longer word), wherever they occur: before punctuation, at a line end,
    or in the middle of a sentence. Multi-word terms such as "siege tank" are
    matched too, which a word-by-word scan of space-separated tokens cannot
    do. Everything else in the text, including line breaks, is left as is.

    Args:
        text: The script to process.
        terms: Mapping of source term -> replacement string.

    Returns:
        The text with all glossary terms replaced.
    """
    lookup = {key.lower(): value for key, value in terms.items() if key}
    if not text or not lookup:
        return text
    # Longest terms first so "hydralisk den" wins over "hydralisk".
    alternatives = sorted(lookup, key=len, reverse=True)
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(map(re.escape, alternatives)) + r')(?!\w)',
        re.IGNORECASE,
    )
    # A function replacement keeps backslashes in translations literal; the
    # fallback returns the matched text if case folding finds no key.
    return pattern.sub(
        lambda match: lookup.get(match.group(0).lower(), match.group(0)),
        text,
    )


script_input_withForceTerm = _force_translate_terms(script_input, term_dict)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
# Split the video script by sentences and create chunks within the token limit
|
| 165 |
n_threshold = 1500 # Token limit for the GPT-3 model
|
| 166 |
+
script_split = script_input_withForceTerm.split('.')
|
| 167 |
|
| 168 |
script_arr = []
|
| 169 |
script = ""
|