Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -13,13 +13,13 @@ os.environ["MACRO_CORRECT_FLAG_CSC_TOKEN"] = "1"
|
|
13 |
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
14 |
os.environ["USE_TORCH"] = "1"
|
15 |
|
16 |
-
from macro_correct.pytorch_textcorrection.tcTools import
|
17 |
from macro_correct import correct_basic
|
18 |
from macro_correct import correct_long
|
19 |
from macro_correct import correct
|
20 |
import gradio as gr
|
21 |
|
22 |
-
|
23 |
|
24 |
# pretrained_model_name_or_path = "shibing624/macbert4csc-base-chinese"
|
25 |
pretrained_model_name_or_path = "Macadam/macbert4mdcspell_v2"
|
@@ -31,9 +31,54 @@ pretrained_model_name_or_path = "Macadam/macbert4mdcspell_v2"
|
|
31 |
# device = torch.device("cuda")
|
32 |
|
33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
def macro_correct(text):
|
35 |
print(text)
|
36 |
-
|
|
|
37 |
text_str = ""
|
38 |
text_list = []
|
39 |
for t in texts:
|
@@ -73,10 +118,16 @@ if __name__ == '__main__':
|
|
73 |
macro_correct,
|
74 |
inputs='text',
|
75 |
outputs='text',
|
76 |
-
title="Chinese Spelling Correction Model Macropodus/
|
77 |
description="Copy or input error Chinese text. Submit and the machine will correct text.",
|
78 |
article="Link to <a href='https://github.com/yongzhuo/macro-correct' style='color:blue;' target='_blank\'>Github REPO: macro-correct</a>",
|
79 |
examples=examples
|
80 |
).launch()
|
81 |
# ).launch(server_name="0.0.0.0", server_port=8066, share=False, debug=True)
|
82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
14 |
os.environ["USE_TORCH"] = "1"
|
15 |
|
16 |
+
from macro_correct.pytorch_textcorrection.tcTools import preprocess_same_with_training
|
17 |
from macro_correct import correct_basic
|
18 |
from macro_correct import correct_long
|
19 |
from macro_correct import correct
|
20 |
import gradio as gr
|
21 |
|
22 |
+
# pyinstaller -F xxxx.py
|
23 |
|
24 |
# pretrained_model_name_or_path = "shibing624/macbert4csc-base-chinese"
|
25 |
pretrained_model_name_or_path = "Macadam/macbert4mdcspell_v2"
|
|
|
31 |
# device = torch.device("cuda")
|
32 |
|
33 |
|
34 |
+
def cut_sent_by_stay_and_maxlen(text, max_len=126, return_length=True):
    """
    Split text into sentences while keeping the original punctuation; a
    sentence that is still longer than max_len is cut into fixed-length pieces.
    Args:
        text: str, sentence of input text;
        max_len: int, maximum length of one returned piece (max_len of training texts);
        return_length: bool, whether to also return the offsets of each piece
    Returns:
        res: List<str> of pieces; when return_length is True a tuple of
             (pieces, offsets) where offsets is a List of [start, end] index
             pairs such that text[start:end] equals the matching piece
    Raises:
        ValueError: if max_len is smaller than 1
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    # NOTE(review): "!", "?" and ";" each appear twice in both literals below,
    # which is redundant as written - confirm whether full-width variants
    # were intended for the second occurrences.
    text_sp = re.split(r"[》)!?。…”;;!?\n]+", text)
    conn_symbol = "!?。…”;;!?》)\n"
    text_length_s = []
    text_cut = []
    len_text = len(text)
    len_global = 0
    for text_sp_i in text_sp:
        len_global_before = len_global
        len_global += len(text_sp_i)
        # re.split dropped the punctuation run that followed this segment;
        # advance over it so the sentence keeps its closing marks.
        while len_global < len_text and text[len_global] in conn_symbol:
            len_global += 1
        text_cut_idx = text[len_global_before:len_global]
        if not text_cut_idx:
            # Empty trailing segment produced by re.split; nothing to emit.
            continue
        if len(text_cut_idx) <= max_len:
            text_length_s.append([len_global_before, len_global])
            text_cut.append(text_cut_idx)
            continue
        # Punctuation alone could not shorten this sentence enough: force-cut
        # only this sentence (not the whole input) into max_len-sized pieces,
        # keeping the offsets relative to the full text.
        for start in range(0, len(text_cut_idx), max_len):
            piece = text_cut_idx[start:start + max_len]
            piece_begin = len_global_before + start
            text_length_s.append([piece_begin, piece_begin + len(piece)])
            text_cut.append(piece)
    if return_length:
        return text_cut, text_length_s
    return text_cut
|
76 |
+
|
77 |
+
|
78 |
def macro_correct(text):
|
79 |
print(text)
|
80 |
+
text = preprocess_same_with_training(text)
|
81 |
+
texts, texts_length = cut_sent_by_stay_and_maxlen(text, return_length=True)
|
82 |
text_str = ""
|
83 |
text_list = []
|
84 |
for t in texts:
|
|
|
118 |
macro_correct,
|
119 |
inputs='text',
|
120 |
outputs='text',
|
121 |
+
title="Chinese Spelling Correction Model Macropodus/macbert4mdcspell_v2",
|
122 |
description="Copy or input error Chinese text. Submit and the machine will correct text.",
|
123 |
article="Link to <a href='https://github.com/yongzhuo/macro-correct' style='color:blue;' target='_blank\'>Github REPO: macro-correct</a>",
|
124 |
examples=examples
|
125 |
).launch()
|
126 |
# ).launch(server_name="0.0.0.0", server_port=8066, share=False, debug=True)
|
127 |
|
128 |
+
"""
|
129 |
+
赤热的阳光烘烤大地,婵鸣撕破树荫的宁净。少年咬下鲜红西瓜,糖汁溶化在沙摊上。孩童举着冰其淋奔跑,浪花打湿嘻闹的脚丫。威风卷起碎花裙摆,牵牛花在篱笆绽方。这个季结,连空气都浸着清谅的甜。
|
130 |
+
炽热的阳光烘烤大地,蝉鸣撕破树荫的宁静。少年咬下鲜红西瓜,糖汁溶化在沙滩上。孩童举着冰淇淋奔跑,浪花打湿嬉闹的脚丫。微风卷起碎花裙摆,牵牛花在篱笆绽放。这个季节,连空气都浸着清凉的甜。
|
131 |
+
|
132 |
+
pyinstaller --onefile app_macro_correct_std.py
|
133 |
+
"""
|