File size: 5,384 Bytes
391e575
cb10a62
391e575
 
 
 
 
 
 
 
 
 
 
 
 
c89f60b
391e575
 
 
760a845
cb10a62
c89f60b
6b2b9b7
391e575
 
 
 
 
 
 
 
cb10a62
 
c89f60b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391e575
6b2b9b7
c89f60b
 
0954181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb10a62
 
 
391e575
cb10a62
 
 
 
391e575
cb10a62
 
 
 
 
391e575
cb10a62
 
c89f60b
cb10a62
 
 
391e575
 
 
c89f60b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time    : 2021/2/29 21:41
# @author  : Mo
# @function: transformers直接加载bert类模型测试


import traceback
import time
import sys
import os
os.environ["MACRO_CORRECT_FLAG_CSC_TOKEN"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["USE_TORCH"] = "1"

from macro_correct.pytorch_textcorrection.tcTools import preprocess_same_with_training
from macro_correct import correct_basic
from macro_correct import correct_long
from macro_correct import correct
import gradio as gr

# pyinstaller -F xxxx.py

# pretrained_model_name_or_path = "shibing624/macbert4csc-base-chinese"
pretrained_model_name_or_path = "Macadam/macbert4mdcspell_v2"
# pretrained_model_name_or_path = "Macropodus/macbert4mdcspell_v1"
# pretrained_model_name_or_path = "Macropodus/macbert4csc_v1"
# pretrained_model_name_or_path = "Macropodus/macbert4csc_v2"
# pretrained_model_name_or_path = "Macropodus/bert4csc_v1"
# device = torch.device("cpu")
# device = torch.device("cuda")


def cut_sent_by_stay_and_maxlen(text, max_len=126, return_length=True):
    """
    分句但是保存原标点符号, 如果长度还是太长的话就切为固定长度的句子
    Args:
        text: str, sentence of input text;
        max_len: int, max_len of traing texts;
        return_length: bool, wether return length or not
    Returns:
        res: List<tuple>
    """
    ### text_sp = re.split(r"!”|?”|。”|……”|”!|”?|”。|”……|》。|)。|!|?|。|…|\!|\?", text)
    text_sp = re.split(r"[》)!?。…”;;!?\n]+", text)
    conn_symbol = "!?。…”;;!?》)\n"
    text_length_s = []
    text_cut = []
    len_text = len(text) - 1
    # signal_symbol = "—”>;?…)‘《’(·》“~,、!。:<"
    len_global = 0
    for idx, text_sp_i in enumerate(text_sp):
        text_cut_idx = text_sp[idx]
        len_global_before = copy.deepcopy(len_global)
        len_global += len(text_sp_i)
        while True:
            if len_global <= len_text and text[len_global] in conn_symbol:
                text_cut_idx += text[len_global]
            else:
                # len_global += 1
                if text_cut_idx:
                    ### 如果标点符号依旧切分不了, 就强行切
                    if len(text_cut_idx) > max_len:
                        text_cut_i, text_length_s_i = cut_sent_by_maxlen(
                            text=text, max_len=max_len, return_length=True)
                        text_length_s.extend(text_length_s_i)
                        text_cut.extend(text_cut_i)
                    else:
                        text_length_s.append([len_global_before, len_global])
                        text_cut.append(text_cut_idx)
                break
            len_global += 1
    if return_length:
        return text_cut, text_length_s
    return text_cut


def macro_correct(text):
    print(text)
    text = preprocess_same_with_training(text)
    texts, texts_length = cut_sent_by_stay_and_maxlen(text, return_length=True)
    text_str = ""
    text_list = []
    for t in texts:
        print(t)
        text_csc = correct_long(t, num_rethink=2, flag_cut=True)
        print(text_csc)
        if text_csc:
            text_list.extend(text_csc)
            text_str += text_csc[0].get("target")
        else:
            text_list.extend([{}])
            text_str += t
    text_str += "\n" + "#" * 32 + "\n"
    for tdx, t in enumerate(text_list):
        if t:
            for tk, tv in t.items():
                if tk == "index":
                    text_str += f"idx: {str(tdx+1)}\n"
                else:
                    text_str += f"{str(tk).strip()}: {str(tv).strip()}\n"
            text_str += "\n"
    return text_str


if __name__ == '__main__':
    print(macro_correct('少先队员因该为老人让坐'))

    examples = [
        "机七学习是人工智能领遇最能体现智能的一个分知",
        "我是练习时长两念半的鸽仁练习生蔡徐坤",
        "真麻烦你了。希望你们好好的跳无",
        "他法语说的很好,的语也不错",
        "遇到一位很棒的奴生跟我疗天",
        "我们为这个目标努力不解",
    ]
    gr.Interface(
        macro_correct,
        inputs='text',
        outputs='text',
        title="Chinese Spelling Correction Model Macropodus/macbert4mdcspell_v2",
        description="Copy or input error Chinese text. Submit and the machine will correct text.",
        article="Link to <a href='https://github.com/yongzhuo/macro-correct' style='color:blue;' target='_blank\'>Github REPO: macro-correct</a>",
        examples=examples
    ).launch()
    # ).launch(server_name="0.0.0.0", server_port=8066, share=False, debug=True)

"""
赤热的阳光烘烤大地,婵鸣撕破树荫的宁净。少年咬下鲜红西瓜,糖汁溶化在沙摊上。孩童举着冰其淋奔跑,浪花打湿嘻闹的脚丫。威风卷起碎花裙摆,牵牛花在篱笆绽方。这个季结,连空气都浸着清谅的甜。
炽热的阳光烘烤大地,蝉鸣撕破树荫的宁静。少年咬下鲜红西瓜,糖汁溶化在沙滩上。孩童举着冰淇淋奔跑,浪花打湿嬉闹的脚丫。微风卷起碎花裙摆,牵牛花在篱笆绽放。这个季节,连空气都浸着清凉的甜。

pyinstaller --onefile app_macro_correct_std.py
"""