File size: 2,794 Bytes
d3830cc
 
 
e24f20d
98186a7
d3830cc
 
 
e542c59
 
 
 
 
 
 
 
98186a7
e542c59
 
 
 
 
aea35b3
e542c59
 
 
 
faf7c96
e542c59
 
 
98186a7
e542c59
 
 
19a55cd
e542c59
 
 
 
 
 
 
 
 
 
 
bb7d89b
 
813b46e
bb7d89b
e542c59
813b46e
e542c59
 
19a55cd
 
 
e542c59
 
 
 
 
 
 
 
 
813b46e
 
19a55cd
 
 
 
 
e542c59
19a55cd
e542c59
19a55cd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
python interactive.py
"""
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoConfig
from transformers import TextClassificationPipeline
import gradio as gr

# Module-level state: the default checkpoint is loaded once at import time and
# kept in MODEL_BUF so that predict() can reuse it between requests.
MODEL_NAME = 'momo/KcBERT-base_Hate_speech_Privacy_Detection'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=15, problem_type="multi_label_classification"
)

# Cache of the currently active checkpoint: its name plus the loaded objects.
MODEL_BUF = dict(name=MODEL_NAME, tokenizer=tokenizer, model=model)

def change_model_name(name):
    """Replace the checkpoint cached in ``MODEL_BUF`` with ``name``.

    The tokenizer and model are loaded into locals first and only written to
    ``MODEL_BUF`` once both loads succeeded. If either ``from_pretrained`` call
    raises, the cache keeps the previous name, tokenizer and model together,
    instead of advertising the new name while still holding the old objects.

    Args:
        name: Checkpoint identifier passed to ``from_pretrained``.
    """
    new_tokenizer = AutoTokenizer.from_pretrained(name)
    # Same head configuration as the default checkpoint loaded at module level.
    new_model = AutoModelForSequenceClassification.from_pretrained(
        name,
        num_labels=15,
        problem_type="multi_label_classification",
    )

    MODEL_BUF["name"] = name
    MODEL_BUF["tokenizer"] = new_tokenizer
    MODEL_BUF["model"] = new_model

def predict(model_name, text):
    """Classify ``text`` with the requested checkpoint and format the scores.

    Args:
        model_name: Checkpoint identifier selected in the UI. When it differs
            from the name cached in ``MODEL_BUF`` the cache is reloaded through
            ``change_model_name``. A falsy value (nothing selected) keeps the
            currently cached checkpoint.
        text: Input sentence to classify.

    Returns:
        One string containing a ``label: score`` entry per class, joined by
        tab characters. Scores are the per-label sigmoid outputs.
    """
    if model_name and model_name != MODEL_BUF["name"]:
        change_model_name(model_name)

    active_tokenizer = MODEL_BUF["tokenizer"]
    active_model = MODEL_BUF["model"]

    # Class names in model output order: hate-speech categories, "clean",
    # then the personal-information categories.
    unsmile_labels = [
        "여성/가족", "남성", "성소수자", "인종/국적", "연령", "지역", "종교",
        "기타 혐오", "악플/욕설", "clean",
        "이름", "전화번호", "주소", "계좌번호", "주민번호",
    ]

    # Attach readable label names so the pipeline reports them instead of
    # generic LABEL_<n> identifiers.
    active_model.config.id2label = dict(enumerate(unsmile_labels))
    active_model.config.label2id = {
        label: idx for idx, label in enumerate(unsmile_labels)
    }

    pipe = TextClassificationPipeline(
        model=active_model,
        tokenizer=active_tokenizer,
        return_all_scores=True,
        function_to_apply='sigmoid',
    )

    # Run inference exactly once. Depending on the transformers version the
    # result for a single string is either [[{...}, ...]] or [{...}, ...].
    result = pipe(text)
    scores = result[0] if result and isinstance(result[0], list) else result
    print(scores)

    return '\t'.join(
        f"{entry['label']}: {entry['score']:.4f}" for entry in scores
    )

if __name__ == '__main__':
    # Sample inputs offered as clickable examples in the UI.
    exam1 = '경기도 성남시 수정구 태평3동은 우리 동네야!'
    exam2 = '내 핸드폰 번호는 010-3930-8237 이야!'
    exam3 = '아 젠장 너무 짜증난다'

    # Checkpoints selectable from the dropdown.
    model_name_list = [
        'momo/KcELECTRA-base_Hate_speech_Privacy_Detection',
        "momo/KcBERT-base_Hate_speech_Privacy_Detection",
    ]

    # The gr.inputs namespace is a legacy API that newer Gradio releases no
    # longer provide; prefer the top-level component and fall back to the
    # legacy one only when the top-level name is unavailable.
    dropdown_cls = getattr(gr, "Dropdown", None) or gr.inputs.Dropdown

    # Build the Gradio app: a model selector and a text box feeding predict().
    app = gr.Interface(
        fn=predict,
        inputs=[dropdown_cls(model_name_list, label="Model Name"), 'text'],
        outputs='text',
        examples=[
            [MODEL_BUF["name"], exam1],
            [MODEL_BUF["name"], exam2],
            [MODEL_BUF["name"], exam3],
        ],
        title="한국어 혐오표현, 개인정보 판별기 (Korean Hate Speech and Privacy Detection)",
        description="Korean Hate Speech and Privacy Detection. \t 15개 label Detection: 여성/가족, 남성, 성소수자, 인종/국적, 연령, 지역, 종교, 기타 혐오, 악플/욕설, clean, name, number, address, bank, person",
    )
    app.launch()