File size: 3,584 Bytes
1f8de22
 
 
 
 
 
a5e7dd0
1f8de22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5e7dd0
1f8de22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72a0576
1f8de22
 
 
 
 
 
 
 
 
 
 
 
 
72a0576
1f8de22
fe0532d
1f8de22
 
 
 
 
 
 
 
 
72a0576
1f8de22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe0532d
 
 
 
1f8de22
 
eb844ee
1f8de22
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import torch
import torch.nn as nn
import gradio as gr
import numpy as np
import os
import random
from transformers import AutoConfig, AutoModel, AutoTokenizer


# Every tensor and the model itself are placed on the CPU.
device = torch.device('cpu')


# Maps a position in the model's output vector to the category name shown
# to the user.
labels = dict(enumerate((
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
)))

# Checkpoint name passed to every `from_pretrained` call in this file.
MODEL_NAME = 'roberta-base'
# One output per entry in `labels` (6).
NUM_CLASSES = len(labels)

# `max_length` given to the tokenizer when a comment is encoded.
MAX_LEN = 128
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
class ToxicModel(torch.nn.Module):
    """Pretrained transformer followed by a mean-pooled, multi-dropout linear head.

    The encoder is loaded from ``MODEL_NAME``. Its first output is averaged
    over dimension 1, passed through five dropout layers with rates 0.1-0.5,
    and each dropped-out copy is fed to the same linear layer. The five
    results are averaged into ``NUM_CLASSES`` logits per example.
    """

    def __init__(self):
        super().__init__()

        encoder_config = AutoConfig.from_pretrained(MODEL_NAME)
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.1,
            "layer_norm_eps": 1e-7,
            "add_pooling_layer": False,
            "num_labels": NUM_CLASSES,
        }
        encoder_config.update(overrides)

        self.transformer = AutoModel.from_pretrained(MODEL_NAME, config=encoder_config)
        self.dropout = nn.Dropout(encoder_config.hidden_dropout_prob)
        # Registers self.dropout1 ... self.dropout5, in that order.
        for index, rate in enumerate((0.1, 0.2, 0.3, 0.4, 0.5), start=1):
            setattr(self, f"dropout{index}", nn.Dropout(rate))
        self.output = nn.Linear(encoder_config.hidden_size, NUM_CLASSES)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the averaged logits of the five dropout heads.

        All three arguments are forwarded unchanged to the encoder as keyword
        arguments. Presumably they are (batch, seq_len) integer tensors, which
        SOURCE does not enforce (TODO confirm).
        """
        encoded = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # encoded[0] is presumably the last hidden state (TODO confirm). The
        # mean runs over every position in dimension 1; attention_mask is not
        # applied to the pooling.
        pooled = self.dropout(encoded[0].mean(dim=1))

        heads = (
            self.dropout1,
            self.dropout2,
            self.dropout3,
            self.dropout4,
            self.dropout5,
        )
        total = None
        for head in heads:
            head_logits = self.output(head(pooled))
            total = head_logits if total is None else total + head_logits
        return total / len(heads)
 
def inference_fn(model, input_ids=None, attention_mask=None, token_type_ids=None):
    """Run `model` on one example and return its sigmoid scores as a flat array.

    Args:
        model: Module called with three positional tensors; it is switched to
            eval mode before the call.
        input_ids, attention_mask, token_type_ids: Tensors for a single
            example. Each is moved to `device` and given a new leading
            dimension of size 1 before being passed to the model. Despite the
            ``None`` defaults, all three must be tensors.

    Returns:
        A flattened NumPy array holding the sigmoid of the model output.
    """
    model.eval()

    batch = [
        tensor.to(device).unsqueeze(0)
        for tensor in (input_ids, attention_mask, token_type_ids)
    ]

    with torch.no_grad():
        logits = model(*batch)

    scores = torch.sigmoid(logits)
    return scores.detach().cpu().numpy().flatten()
   
_MODEL_WEIGHTS_PATH = "toxicx_model_0.pth"
_MODEL_CACHE = {}


def _get_model():
    """Return the ToxicModel with its weights loaded, building it on first use only."""
    if "model" not in _MODEL_CACHE:
        model = ToxicModel()
        model.load_state_dict(torch.load(_MODEL_WEIGHTS_PATH, map_location=device))
        model.to(device)
        _MODEL_CACHE["model"] = model
    return _MODEL_CACHE["model"]


def predict(comment=None) -> dict:
    """Score a comment against every toxicity category.

    Args:
        comment: Text to classify. ``None`` is treated as an empty string;
            any other value is converted with ``str``. Runs of whitespace
            are collapsed to single spaces before tokenization.

    Returns:
        A dict mapping each category name in ``labels`` to a float score
        (the sigmoid of the corresponding model output).
    """
    text = "" if comment is None else str(comment)
    text = " ".join(text.split())

    # Explicit padding/truncation replaces the deprecated `pad_to_max_length`
    # flag and guarantees that every encoded sequence has exactly MAX_LEN ids.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
        return_token_type_ids=True,
    )

    # No trailing commas here: a trailing comma would wrap each tensor in a
    # 1-tuple, and inference_fn calls `.to(device)` on these objects.
    ids = torch.tensor(inputs["input_ids"], dtype=torch.long)
    mask = torch.tensor(inputs["attention_mask"], dtype=torch.long)
    token_type_ids = torch.tensor(inputs["token_type_ids"], dtype=torch.long)

    # The model is loaded from disk once and reused on later calls.
    predicted = inference_fn(_get_model(), ids, mask, token_type_ids)

    return {name: float(predicted[index]) for index, name in labels.items()}
    

# Serve `predict` through a Gradio interface: one free-text input, one label
# output showing up to NUM_CLASSES categories.
# The legacy `gr.outputs` namespace was removed in Gradio 4, so use the
# top-level `gr.Label` component when it exists and fall back to the legacy
# class on older installs.
_label_output = gr.Label if hasattr(gr, "Label") else gr.outputs.Label

gr.Interface(
    fn=predict,
    inputs='text',
    outputs=_label_output(num_top_classes=NUM_CLASSES),
).launch()