File size: 3,757 Bytes
fe3311f
2b470ab
 
 
 
 
 
 
 
fe3311f
2b470ab
fe3311f
2b470ab
fe3311f
 
 
 
2b470ab
fe3311f
 
 
2b470ab
fe3311f
 
 
2b470ab
 
 
 
fe3311f
2b470ab
 
 
 
 
 
 
fe3311f
2b470ab
 
 
fe3311f
2b470ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe3311f
 
 
2b470ab
 
 
 
 
 
 
 
 
 
 
fe3311f
 
 
2b470ab
 
 
 
 
 
 
 
 
 
 
 
 
 
fe3311f
 
 
 
 
2b470ab
 
 
 
 
fe3311f
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from fastapi import FastAPI
from pydantic import BaseModel
from langdetect import detect
import torch
import torch.nn as nn
from transformers import DistilBertModel, AutoModel, AutoTokenizer, DistilBertTokenizer
from huggingface_hub import snapshot_download
import os

# App and device
app = FastAPI()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create safe local cache directory
hf_cache_dir = "./hf_cache"
os.makedirs(hf_cache_dir, exist_ok=True)
os.environ["TRANSFORMERS_CACHE"] = hf_cache_dir

# Download model repositories to local path
english_path = snapshot_download("koyu008/English_Toxic_Classifier", cache_dir=hf_cache_dir)
hinglish_path = snapshot_download("koyu008/Hinglish_comment_classifier", cache_dir=hf_cache_dir)

# ----------------------------
# Model classes
# ----------------------------

class ToxicBERT(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(english_path)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 6)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        return self.classifier(self.dropout(output))


class HinglishToxicClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained(hinglish_path)
        hidden_size = self.bert.config.hidden_size
        self.pool = lambda hidden: torch.cat([
            hidden.mean(dim=1),
            hidden.max(dim=1).values
        ], dim=1)
        self.bottleneck = nn.Sequential(
            nn.Linear(2 * hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        hidden = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        pooled = self.pool(hidden)
        x = self.bottleneck(pooled)
        return self.classifier(x)

# ----------------------------
# Load Models & Tokenizers
# ----------------------------

english_model = ToxicBERT().to(device)
english_model.load_state_dict(torch.load("bert_toxic_classifier.pt", map_location=device))
english_model.eval()
english_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

hinglish_model = HinglishToxicClassifier().to(device)
hinglish_model.load_state_dict(torch.load("best_hinglish_model.pt", map_location=device))
hinglish_model.eval()
hinglish_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# ----------------------------
# API
# ----------------------------

class InputText(BaseModel):
    text: str

@app.post("/predict")
async def predict(input: InputText):
    text = input.text
    lang = detect(text)

    if lang == "en":
        inputs = english_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            logits = english_model(**inputs)
        probs = torch.softmax(logits, dim=1).cpu().numpy().tolist()[0]
        return {
            "language": "english",
            "classes": ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
            "probabilities": probs
        }
    else:
        inputs = hinglish_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            logits = hinglish_model(**inputs)
        probs = torch.softmax(logits, dim=1).cpu().numpy().tolist()[0]
        return {
            "language": "hinglish",
            "classes": ["toxic", "non-toxic"],
            "probabilities": probs
        }