from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel, AutoModel, AutoTokenizer
from langdetect import detect
from huggingface_hub import snapshot_download
import os


# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Download model repos from HF Hub
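# (snapshot_download caches each repo locally and returns its filesystem path)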
english_repo = snapshot_download("koyu008/English_Toxic_Classifier")
hinglish_repo = snapshot_download("koyu008/HInglish_comment_classifier")

# Tokenizers
english_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
hinglish_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")


# English Model
class ToxicBERT(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 6)

    def forward(self, input_ids, attention_mask):
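        # First-token ([CLS]) hidden state is used as the sequence embedding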
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        return self.classifier(self.dropout(output))


# Hinglish Model
class HinglishToxicClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained("xlm-roberta-base")
        hidden_size = self.bert.config.hidden_size
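        # Parameter-free pooling: concatenate mean- and max-pooled token
        # states, doubling the feature size passed to the bottleneck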
        self.pool = lambda hidden: torch.cat([
            hidden.mean(dim=1),
            hidden.max(dim=1).values
        ], dim=1)
        self.bottleneck = nn.Sequential(
            nn.Linear(2 * hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        hidden = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        pooled = self.pool(hidden)
        x = self.bottleneck(pooled)
        return self.classifier(x)


# Instantiate and load models
english_model = ToxicBERT().to(device)
english_model.load_state_dict(torch.load(os.path.join(english_repo, "bert_toxic_classifier.pt"), map_location=device))
english_model.eval()

hinglish_model = HinglishToxicClassifier().to(device)
hinglish_model.load_state_dict(torch.load(os.path.join(hinglish_repo, "best_hinglish_model.pt"), map_location=device))
hinglish_model.eval()

# Labels
english_labels = ['toxic', 'severe toxic', 'obscene', 'threat', 'insult', 'identity hate']
hinglish_labels = ['not toxic', 'toxic']
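# English head is multi-label (independent sigmoid per label); Hinglish head
# is a single binary decision (softmax over two classes)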

# FastAPI
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Or restrict to your frontend domain
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class TextIn(BaseModel):
    text: str


@app.post("/api/predict")
def predict(data: TextIn):
    text = data.text
    try:
        lang = detect(text)
    except Exception:
        # langdetect raises LangDetectException on empty/undetectable input
        lang = "unknown"

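    # Detected English goes to the multi-label English model; everything else
    # (including detection failures) falls back to the Hinglish classifier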
    if lang == "en":
        tokenizer = english_tokenizer
        model = english_model
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.sigmoid(outputs).squeeze().cpu().tolist()
        return {"language": "English", "predictions": dict(zip(english_labels, probs))}

    else:
        tokenizer = hinglish_tokenizer
        model = hinglish_model
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs, dim=1).squeeze().cpu().tolist()
        return {"language": "Hinglish", "predictions": dict(zip(hinglish_labels, probs))}

@app.get("/")
def root():
    return {"message": "API is running"}