import torch
import pandas as pd
import re
from flask import Flask, render_template, request, jsonify
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
import io
import sys

# Define model names
bert_model_name = "bert-base-uncased"
hatebert_model_name = "GroNLP/hateBERT"

# Initialize Flask app
app = Flask(__name__)

class CyberbullyingDetector:
    def __init__(self, model_type="bert"):
        if model_type == "bert":
            self.tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(bert_model_name)
        elif model_type == "hatebert":
            self.tokenizer = AutoTokenizer.from_pretrained(hatebert_model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(hatebert_model_name)
        else:
            raise ValueError("Invalid model_type. Choose 'bert' or 'hatebert'.")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Confidence thresholds used to assign the three output labels
        self.cyberbullying_threshold = 0.7
        self.borderline_threshold = 0.4

        # Hiligaynon trigger words/phrases, checked by substring match
        self.trigger_words = [
            'buang', 'pokpok', 'bogo', 'linte', 'tanga', 'diputa', 'salamat', 'Padayon lang', 'mayo gid', 'Nagapasalamat',
            'gago', 'law-ay', 'bilatibay', 'yudipota', 'pangit', 'tikalon', 'tinikal', 'hambog',
            'batinggilan', 'biga-on', 'bulay-ug', 'agi', 'agitot', 'alpot', 'hangag'
        ]

    def find_triggers(self, text):
        # Case-insensitive substring match against the trigger list; lowercase
        # both sides so mixed-case entries (e.g. 'Padayon lang') can still match
        text_lower = text.lower()
        return [word for word in self.trigger_words if word.lower() in text_lower]

    def predict(self, text):
        triggers = self.find_triggers(text)
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=1)
            pred_class = torch.argmax(probs).item()
            confidence = probs[0][pred_class].item()

        # Flag as cyberbullying if the model is confident, predicts the positive
        # class, or any trigger word is present in the text
        if confidence >= self.cyberbullying_threshold or pred_class == 1 or len(triggers) > 0:
            label = "Cyberbullying"
            is_cyberbullying = True
        elif confidence >= self.borderline_threshold:
            label = "Borderline"
            is_cyberbullying = False
        else:
            label = "Safe"
            is_cyberbullying = False

        return {
            "text": text,
            "label": label,
            "confidence": confidence,
            "language": "hil",
            "triggers": triggers,
            "is_cyberbullying": is_cyberbullying
        }

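# A minimal usage sketch of the detector class on its own (illustrative only:
# the sample text and the exact values in the returned dict are assumptions,
# not outputs taken from this app):
#
#   d = CyberbullyingDetector(model_type="hatebert")
#   result = d.predict("buang ka gid")
#   # result is a dict like:
#   # {"text": "...", "label": "Cyberbullying", "confidence": 0.87,
#   #  "language": "hil", "triggers": ["buang"], "is_cyberbullying": True}
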
# Initialize the detector
detector = CyberbullyingDetector(model_type="bert")

@app.route('/')
def index():
    return render_template('index.html', classification_report="Loading...")

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    text = data.get('text', '')
    if not text:
        return jsonify({"error": "No text provided"}), 400

    # Make prediction using the model
    result = detector.predict(text)

    # Placeholder "ground truth": the input is treated as Cyberbullying only if
    # it literally contains the word "cyberbullying", then compared against the
    # model's single prediction
    true_labels = ["Cyberbullying" if "cyberbullying" in t else "Safe" for t in [text]]
    predicted_labels = [result['label']]
    report = classification_report(true_labels, predicted_labels, zero_division=0)

    # Render the template with the classification report
    return render_template('index.html', classification_report=report)

if __name__ == '__main__':
    app.run(debug=True)
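
# Example request against the /predict endpoint (a sketch, assuming the app is
# running locally on Flask's default host/port and the route is registered as
# shown above; the sample text is hypothetical):
#
#   curl -X POST http://127.0.0.1:5000/predict \
#        -H "Content-Type: application/json" \
#        -d '{"text": "buang ka"}'
#
# The response is the rendered index.html page containing the classification report.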