import torch
from flask import Flask, render_template, request, jsonify
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Define model names
bert_model_name = "bert-base-uncased"
hatebert_model_name = "GroNLP/hateBERT"
# Initialize Flask app
app = Flask(__name__)
class CyberbullyingDetector:
    def __init__(self, model_type="bert"):
        # Load the tokenizer and classification model for the chosen backbone
        if model_type == "bert":
            self.tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(bert_model_name)
        elif model_type == "hatebert":
            self.tokenizer = AutoTokenizer.from_pretrained(hatebert_model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(hatebert_model_name)
        else:
            raise ValueError("Invalid model_type. Choose 'bert' or 'hatebert'.")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        # Confidence thresholds used by the three-way labelling in predict()
        self.cyberbullying_threshold = 0.7
        self.borderline_threshold = 0.4
        # Hiligaynon terms treated as explicit triggers
        self.trigger_words = [
            'buang', 'pokpok', 'bogo', 'linte', 'tanga', 'diputa', 'salamat', 'Padayon lang', 'mayo gid', 'Nagapasalamat',
            'gago', 'law-ay', 'bilatibay', 'yudipota', 'pangit', 'tikalon', 'tinikal', 'hambog',
            'batinggilan', 'biga-on', 'bulay-ug', 'agi', 'agitot', 'alpot', 'hangag'
        ]
    def find_triggers(self, text):
        # Case-insensitive substring match against the trigger list; lowering
        # both sides means capitalized entries like 'Padayon lang' still match
        text_lower = text.lower()
        return [word for word in self.trigger_words if word.lower() in text_lower]
    def predict(self, text):
        triggers = self.find_triggers(text)
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding=True
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Convert logits to class probabilities and take the top class
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        pred_class = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred_class].item()
        # Flag as cyberbullying when the model confidently predicts the
        # positive class (index 1), or when any trigger word is present;
        # gating on pred_class avoids mislabelling confident "safe" predictions
        if (pred_class == 1 and confidence >= self.cyberbullying_threshold) or triggers:
            label = "Cyberbullying"
            is_cyberbullying = True
        elif pred_class == 1 and confidence >= self.borderline_threshold:
            label = "Borderline"
            is_cyberbullying = False
        else:
            label = "Safe"
            is_cyberbullying = False
        return {
            "text": text,
            "label": label,
            "confidence": confidence,
            "language": "hil",
            "triggers": triggers,
            "is_cyberbullying": is_cyberbullying
        }
# Initialize the detector
detector = CyberbullyingDetector(model_type="bert")
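# Note: bert-base-uncased (and hateBERT) ship without a fine-tuned sequence
# classification head, so transformers initializes one with random weights.
# The labels above are only meaningful once this points at a checkpoint
# fine-tuned for cyberbullying detection.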
@app.route('/')
def index():
    return render_template('index.html', classification_report="Loading...")
@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    text = data.get('text', '')
    if not text:
        return jsonify({"error": "No text provided"}), 400
    # Run the detector and return its result as JSON, matching the JSON
    # request body and the JSON error response above
    result = detector.predict(text)
    return jsonify(result)
if __name__ == '__main__':
    app.run(debug=True)
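# Example request (a sketch; assumes the server is running locally on Flask's
# default port 5000 and that "buang" from the trigger list appears in the text):
#   curl -X POST http://127.0.0.1:5000/predict \
#        -H "Content-Type: application/json" \
#        -d '{"text": "buang ka"}'
# The response follows the dict returned by CyberbullyingDetector.predict, e.g.:
#   {"text": "buang ka", "label": "Cyberbullying", "confidence": 0.71,
#    "language": "hil", "triggers": ["buang"], "is_cyberbullying": true}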