import torch
from flask import Flask, render_template, request, jsonify
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report

# Model names
bert_model_name = "bert-base-uncased"
hatebert_model_name = "GroNLP/hateBERT"

# Initialize Flask app
app = Flask(__name__)


class CyberbullyingDetector:
    def __init__(self, model_type="bert"):
        # Note: neither checkpoint ships with a fine-tuned classification
        # head, so the two-label head created below is randomly initialized
        # and must be fine-tuned before its scores are meaningful.
        if model_type == "bert":
            model_name = bert_model_name
        elif model_type == "hatebert":
            model_name = hatebert_model_name
        else:
            raise ValueError("Invalid model_type. Choose 'bert' or 'hatebert'.")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=2
        )

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

        # Decision thresholds on the softmax probability of the predicted class
        self.cyberbullying_threshold = 0.7
        self.borderline_threshold = 0.4

        # Hiligaynon/Filipino offensive terms used for keyword flagging.
        self.trigger_words = [
            'buang', 'pokpok', 'bogo', 'linte', 'tanga', 'diputa',
            'gago', 'law-ay', 'bilatibay', 'yudipota', 'pangit',
            'tikalon', 'tinikal', 'hambog', 'batinggilan', 'biga-on',
            'bulay-ug', 'agi', 'agitot', 'alpot', 'hangag'
        ]

    def find_triggers(self, text):
        """Return every trigger word that appears in the (lowercased) text."""
        text_lower = text.lower()
        return [word for word in self.trigger_words if word in text_lower]

    def predict(self, text):
        triggers = self.find_triggers(text)

        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=1)
            pred_class = torch.argmax(probs, dim=1).item()
            confidence = probs[0][pred_class].item()

        # Class 1 is treated as the cyberbullying class. A trigger-word hit
        # overrides the model; otherwise the label depends on how confident
        # the model is in the cyberbullying class.
        is_bullying_class = (pred_class == 1)
        if triggers or (is_bullying_class and confidence >= self.cyberbullying_threshold):
            label = "Cyberbullying"
            is_cyberbullying = True
        elif is_bullying_class and confidence >= self.borderline_threshold:
            label = "Borderline"
            is_cyberbullying = False
        else:
            label = "Safe"
            is_cyberbullying = False

        return {
            "text": text,
            "label": label,
            "confidence": confidence,
            "language": "hil",
            "triggers": triggers,
            "is_cyberbullying": is_cyberbullying
        }


# Initialize the detector
detector = CyberbullyingDetector(model_type="bert")


@app.route('/')
def index():
    return render_template('index.html', classification_report="Loading...")


@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    text = data.get('text', '') if data else ''
    if not text:
        return jsonify({"error": "No text provided"}), 400

    # Make a prediction using the model
    result = detector.predict(text)

    # Single-sample classification report. There is no ground truth for an
    # ad-hoc request, so a crude placeholder heuristic stands in: the "true"
    # label is "Cyberbullying" only if the literal word appears in the text.
    # The report is illustrative, not an evaluation.
    true_labels = ["Cyberbullying" if "cyberbullying" in text.lower() else "Safe"]
    predicted_labels = [result['label']]
    result['classification_report'] = classification_report(
        true_labels, predicted_labels, zero_division=0
    )

    # Return JSON, matching the JSON payload this endpoint accepts
    return jsonify(result)


if __name__ == '__main__':
    app.run(debug=True)
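
# --- Illustrative smoke test (a sketch, not part of the app itself) ---
# Uses Flask's built-in test client, so no server has to be running. The
# sample text is an assumption, chosen to contain the trigger word 'pangit';
# with an un-fine-tuned classification head, the trigger-word path is the
# only stable signal, so the expected label is "Cyberbullying".
def _smoke_test():
    client = app.test_client()
    resp = client.post('/predict', json={'text': 'pangit ka gid'})
    print(resp.status_code, resp.get_json())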