Spaces:

a77an
/

cyberbully_research

Sleeping

File size: 3,793 Bytes

f4b426e

import torch
import pandas as pd
import re
from flask import Flask, render_template, request, jsonify
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
import io
import sys

# Checkpoint identifiers handed to `from_pretrained` by CyberbullyingDetector,
# one per supported `model_type`.
bert_model_name = "bert-base-uncased"
hatebert_model_name = "GroNLP/hateBERT"

# Flask application object; the route decorators below register on it.
app = Flask(__name__)

class CyberbullyingDetector:
    """Label text as cyberbullying using a sequence classifier plus a keyword list.

    A text is flagged when the classifier predicts the cyberbullying class,
    when that class's probability reaches ``cyberbullying_threshold``, or when
    the text contains any entry of ``trigger_words``.
    """

    # Index of the cyberbullying class in the classifier's output logits.
    _CYBERBULLYING_CLASS = 1

    def __init__(self, model_type="bert"):
        """Load the tokenizer and classifier for ``model_type``.

        Args:
            model_type: ``"bert"`` or ``"hatebert"``.

        Raises:
            ValueError: If ``model_type`` is not one of the supported names.
        """
        model_names = {
            "bert": bert_model_name,
            "hatebert": hatebert_model_name,
        }
        if model_type not in model_names:
            raise ValueError("Invalid model_type. Choose 'bert' or 'hatebert'.")
        model_name = model_names[model_type]

        # NOTE(review): both names look like base checkpoints rather than
        # checkpoints fine-tuned for this task -- confirm the classification
        # head produces meaningful scores.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        # Inference only: make sure dropout is disabled.
        self.model.eval()

        # Thresholds applied to the probability of the cyberbullying class.
        self.cyberbullying_threshold = 0.7
        self.borderline_threshold = 0.4
        # NOTE(review): some entries ('salamat', 'Padayon lang', 'mayo gid',
        # 'Nagapasalamat') look like positive phrases -- confirm they belong
        # in a list whose matches force the "Cyberbullying" label.
        self.trigger_words = [
            'buang', 'pokpok', 'bogo', 'linte', 'tanga', 'diputa', 'salamat', 'Padayon lang', 'mayo gid', 'Nagapasalamat',
            'gago', 'law-ay', 'bilatibay', 'yudipota', 'pangit', 'tikalon', 'tinikal', 'hambog',
            'batinggilan', 'biga-on', 'bulay-ug', 'agi', 'agitot', 'alpot', 'hangag'
        ]

    def find_triggers(self, text):
        """Return the trigger words contained in ``text``, ignoring case.

        Both the text and each trigger word are lowercased before comparing,
        so entries written with capitals can match. Entries are returned with
        their original spelling, in list order.

        Args:
            text: The string to scan.

        Returns:
            A list of matching entries from ``self.trigger_words``.
        """
        # NOTE(review): this is substring matching, so short entries such as
        # 'agi' also match inside longer, unrelated words -- confirm whether
        # whole-word matching is wanted.
        text_lower = text.lower()
        return [word for word in self.trigger_words if word.lower() in text_lower]

    def _label_for(self, cyberbullying_prob, pred_class, triggers):
        """Map classifier output and keyword hits to ``(label, is_cyberbullying)``.

        Args:
            cyberbullying_prob: Probability assigned to the cyberbullying class.
            pred_class: Index of the highest-probability class.
            triggers: Trigger words found in the text (may be empty).

        Returns:
            ``("Cyberbullying", True)``, ``("Borderline", False)`` or
            ``("Safe", False)``.
        """
        if (
            triggers
            or pred_class == self._CYBERBULLYING_CLASS
            or cyberbullying_prob >= self.cyberbullying_threshold
        ):
            return "Cyberbullying", True
        if cyberbullying_prob >= self.borderline_threshold:
            return "Borderline", False
        return "Safe", False

    def predict(self, text):
        """Classify ``text`` and report the decision.

        Args:
            text: The string to classify.

        Returns:
            A dict with keys ``text``, ``label``, ``confidence`` (probability
            of the predicted class), ``language``, ``triggers`` and
            ``is_cyberbullying``.
        """
        triggers = self.find_triggers(text)

        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)

        # Single input, so take row 0 of the (batch, classes) probabilities.
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)[0]
        pred_class = torch.argmax(probs).item()
        confidence = probs[pred_class].item()

        # Threshold on the cyberbullying-class probability, not on the
        # predicted class's confidence: otherwise a confident "safe"
        # prediction would be labelled as cyberbullying.
        cyberbullying_prob = probs[self._CYBERBULLYING_CLASS].item()
        label, is_cyberbullying = self._label_for(cyberbullying_prob, pred_class, triggers)

        return {
            "text": text,
            "label": label,
            "confidence": confidence,
            "language": "hil",
            "triggers": triggers,
            "is_cyberbullying": is_cyberbullying
        }

# Module-level detector instance used by the /predict handler. It is built
# when this module is imported, so the tokenizer and model are loaded once
# up front instead of on each request.
detector = CyberbullyingDetector(model_type="bert")

@app.route('/')
def index():
    """Serve the landing page with a placeholder in the report slot."""
    placeholder_report = "Loading..."
    return render_template('index.html', classification_report=placeholder_report)

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the posted text and render the page with a report.

    Expects a JSON object body with a non-empty string ``text`` field.

    Returns:
        The rendered ``index.html`` with ``classification_report`` filled in,
        or a JSON ``{"error": ...}`` body with status 400 when the request
        carries no usable text.
    """
    # silent=True makes a missing, malformed or non-JSON body come back as
    # None instead of raising, so every kind of bad input takes the same
    # JSON error path below.
    data = request.get_json(silent=True)
    text = data.get('text', '') if isinstance(data, dict) else ''

    # Reject non-string and blank values: the detector calls str methods on
    # the text and has nothing to classify in an empty string.
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "No text provided"}), 400

    # Make prediction using the model
    result = detector.predict(text)

    # NOTE(review): the "true" label is derived from whether the input
    # literally contains the word "cyberbullying", and the report covers a
    # single sample -- confirm this is only a placeholder for real
    # ground-truth evaluation.
    true_labels = ["Cyberbullying" if "cyberbullying" in text else "Safe"]
    predicted_labels = [result['label']]
    report = classification_report(true_labels, predicted_labels, zero_division=0)

    # Render the template with the classification report
    return render_template('index.html', classification_report=report)

if __name__ == '__main__':
    import os

    # Debug mode enables the interactive debugger, which allows running
    # arbitrary Python code from the browser. Keep it off unless explicitly
    # requested through the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)