import gradio as gr
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import numpy as np
# Load model and tokenizer with trust_remote_code in case it's needed
model_name = "SamanthaStorm/abuse-pattern-detector-v2"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)
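# Note: this Space runs on ZeroGPU hardware, but inference below happens on
# CPU. If GPU execution were wanted, the usual ZeroGPU pattern (an assumption,
# not used here) is to `import spaces` and wrap the inference call in a
# function decorated with @spaces.GPU.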
# Define labels (17 total)
LABELS = [
    "gaslighting", "mockery", "dismissiveness", "control",
    "guilt_tripping", "apology_baiting", "blame_shifting", "projection",
    "contradictory_statements", "manipulation", "deflection", "insults",
    "obscure_formal", "recovery_phase", "suicidal_threat", "physical_threat",
    "extreme_control"
]
# Custom thresholds for each label (make sure these match your original settings)
THRESHOLDS = {
    "gaslighting": 0.15,
    "mockery": 0.15,
    "dismissiveness": 0.25,  # original value, not 0.30
    "control": 0.13,
    "guilt_tripping": 0.15,
    "apology_baiting": 0.15,
    "blame_shifting": 0.15,
    "projection": 0.20,
    "contradictory_statements": 0.15,
    "manipulation": 0.15,
    "deflection": 0.15,
    "insults": 0.20,
    "obscure_formal": 0.20,
    "recovery_phase": 0.15,
    "suicidal_threat": 0.08,
    "physical_threat": 0.045,
    "extreme_control": 0.30,
}
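# Note: the danger-cue thresholds ("suicidal_threat" at 0.08, "physical_threat"
# at 0.045) sit far below the pattern thresholds, so these rare but
# high-severity labels can trigger at much lower model confidence.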
# Define label groups using slicing (first 14: abuse patterns, last 3: danger cues)
PATTERN_LABELS = LABELS[:14]
DANGER_LABELS = LABELS[14:]
def calculate_abuse_level(scores, thresholds):
    triggered_scores = [score for label, score in zip(LABELS, scores) if score > thresholds[label]]
    if not triggered_scores:
        return 0.0
    return round(np.mean(triggered_scores) * 100, 2)
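# Worked example (hypothetical scores): if only two labels exceed their
# thresholds, say with scores 0.40 and 0.30, the abuse level is
# round(mean([0.40, 0.30]) * 100, 2) == 35.0.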
def interpret_abuse_level(score):
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    else:
        return "Very Low / Likely Safe"
def analyze_messages(input_text):
    input_text = input_text.strip()
    if not input_text:
        return "Please enter a message for analysis.", None
    # Tokenize input and generate model predictions
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Sigmoid (not softmax): this is multi-label classification, so each of
    # the 17 labels is scored independently in [0, 1].
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
    # Count the number of triggered abuse pattern and danger flags based on thresholds
    pattern_count = sum(score > THRESHOLDS[label] for label, score in zip(PATTERN_LABELS, scores[:14]))
    danger_flag_count = sum(score > THRESHOLDS[label] for label, score in zip(DANGER_LABELS, scores[14:]))
    # Calculate overall abuse level and interpret it
    abuse_level = calculate_abuse_level(scores, THRESHOLDS)
    abuse_description = interpret_abuse_level(abuse_level)
    # Resource logic based on the number of danger cues
    if danger_flag_count >= 2:
        resources = "Immediate assistance recommended. Please seek professional help or contact emergency services."
    else:
        resources = "For more information on abuse patterns, consider reaching out to support groups or professional counselors."
    # Prepare the result summary and detailed scores
    result = (
        f"Abuse Patterns Detected: {pattern_count} out of {len(PATTERN_LABELS)}\n"
        f"Danger Flags Detected: {danger_flag_count} out of {len(DANGER_LABELS)}\n"
        f"Abuse Level: {abuse_level}% - {abuse_description}\n"
        f"Resources: {resources}"
    )
    # Return the text summary and a dict of per-label scores; cast numpy
    # float32 values to built-in floats so they serialize cleanly as JSON.
    return result, {"scores": {label: float(score) for label, score in zip(LABELS, scores)}}
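# Quick local sanity check (hypothetical input string):
#   summary, payload = analyze_messages("You never listen; this is all your fault.")
#   print(summary)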
# Updated Gradio Interface using new component syntax
iface = gr.Interface(
    fn=analyze_messages,
    inputs=gr.Textbox(lines=10, placeholder="Enter message here..."),
    outputs=[
        gr.Textbox(label="Analysis Result"),
        gr.JSON(label="Scores")
    ],
    title="Abuse Pattern Detector"
)
if __name__ == "__main__":
    iface.launch()
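# To call the deployed Space from another script (a sketch; assumes the Space
# is public under this repo name and that gradio_client is installed):
#   from gradio_client import Client
#   client = Client("SamanthaStorm/abuse-pattern-detector-v2")
#   summary, scores = client.predict("example message", api_name="/predict")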