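"""Abuse Pattern Detector (Multi-Message): a Gradio app.

Scores up to three messages with two Hugging Face models: a binary
sentiment classifier (supportive vs. undermining) and an 18-label
abuse-pattern classifier, combined with lexical motif matching, then
reports per-message results and a composite abuse score.
"""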
import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from motif_tagging import detect_motifs  # project-local module: lexical motif/phrase matching

# Load the sentiment model (binary: supportive vs. undermining)
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")

# Load the multi-label abuse-pattern classifier (18 labels, listed in LABELS below)
model_name = "SamanthaStorm/autotrain-c1un8-p8vzo"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Classifier output labels, in model output order. The first 15 are abuse
# patterns; the last 3 are acute danger signals.
LABELS = [
    "gaslighting", "mockery", "dismissiveness", "control", "guilt_tripping", "apology_baiting", "blame_shifting", "projection",
    "contradictory_statements", "manipulation", "deflection", "insults", "obscure_formal", "recovery_phase", "non_abusive",
    "suicidal_threat", "physical_threat", "extreme_control"
]

# Per-label decision thresholds applied to the sigmoid scores. "non_abusive"
# is set above 1.0, so it can never clear its threshold and never contributes
# to the abuse-level average.
THRESHOLDS = {
    "gaslighting": 0.25, "mockery": 0.15, "dismissiveness": 0.45, "control": 0.43, "guilt_tripping": 0.15,
    "apology_baiting": 0.2, "blame_shifting": 0.23, "projection": 0.50, "contradictory_statements": 0.25,
    "manipulation": 0.25, "deflection": 0.30, "insults": 0.34, "obscure_formal": 0.25, "recovery_phase": 0.25,
    "non_abusive": 2.0, "suicidal_threat": 0.45, "physical_threat": 0.02, "extreme_control": 0.30
}

# Plain-language explanations for each label (defined for display, but not
# currently referenced elsewhere in this script).
EXPLANATIONS = {
    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
    "blame_shifting": "Redirecting responsibility to the victim...",
    "projection": "Accusing the victim of behaviors the abuser exhibits...",
    "dismissiveness": "Belittling or disregarding someone's feelings...",
    "mockery": "Ridiculing someone in a hurtful, humiliating way...",
    "recovery_phase": "Dismissing someone's emotional healing...",
    "insults": "Derogatory remarks aimed at degrading someone...",
    "apology_baiting": "Manipulating victims into apologizing for abuse...",
    "deflection": "Redirecting blame to avoid accountability...",
    "control": "Restricting autonomy through manipulation...",
    "extreme_control": "Dominating decisions and behaviors entirely...",
    "physical_threat": "Signals risk of bodily harm...",
    "suicidal_threat": "Manipulates others using self-harm threats...",
    "guilt_tripping": "Uses guilt to manipulate someone's actions...",
    "manipulation": "Deceives to influence or control outcomes...",
    "non_abusive": "Respectful and free of coercion...",
    "obscure_formal": "Uses confusion/superiority to manipulate..."
}

DANGER_LABELS = LABELS[15:18]  # suicidal_threat, physical_threat, extreme_control
PATTERN_LABELS = LABELS[:15]   # everything up to and including non_abusive

# Multipliers applied when averaging label scores: high-risk patterns count
# more, "dismissiveness" counts less, and "non_abusive" is zeroed out.
PATTERN_WEIGHTS = {
    "physical_threat": 1.5, "suicidal_threat": 1.4, "extreme_control": 1.5,
    "gaslighting": 1.3, "control": 1.2, "dismissiveness": 0.8,
    "non_abusive": 0.0
}

def custom_sentiment(text):
    """Classify a message as "supportive" (index 0) or "undermining" (index 1)."""
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        label_idx = torch.argmax(probs).item()
    return {"label": "supportive" if label_idx == 0 else "undermining", "score": probs[0][label_idx].item()}

def calculate_abuse_level(scores, thresholds, motif_hits=None):
    """Average the weighted scores of labels that clear their thresholds, as a 0-100 value."""
    weighted_scores = [
        score * PATTERN_WEIGHTS.get(label, 1.0)
        for label, score in zip(LABELS, scores)
        if score > thresholds[label]
    ]
    base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
    # A lexical motif hit on any danger label enforces a floor of 75%.
    if any(label in (motif_hits or []) for label in DANGER_LABELS):
        base_score = max(base_score, 75.0)
    return base_score
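
# Worked example: if only "mockery" (0.50) and "insults" (0.60) clear their
# thresholds, both carry the default weight 1.0, so the result is
# mean([0.50, 0.60]) * 100 = 55.0.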

def interpret_abuse_level(score):
    if score > 80: return "Extreme / High Risk"
    if score > 60: return "Severe / Harmful Pattern Present"
    if score > 40: return "Likely Abuse"
    if score > 20: return "Mild Concern"
    return "Very Low / Likely Safe"

def analyze_single_message(text, thresholds, context_flags):
    # context_flags is accepted for interface parity but does not yet
    # influence scoring.
    _, matched_phrases = detect_motifs(text)
    motif_labels = [label for label, _ in matched_phrases]
    # An "undermining" sentiment reading lowers every threshold by 20%,
    # making the pattern detector more sensitive for hostile messages.
    sentiment = custom_sentiment(text)
    if sentiment["label"] == "undermining":
        thresholds = {k: v * 0.8 for k, v in thresholds.items()}
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        scores = torch.sigmoid(model(**inputs).logits.squeeze(0)).numpy()
    # Report the union of model-detected patterns and lexical motif hits.
    model_labels = [l for l, s in zip(PATTERN_LABELS, scores[:15]) if s > thresholds[l]]
    labels_used = sorted(set(model_labels + motif_labels))
    abuse_level = calculate_abuse_level(scores, thresholds, motif_hits=motif_labels)
    output = f"Score: {abuse_level}% – {interpret_abuse_level(abuse_level)}\nLabels: {', '.join(labels_used)}"
    return output, abuse_level

def analyze_composite(msg1, msg2, msg3, flags):
    # Analyze each non-empty message; blank slots still get an empty result
    # string so the return arity always matches the four output textboxes.
    thresholds = THRESHOLDS.copy()
    result_texts, levels = [], []
    for text in (msg1, msg2, msg3):
        if text and text.strip():
            output, level = analyze_single_message(text, thresholds, flags)
            result_texts.append(output)
            levels.append(level)
        else:
            result_texts.append("")
    composite_score = round(np.mean(levels), 2) if levels else 0.0
    return (*result_texts, f"Composite Abuse Score: {composite_score}%")
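
# Gradio passes each Textbox value as a string and the CheckboxGroup value as
# a list of the selected choice strings, e.g. ["They isolate me"].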

iface = gr.Interface(
    fn=analyze_composite,
    inputs=[
        gr.Textbox(lines=3, label="Message 1"),
        gr.Textbox(lines=3, label="Message 2"),
        gr.Textbox(lines=3, label="Message 3"),
        gr.CheckboxGroup(label="Contextual Flags", choices=[
            "They've threatened harm", "They isolate me", "I've changed my behavior out of fear",
            "They monitor/follow me", "I feel unsafe when alone with them"
        ])
    ],
    outputs=[
        gr.Textbox(label="Message 1 Result"),
        gr.Textbox(label="Message 2 Result"),
        gr.Textbox(label="Message 3 Result"),
        gr.Textbox(label="Composite Score")
    ],
    title="Abuse Pattern Detector (Multi-Message)",
    flagging_mode="manual"
)

if __name__ == "__main__":
    # Start the local Gradio server; pass share=True to launch() for a
    # temporary public link.
    iface.launch()