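"""Abuse Pattern Detector: a Gradio app that combines a fine-tuned sentiment model,
a multi-label RoBERTa abuse-pattern classifier, and phrase-based motif detection to
score a message for abusive patterns and danger signals."""
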
import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs
from abuse_type_mapping import determine_abuse_type

# Load the custom fine-tuned sentiment model (supportive vs. undermining)
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")

# Load abuse pattern model
model_name = "SamanthaStorm/abuse-pattern-detector-v2"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)

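# Classifier output labels, in the order the model emits them:
# indices 0-14 are abuse-pattern labels, indices 15-17 are danger labels.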
LABELS = [
    "gaslighting", "mockery", "dismissiveness", "control", "guilt_tripping", "apology_baiting", "blame_shifting", "projection",
    "contradictory_statements", "manipulation", "deflection", "insults", "obscure_formal", "recovery_phase", "non_abusive",
    "suicidal_threat", "physical_threat", "extreme_control"
]

THRESHOLDS = {
    "gaslighting": 0.25, "mockery": 0.15, "dismissiveness": 0.30, "control": 0.43, "guilt_tripping": 0.19,
    "apology_baiting": 0.45, "blame_shifting": 0.23, "projection": 0.50, "contradictory_statements": 0.25,
    "manipulation": 0.25, "deflection": 0.30, "insults": 0.34, "obscure_formal": 0.25, "recovery_phase": 0.25,
    "non_abusive": 2.0, "suicidal_threat": 0.45, "physical_threat": 0.02, "extreme_control": 0.36
}

PATTERN_LABELS = LABELS[:15]
DANGER_LABELS = LABELS[15:18]

EXPLANATIONS = {
    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
    "insults": "Insults are derogatory remarks aimed at degrading someone.",
    "apology_baiting": "Apology-baiting manipulates victims into apologizing for abuser's behavior.",
    "deflection": "Deflection avoids accountability by redirecting blame.",
    "control": "Control restricts autonomy through manipulation or coercion.",
    "extreme_control": "Extreme control dominates decisions and behaviors entirely.",
    "physical_threat": "Physical threats signal risk of bodily harm.",
    "suicidal_threat": "Suicidal threats manipulate others using self-harm threats.",
    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someone’s actions.",
    "manipulation": "Manipulation deceives to influence or control outcomes.",
    "non_abusive": "Non-abusive language is respectful and free of coercion.",
    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority."
}

def custom_sentiment(text):
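    """Classify a message as 'supportive' or 'undermining' with the fine-tuned sentiment model.

    Returns a dict with the predicted label and its softmax confidence score.
    """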
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        label_idx = torch.argmax(probs).item()

    label_map = {0: "supportive", 1: "undermining"}
    label = label_map[label_idx]
    score = probs[0][label_idx].item()
    return {"label": label, "score": score}

def calculate_abuse_level(scores, thresholds, motif_hits=None):
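    """Average all label scores that clear their thresholds and return a 0-100 score.

    Any phrase-matched high-risk motif (physical threat, suicidal threat, extreme
    control) floors the score at 75.0.
    """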
    triggered_scores = [
        score for label, score in zip(LABELS, scores) if score > thresholds[label]
    ]
    base_score = round(np.mean(triggered_scores) * 100, 2) if triggered_scores else 0.0

    motif_hits = motif_hits or []
    if any(label in motif_hits for label in {"physical_threat", "suicidal_threat", "extreme_control"}):
        base_score = max(base_score, 75.0)

    return base_score

def interpret_abuse_level(score):
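    """Map a 0-100 abuse score to a human-readable risk band."""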
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    return "Very Low / Likely Safe"

def analyze_messages(input_text, risk_flags):
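    """Run motif detection, sentiment analysis, and multi-label abuse-pattern scoring
    on a single message, then return a formatted report string for the Gradio UI.
    """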
    input_text = input_text.strip()
    if not input_text:
        return "Please enter a message for analysis."

    motif_flags, matched_phrases = detect_motifs(input_text)
    risk_flags = list(set(risk_flags + motif_flags)) if risk_flags else motif_flags

    sentiment = custom_sentiment(input_text)
    sentiment_label = sentiment['label']
    sentiment_score = sentiment['score']

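    # Lower every threshold by 20% when the sentiment model flags the message as undermining.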
    adjusted_thresholds = {k: v * 0.8 for k, v in THRESHOLDS.items()} if sentiment_label == "undermining" else THRESHOLDS.copy()

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

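    # Union of model-detected patterns (above threshold) and phrase-matched motif labels.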
    pattern_labels_used = list(set(
        [label for label, score in zip(PATTERN_LABELS, scores[:15]) if score > adjusted_thresholds[label]] +
        [label for label, _ in matched_phrases]
    ))

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits=[label for label, _ in matched_phrases])
    abuse_description = interpret_abuse_level(abuse_level)

    abuse_type, abuser_profile, advice = determine_abuse_type(pattern_labels_used)

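    # Count danger labels scoring above threshold; two or more self-reported
    # contextual flags count as one extra danger signal.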
    danger_flag_count = sum(score > adjusted_thresholds[label] for label, score in zip(DANGER_LABELS, scores[15:18]))
    contextual_flags = risk_flags if risk_flags else []
    if len(contextual_flags) >= 2:
        danger_flag_count += 1

    critical_flags = ["They've threatened harm", "They monitor/follow me", "I feel unsafe when alone with them"]
    high_risk_context = any(flag in contextual_flags for flag in critical_flags)

    non_abusive_score = scores[LABELS.index('non_abusive')]
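    # Note: sigmoid scores never exceed 1.0, so the 2.0 threshold (1.6 when adjusted)
    # prevents this check from passing on the model score alone.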
    non_abusive_confident = non_abusive_score > adjusted_thresholds['non_abusive']

    if non_abusive_confident and danger_flag_count == 0 and not matched_phrases:
        return "This message is classified as non-abusive."

    scored_patterns = [
        (label, score) for label, score in zip(PATTERN_LABELS, scores[:15]) if label != "non_abusive"
    ]

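    # Phrase-matched high-risk motifs override the model ranking for the displayed patterns.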
    override_labels = {"physical_threat", "suicidal_threat", "extreme_control"}
    override_matches = [label for label, _ in matched_phrases if label in override_labels]

    if override_matches:
        top_patterns = [(label, 1.0) for label in override_matches]
    else:
        top_patterns = sorted(scored_patterns, key=lambda x: x[1], reverse=True)[:2]

    top_pattern_explanations = "\n".join([
        f"• {label.replace('_', ' ').title()}: {EXPLANATIONS.get(label, 'No explanation available.')}"
        for label, _ in top_patterns
    ])

    resources = (
        "Immediate assistance recommended. Please seek professional help or contact emergency services."
        if danger_flag_count >= 2
        else "For more information on abuse patterns, consider reaching out to support groups or professional counselors."
    )

    result = f"Abuse Risk Score: {abuse_level}% – {abuse_description}\n\n"
    if abuse_level >= 15:
        result += f"Most Likely Patterns:\n{top_pattern_explanations}\n\n"
    result += f"⚠️ Critical Danger Flags Detected: {danger_flag_count} of 3\n"
    result += f"Resources: {resources}\n"
    result += f"🧠 Sentiment: {sentiment_label.title()} (Confidence: {sentiment_score*100:.2f}%)\n"

    if contextual_flags:
        result += "\n\n⚠️ You indicated the following:\n" + "\n".join([f"• {flag.replace('_', ' ').title()}" for flag in contextual_flags])

    if high_risk_context:
        result += "\n\n🚨 These responses suggest a high-risk situation. Consider seeking immediate help or safety planning resources."

    if matched_phrases:
        result += "\n\n🚨 Detected High-Risk Phrases:\n"
        for label, phrase in matched_phrases:
            phrase_clean = phrase.replace('"', "'").strip()
            result += f"• {label.replace('_', ' ').title()}: “{phrase_clean}”\n"

    if abuse_type:
        result += f"\n\n🧠 Likely Abuse Type: {abuse_type}"
        result += f"\n🧠 Abuser Profile: {abuser_profile}"
        result += f"\n📘 Safety Tip: {advice}"

    return result

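# Gradio UI: a free-text message box plus situational checkboxes, returning the formatted report.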
iface = gr.Interface(
    fn=analyze_messages,
    inputs=[
        gr.Textbox(lines=10, placeholder="Enter message here..."),
        gr.CheckboxGroup(label="Do any of these apply to your situation?", choices=[
            "They've threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
            "They monitor/follow me", "I feel unsafe when alone with them"
        ])
    ],
    outputs=[gr.Textbox(label="Analysis Result")],
    title="Abuse Pattern Detector",
    live=True
)

if __name__ == "__main__":
    iface.queue().launch()