File size: 6,403 Bytes
d6e219c
f1948f2
 
e185e86
e032990
b54664e
0ff864f
ec5f81e
4dccd71
 
e185e86
e032990
dd2f06d
e032990
 
a9d4250
e185e86
 
ec5f81e
e185e86
 
 
 
 
ec5f81e
e185e86
 
 
 
e032990
 
 
 
 
 
 
 
 
ec5f81e
e032990
 
e185e86
 
 
ec5f81e
e185e86
23bb2d2
4472a1d
73582bd
 
b98a1ee
 
 
e032990
 
4472a1d
ec5f81e
38e8859
dcb0de6
ec5f81e
 
43095bd
23bb2d2
e032990
 
 
 
 
 
 
 
73582bd
43095bd
e032990
 
38e8859
e032990
a6c0cf2
2dda625
e032990
 
ec5f81e
 
 
e032990
ec5f81e
 
2dda625
e185e86
e032990
ec5f81e
 
 
e032990
ec5f81e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e032990
 
 
 
 
 
 
 
 
a28ef35
ab8c96f
a6c0cf2
ec5f81e
e032990
a6c0cf2
e032990
ab8c96f
4292d1b
2dda625
cbd8c88
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from collections import Counter

import gradio as gr
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer

from motif_tagging import detect_motifs

# Sentiment classifier (custom fine-tuned checkpoint); queried by custom_sentiment().
sentiment_model_name = "SamanthaStorm/tether-sentiment"
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)

# Abuse-pattern classifier; analyze_single_message() applies a sigmoid to its
# logits and pairs them positionally with LABELS.
model_name = "SamanthaStorm/autotrain-c1un8-p8vzo"
model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    trust_remote_code=True,
)
tokenizer = RobertaTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Pattern names for the abuse-pattern model's outputs. Order matters: the list
# is zipped positionally with the per-label scores elsewhere in this file.
LABELS = [
    "gaslighting",
    "mockery",
    "dismissiveness",
    "control",
    "guilt_tripping",
    "apology_baiting",
    "blame_shifting",
    "projection",
    "contradictory_statements",
    "manipulation",
    "deflection",
    "insults",
    "obscure_formal",
    "recovery_phase",
]

# Per-label cutoff: a pattern counts as detected only when its score is
# strictly greater than the value listed here.
THRESHOLDS = {
    "gaslighting": 0.25,
    "mockery": 0.15,
    "dismissiveness": 0.45,
    "control": 0.43,
    "guilt_tripping": 0.15,
    "apology_baiting": 0.2,
    "blame_shifting": 0.23,
    "projection": 0.50,
    "contradictory_statements": 0.25,
    "manipulation": 0.25,
    "deflection": 0.30,
    "insults": 0.34,
    "obscure_formal": 0.25,
    "recovery_phase": 0.25,
}

# Human-readable description for each pattern label; analyze_composite()
# appends the descriptions of the most frequent patterns to its result text.
EXPLANATIONS = {
    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
    "insults": "Insults are derogatory remarks aimed at degrading someone.",
    "apology_baiting": "Apology-baiting manipulates victims into apologizing for abuser's behavior.",
    "deflection": "Deflection avoids accountability by redirecting blame.",
    "control": "Control restricts autonomy through manipulation or coercion.",
    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someone’s actions.",
    "manipulation": "Manipulation deceives to influence or control outcomes.",
    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority.",
    # Previously missing: this label is scored by the model but had no
    # description, so it produced no explanation bullet when it ranked on top.
    "contradictory_statements": "Contradictory statements conflict with what was said before, creating confusion and self-doubt.",
}

# Multipliers applied to above-threshold scores in calculate_abuse_level();
# labels not listed here are weighted 1.0.
PATTERN_WEIGHTS = {
    "gaslighting": 1.3,
    "mockery": 1.2,
    "control": 1.2,
    "dismissiveness": 0.8,
}

def custom_sentiment(text):
    """Classify *text* with the fine-tuned sentiment model.

    Returns a dict with two keys: ``"label"`` ("supportive" for class index 0,
    "undermining" for class index 1) and ``"score"`` (the softmax probability
    of the chosen class).

    NOTE(review): the argmax runs over the whole probability tensor and the
    score is read from row 0, so this presumably expects a single message
    rather than a batch -- confirm before passing a list.
    """
    encoded = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = sentiment_model(**encoded).logits
        probs = torch.nn.functional.softmax(logits, dim=1)
        label_idx = torch.argmax(probs).item()
    class_names = {0: "supportive", 1: "undermining"}
    confidence = probs[0][label_idx].item()
    return {"label": class_names[label_idx], "score": confidence}

def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1.0):
    """Collapse per-label scores into a single 0-100 abuse level.

    Args:
        scores: Per-label scores, paired positionally with ``LABELS``.
        thresholds: Mapping of label -> cutoff; only scores strictly above
            their cutoff contribute.
        motif_hits: Accepted for interface compatibility; currently unused.
        flag_multiplier: Factor applied to the mean weighted score.

    Returns:
        The mean of the contributing scores (each multiplied by its
        ``PATTERN_WEIGHTS`` entry, default 1.0), expressed as a percentage,
        scaled by ``flag_multiplier``, rounded to two decimals and capped at
        100.0. Returns 0.0 when no score exceeds its threshold.
    """
    weighted_scores = [
        score * PATTERN_WEIGHTS.get(label, 1.0)
        for label, score in zip(LABELS, scores)
        if score > thresholds[label]
    ]
    if not weighted_scores:
        return 0.0
    # Scale first, then round: rounding before applying the multiplier left
    # the returned value with more than two decimals. float() also keeps the
    # result a builtin float instead of a NumPy scalar.
    scaled = float(np.mean(weighted_scores)) * 100 * flag_multiplier
    return min(round(scaled, 2), 100.0)

def interpret_abuse_level(score):
    """Map a numeric abuse score to its descriptive risk band.

    Each band applies when *score* is strictly greater than the band's floor;
    anything at or below 20 falls through to the lowest band.
    """
    bands = (
        (80, "Extreme / High Risk"),
        (60, "Severe / Harmful Pattern Present"),
        (40, "Likely Abuse"),
        (20, "Mild Concern"),
    )
    for floor, verdict in bands:
        if score > floor:
            return verdict
    return "Very Low / Likely Safe"

def analyze_single_message(text, thresholds, motif_flags):
    """Score one message against the abuse-pattern model.

    Args:
        text: The message to analyze.
        thresholds: Mapping of label -> cutoff. Every cutoff is scaled by 0.8
            when the sentiment model labels the message "undermining".
        motif_flags: Accepted but not used by this function.

    Returns:
        A tuple ``(abuse_level, pattern_labels_used, top_patterns)``:
        the value from ``calculate_abuse_level``; the de-duplicated labels
        that either exceeded their cutoff or came from ``detect_motifs`` (in
        no particular order); and the two highest-scoring ``(label, score)``
        pairs.
    """
    # presumably matched_phrases yields (label, phrase) pairs -- only the
    # first element of each pair is used here; verify against detect_motifs.
    motif_hits, matched_phrases = detect_motifs(text)

    mood = custom_sentiment(text)
    if mood['label'] == "undermining":
        adjusted_thresholds = {name: cutoff * 0.8 for name, cutoff in thresholds.items()}
    else:
        adjusted_thresholds = thresholds.copy()

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**encoded).logits
    # Independent sigmoid per label (not a softmax across labels).
    scores = torch.sigmoid(logits.squeeze(0)).numpy()

    scored_labels = list(zip(LABELS, scores))
    threshold_labels = [name for name, value in scored_labels if value > adjusted_thresholds[name]]
    phrase_labels = [name for name, _ in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)

    ranked = sorted(scored_labels, key=lambda pair: pair[1], reverse=True)
    return abuse_level, pattern_labels_used, ranked[:2]

def analyze_composite(msg1, msg2, msg3, flags):
    """Analyze up to three messages and summarize them as one result string.

    Args:
        msg1, msg2, msg3: Message texts; ``None``, empty and whitespace-only
            values are ignored.
        flags: The checked contextual flags (a list of strings), or ``None``.
            Each checked flag raises every message's score by 10%.

    Returns:
        A prompt to enter a message when all inputs are blank; otherwise a
        sentence naming the (up to) two most frequent patterns and the mean
        abuse score, followed by one bullet per available explanation.
    """
    # Tolerate None as well as blank strings so a missing input cannot crash.
    active_messages = [m for m in (msg1, msg2, msg3) if m and m.strip()]
    if not active_messages:
        return "Please enter at least one message."

    flag_multiplier = 1 + 0.1 * len(flags or [])  # each checked flag increases weight by 10%

    results = [analyze_single_message(m, THRESHOLDS, flags) for m in active_messages]

    # analyze_single_message does not apply the flags itself, so the
    # multiplier is applied here (it was previously computed but never used).
    # Each per-message score stays capped at 100.
    abuse_scores = [min(r[0] * flag_multiplier, 100.0) for r in results]
    composite_score = round(sum(abuse_scores) / len(abuse_scores), 2)

    # Count how many messages showed each pattern; most_common() breaks ties
    # by first appearance, which keeps the output deterministic.
    label_counts = Counter(label for r in results for label in r[1])
    top_labels = [label for label, _ in label_counts.most_common(2)]

    if top_labels:
        result = f"These messages show patterns of {', '.join(top_labels)} and are estimated to be {composite_score}% likely abusive."
    else:
        # Avoid the dangling "patterns of  and ..." wording when nothing matched.
        result = f"No specific abuse patterns were detected; these messages are estimated to be {composite_score}% likely abusive."

    for label in top_labels:
        expl = EXPLANATIONS.get(label, "")
        if expl:
            result += f"\n• {expl}"
    return result

# One free-text box per message slot; their values become msg1..msg3.
textbox_inputs = [gr.Textbox(label=f"Message {slot}") for slot in (1, 2, 3)]

# The checked entries are passed to analyze_composite() as its `flags` argument.
contextual_flag_choices = [
    "They've threatened harm",
    "They isolate me",
    "I’ve changed my behavior out of fear",
    "They monitor/follow me",
    "I feel unsafe when alone with them",
]
checkboxes = gr.CheckboxGroup(label="Contextual Flags", choices=contextual_flag_choices)

iface = gr.Interface(
    fn=analyze_composite,
    inputs=[*textbox_inputs, checkboxes],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector (Multi-Message)",
    allow_flagging="manual",
)

# Start the web UI only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()