import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs

# Load custom fine-tuned sentiment model
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")

# Load abuse pattern model
model_name = "SamanthaStorm/autotrain-jlpi4-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)

LABELS = [
    "blame shifting", "contradictory statements", "control", "dismissiveness",
    "gaslighting", "guilt tripping", "insults", "obscure language",
    "projection", "recovery phase", "threat"
]

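# Per-label probability cutoffs: a pattern counts as detected when its sigmoid score exceeds its threshold.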
THRESHOLDS = {
    "blame shifting": 0.3,
    "contradictory statements": 0.32,
    "control": 0.40,
    "dismissiveness": 0.45,
    "gaslighting": 0.30,
    "guilt tripping": 0.20,
    "insults": 0.34,
    "obscure language": 0.25,
    "projection": 0.35,
    "recovery phase": 0.25,
    "threat": 0.25
}

PATTERN_LABELS = LABELS

EXPLANATIONS = {
    "blame shifting": "Blame-shifting is when one person redirects responsibility onto someone else to avoid accountability.",
    "contradictory statements": "Contradictory statements confuse the listener by flipping positions or denying previous claims.",
    "control": "Control restricts another person’s autonomy through coercion, manipulation, or threats.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings, needs, or opinions.",
    "gaslighting": "Gaslighting involves making someone question their own reality, memory, or perceptions.",
    "guilt tripping": "Guilt-tripping uses guilt to manipulate someone’s actions or decisions.",
    "insults": "Insults are derogatory or demeaning remarks meant to shame, belittle, or hurt someone.",
    "obscure language": "Obscure language manipulates through complexity, vagueness, or superiority to confuse the other person.",
    "projection": "Projection accuses someone else of the very behaviors or intentions the speaker is exhibiting.",
    "recovery phase": "Recovery phase statements attempt to soothe or reset tension without acknowledging harm or change.",
    "threat": "Threats use fear of harm (physical, emotional, or relational) to control or intimidate someone."
}

# Relative weights applied per label when computing the abuse level; keys must
# match LABELS exactly (spaces, not underscores) for the weight to take effect.
# "mockery" has no corresponding model label, so its weight currently never applies.
PATTERN_WEIGHTS = {
    "gaslighting": 1.3, "mockery": 1.2, "control": 1.2, "dismissiveness": 0.8,
    "blame shifting": 0.8, "contradictory statements": 0.75,
}

# --- DARVO Detection Tools ---
DARVO_PATTERNS = {
    "blame shifting", "projection", "mockery", "dismissiveness", "deflection", "guilt tripping"
}
DARVO_MOTIFS = [
    "i guess i’m the bad guy", "after everything i’ve done", "you always twist everything",
    "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
    "you’re attacking me", "i’m done trying", "i’m the only one who cares"
]

def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
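    """Estimate how strongly a set of messages fits a DARVO pattern
    (deny, attack, reverse victim and offender), returning a score in [0, 1].

    Combines four weighted signals: the share of DARVO-associated patterns
    detected, any increase from the before- to after-sentiment score, the share
    of known DARVO motif phrases matched, and an optional contradiction flag.
    """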
    pattern_hits = len([p.lower() for p in patterns if p.lower() in DARVO_PATTERNS])
    pattern_score = pattern_hits / len(DARVO_PATTERNS)

    sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)

    motif_hits = len([m.lower() for m in motifs_found if m.lower() in DARVO_MOTIFS])
    motif_score = motif_hits / len(DARVO_MOTIFS)

    contradiction_score = 1.0 if contradiction_flag else 0.0

    darvo_score = (
        0.3 * pattern_score +
        0.3 * sentiment_shift_score +
        0.2 * motif_score +
        0.2 * contradiction_score
    )
    return round(min(darvo_score, 1.0), 3)

def custom_sentiment(text):
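    """Classify a message as "supportive" or "undermining" with the fine-tuned
    sentiment model and return the winning label with its softmax probability."""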
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        label_idx = torch.argmax(probs).item()
    label_map = {0: "supportive", 1: "undermining"}
    return {"label": label_map[label_idx], "score": probs[0][label_idx].item()}

def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1.0):
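    """Average the pattern-weighted scores that clear their per-label thresholds,
    scale to 0-100, and apply flag_multiplier. Returns 0.0 when no label crosses
    its threshold. (motif_hits is accepted but not currently used.)"""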
    weighted_scores = [score * PATTERN_WEIGHTS.get(label, 1.0) for label, score in zip(LABELS, scores) if score > thresholds[label]]
    base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
    base_score *= flag_multiplier
    return min(base_score, 100.0)

def interpret_abuse_level(score):
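    """Map a 0-100 abuse score to a human-readable risk band."""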
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    return "Very Low / Likely Safe"

def analyze_single_message(text, thresholds, motif_flags):
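    """Run motif, sentiment, and abuse-pattern analysis on a single message.

    An "undermining" sentiment lowers every pattern threshold by 20% before scoring.
    Returns (abuse_level, pattern_labels_used, top_patterns, darvo_score).
    """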
    motif_hits, matched_phrases = detect_motifs(text)
    sentiment = custom_sentiment(text)
    sentiment_score = sentiment["score"] if sentiment["label"] == "undermining" else 0.0

    adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()} if sentiment['label'] == "undermining" else thresholds.copy()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

    threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
    phrase_labels = [label for label, _ in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)

    top_patterns = sorted([(label, score) for label, score in zip(LABELS, scores)], key=lambda x: x[1], reverse=True)[:2]

    motif_phrases = [text for _, text in matched_phrases]
    darvo_score = calculate_darvo_score(pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag=False)

    return abuse_level, pattern_labels_used, top_patterns, darvo_score

def analyze_composite(msg1, msg2, msg3, flags):
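    """Aggregate up to three messages plus contextual flags into a composite
    abuse estimate, listing the most frequent patterns, their explanations,
    and (when notable) the average DARVO score."""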
    thresholds = THRESHOLDS
    messages = [msg1, msg2, msg3]
    active_messages = [m for m in messages if m.strip()]
    if not active_messages:
        return "Please enter at least one message."

    results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
    abuse_scores = [r[0] for r in results]
    darvo_scores = [r[3] for r in results]
    average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    print(f"Average DARVO Score: {average_darvo}")

    base_score = sum(abuse_scores) / len(abuse_scores)
    label_sets = [[label for label, _ in r[2]] for r in results]
    label_counts = {label: sum(label in s for s in label_sets) for label in set().union(*label_sets)}
    top_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)[:2]
    top_explanations = [EXPLANATIONS.get(label, "") for label, _ in top_labels]

    # Adjust flag-based weight relative to number of messages
    danger_weight = 5  # note: currently unused
    flag_weights = {
        "They've threatened harm": 6,
        "They isolate me": 5,
        "I’ve changed my behavior out of fear": 4,
        "They monitor/follow me": 4,
        "I feel unsafe when alone with them": 6,
    }
    flag_boost = sum(flag_weights.get(f, 3) for f in flags) / len(active_messages)
    composite_score = min(base_score + flag_boost, 100)
    # Apply message count dampening AFTER base and flag boost
    if len(active_messages) == 1:
        composite_score *= 0.85  # 15% reduction for 1 message
    elif len(active_messages) == 2:
        composite_score *= 0.93  # 7% reduction for 2 messages

    composite_score = round(min(composite_score, 100), 2)  # re-cap just in case
    result = f"These messages show patterns of {', '.join(label for label, _ in top_labels)} and are estimated to be {composite_score}% likely abusive."

    # Include pattern explanations
    for expl in top_explanations:
        if expl:
            result += f"\n• {expl}"

    # Show DARVO score
    if average_darvo > 0.25:
        darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
        result += f"\n\nDARVO Score: {average_darvo} → This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."

    return result

textbox_inputs = [
    gr.Textbox(label="Message 1"),
    gr.Textbox(label="Message 2"),
    gr.Textbox(label="Message 3")
]

checkboxes = gr.CheckboxGroup(label="Contextual Flags", choices=[
    "They've threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
    "They monitor/follow me", "I feel unsafe when alone with them"
])

iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + [checkboxes],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector (Multi-Message)",
    allow_flagging="manual"
)

if __name__ == "__main__":
    iface.launch()
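
# A minimal sketch of calling the pipeline directly (hypothetical inputs), e.g. for
# quick local testing without the Gradio UI; flag strings must match the
# CheckboxGroup choices defined above:
#
#   summary = analyze_composite(
#       "You always twist everything I say.",
#       "I guess I’m just the bad guy then.",
#       "",
#       ["They monitor/follow me"],
#   )
#   print(summary)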