import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs  # local helper module; returns (motif_hits, matched_phrases)
import re

# --- Sentiment Model: T5-based Emotion Classifier ---
# The T5 model generates an emotion word (e.g. "sadness"), which is mapped
# below to a binary supportive/undermining sentiment.
sentiment_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")
sentiment_model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-emotion")

EMOTION_TO_SENTIMENT = {
    "joy": "supportive",
    "love": "supportive",
    "surprise": "supportive",
    "neutral": "supportive",
    "sadness": "undermining",
    "anger": "undermining",
    "fear": "undermining",
    "disgust": "undermining",
    "shame": "undermining",
    "guilt": "undermining"
}

# --- Abuse Detection Model ---
model_name = "SamanthaStorm/autotrain-jlpi4-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Pattern labels, assumed to align with the classifier's output order
LABELS = [
    "blame shifting", "contradictory statements", "control", "dismissiveness",
    "gaslighting", "guilt tripping", "insults", "obscure language",
    "projection", "recovery phase", "threat"
]

# Per-label sigmoid cutoffs: a pattern counts as detected only when its score
# exceeds its threshold.
THRESHOLDS = {
    "blame shifting": 0.3,
    "contradictory statements": 0.32,
    "control": 0.48,
    "dismissiveness": 0.45,
    "gaslighting": 0.30,
    "guilt tripping": 0.20,
    "insults": 0.34,
    "obscure language": 0.25,
    "projection": 0.35,
    "recovery phase": 0.25,
    "threat": 0.25
}

# Score multipliers applied when averaging: >1 amplifies a pattern's weight,
# <1 dampens it; unlisted labels default to 1.0.
PATTERN_WEIGHTS = {
    "gaslighting": 1.3,
    "control": 1.2,
    "dismissiveness": 0.8,
    "blame shifting": 0.8,
    "contradictory statements": 0.75
}

EXPLANATIONS = {
    "blame shifting": "Blame-shifting is when one person redirects responsibility onto someone else to avoid accountability.",
    "contradictory statements": "Contradictory statements confuse the listener by flipping positions or denying previous claims.",
    "control": "Control restricts another person’s autonomy through coercion, manipulation, or threats.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings, needs, or opinions.",
    "gaslighting": "Gaslighting involves making someone question their own reality, memory, or perceptions.",
    "guilt tripping": "Guilt-tripping uses guilt to manipulate someone’s actions or decisions.",
    "insults": "Insults are derogatory or demeaning remarks meant to shame, belittle, or hurt someone.",
    "obscure language": "Obscure language manipulates through complexity, vagueness, or superiority to confuse the other person.",
    "projection": "Projection accuses someone else of the very behaviors or intentions the speaker is exhibiting.",
    "recovery phase": "Recovery phase statements attempt to soothe or reset tension without acknowledging harm or change.",
    "threat": "Threats use fear of harm (physical, emotional, or relational) to control or intimidate someone."
}

RISK_SNIPPETS = {
    "low": (
        "🟢 Risk Level: Low",
        "The language patterns here do not strongly indicate abuse.",
        "Continue to check in with yourself and notice how you feel in response to repeated patterns."
    ),
    "moderate": (
        "⚠️ Risk Level: Moderate to High",
        "This language includes control, guilt, or reversal tactics.",
        "These patterns often lead to emotional confusion and reduced self-trust. Document these messages or talk with someone safe."
    ),
    "high": (
        "🛑 Risk Level: High",
        "Language includes threats or coercive control, which are strong indicators of escalation.",
        "Consider creating a safety plan or contacting a support line. Trust your sense of unease."
    )
}

def generate_risk_snippet(abuse_score, top_label):
    if abuse_score >= 85:
        risk_level = "high"
    elif abuse_score >= 60:
        risk_level = "moderate"
    else:
        risk_level = "low"
    title, summary, advice = RISK_SNIPPETS[risk_level]
    return f"\n\n{title}\n{summary} (Pattern: {top_label})\n💡 {advice}"
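
# e.g. generate_risk_snippet(72.5, "control") selects the "moderate" snippet
# (60 <= score < 85) and appends the pattern name and advice to the report.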

# --- DARVO Detection ---
DARVO_PATTERNS = {
    "blame shifting", "projection", "dismissiveness", "guilt tripping", "contradictory statements"
}

DARVO_MOTIFS = [
    "i guess i’m the bad guy", "after everything i’ve done", "you always twist everything",
    "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
    "you’re attacking me", "i’m done trying", "i’m the only one who cares"
]

def detect_contradiction(message):
    contradiction_flag = False
    # Note: several patterns (like the motif lists above) use curly apostrophes (’),
    # so input typed with straight apostrophes (') will not match those variants.
    contradiction_phrases = [
        (r"\b(i love you).{0,15}(i hate you|you ruin everything)", re.IGNORECASE),
        (r"\b(i’m sorry).{0,15}(but you|if you hadn’t)", re.IGNORECASE),
        (r"\b(i’m trying).{0,15}(you never|why do you)", re.IGNORECASE),
        (r"\b(do what you want).{0,15}(you’ll regret it|i always give everything)", re.IGNORECASE),
        (r"\b(i don’t care).{0,15}(you never think of me)", re.IGNORECASE),
        (r"\b(i guess i’m just).{0,15}(the bad guy|worthless|never enough)", re.IGNORECASE),
    ]
    for pattern, flags in contradiction_phrases:
        if re.search(pattern, message, flags):
            contradiction_flag = True
            break
    return contradiction_flag
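
# Example: detect_contradiction("I love you. I hate you.") returns True,
# since "i hate you" follows "i love you" within 15 characters.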

def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    pattern_hits = len([p.lower() for p in patterns if p.lower() in DARVO_PATTERNS])
    pattern_score = pattern_hits / len(DARVO_PATTERNS)
    sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)
    motif_hits = len([m.lower() for m in motifs_found if m.lower() in DARVO_MOTIFS])
    motif_score = motif_hits / len(DARVO_MOTIFS)
    contradiction_score = 1.0 if contradiction_flag else 0.0
    darvo_score = (
        0.3 * pattern_score +
        0.3 * sentiment_shift_score +
        0.25 * motif_score +
        0.15 * contradiction_score
    )
    return round(min(darvo_score, 1.0), 3)
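
# Worked example (hypothetical values): patterns = ["blame shifting", "projection"]
# gives pattern_score = 2/5 = 0.4; a sentiment shift from 0.0 to 0.5 gives 0.5;
# with no motif hits and a contradiction flag:
#   0.3*0.4 + 0.3*0.5 + 0.25*0.0 + 0.15*1.0 = 0.42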

# --- Sentiment Mapping ---
def custom_sentiment(text):
    input_ids = sentiment_tokenizer(f"emotion: {text}", return_tensors="pt").input_ids
    with torch.no_grad():
        outputs = sentiment_model.generate(input_ids)
    emotion = sentiment_tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower()
    sentiment = EMOTION_TO_SENTIMENT.get(emotion, "undermining")
    return {"label": sentiment, "emotion": emotion}
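
# Example (actual output depends on the T5 model): custom_sentiment("you never
# listen to me") might return {"label": "undermining", "emotion": "anger"}.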

# --- Abuse Analysis Core ---
def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1.0):
    # Average the pattern-weighted scores of labels that clear their thresholds;
    # motif_hits is accepted but not currently used in the calculation.
    weighted_scores = [score * PATTERN_WEIGHTS.get(label, 1.0)
                       for label, score in zip(LABELS, scores) if score > thresholds[label]]
    base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
    base_score *= flag_multiplier
    return min(base_score, 100.0)
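
# Worked example (hypothetical scores): if only "gaslighting" (0.6 > 0.30) and
# "control" (0.5 > 0.48) clear their thresholds, the weighted values are
# 0.6*1.3 = 0.78 and 0.5*1.2 = 0.60, so base_score = mean(0.78, 0.60) * 100 = 69.0.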

def analyze_single_message(text, thresholds, motif_flags):
    # motif_flags (the UI checkbox list) is currently unused here; contextual
    # flags are weighted later in analyze_composite.
    motif_hits, matched_phrases = detect_motifs(text)
    sentiment = custom_sentiment(text)
    sentiment_score = 0.5 if sentiment["label"] == "undermining" else 0.0
    print(f"Detected emotion: {sentiment['emotion']} → sentiment: {sentiment['label']}")

    adjusted_thresholds = {
        k: v * 0.8 for k, v in thresholds.items()
    } if sentiment["label"] == "undermining" else thresholds.copy()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

    threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
    phrase_labels = [label for label, _ in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
    top_patterns = sorted([(label, score) for label, score in zip(LABELS, scores)],
                          key=lambda x: x[1], reverse=True)[:2]

    motif_phrases = [text for _, text in matched_phrases]
    contradiction_flag = detect_contradiction(text)
    darvo_score = calculate_darvo_score(pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag)

    return abuse_level, pattern_labels_used, top_patterns, darvo_score, sentiment
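
# The tuple returned above: (abuse_level 0-100, labels past threshold or
# phrase-matched, top-2 (label, score) pairs, darvo_score 0-1, sentiment dict).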

# --- Composite Message Analysis ---
def analyze_composite(msg1, msg2, msg3, flags):
    thresholds = THRESHOLDS.copy()
    messages = [msg1, msg2, msg3]
    active_messages = [m for m in messages if m.strip()]
    if not active_messages:
        return "Please enter at least one message."

    # First pass: sentiment only, so the balance of tones across messages can
    # adjust the detection thresholds before the abuse analysis runs.
    sentiment_labels = [custom_sentiment(m)["label"] for m in active_messages]
    undermining_count = sentiment_labels.count("undermining")
    supportive_count = sentiment_labels.count("supportive")

    if undermining_count > supportive_count:
        thresholds = {k: v * 0.9 for k, v in thresholds.items()}
    elif undermining_count and supportive_count:
        thresholds = {k: v * 0.95 for k, v in thresholds.items()}  # very subtle if mixed
        print("⚖️ Detected conflicting sentiment across messages.")

    # Second pass: full abuse analysis with the (possibly adjusted) thresholds.
    results = []
    for m in active_messages:
        result = analyze_single_message(m, thresholds, flags)
        print(f"Message: {m}")
        print(f"Sentiment result: {result[4]}")
        results.append(result)

    # Abuse scoring
    abuse_scores = [r[0] for r in results]
    darvo_scores = [r[3] for r in results]
    average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    base_score = sum(abuse_scores) / len(abuse_scores)

    # Dominant pattern: the label appearing most often among each message's top-2
    label_sets = [[label for label, _ in r[2]] for r in results]
    label_counts = {label: sum(label in s for s in label_sets) for label in set().union(*label_sets)}
    top_label = max(label_counts.items(), key=lambda x: x[1])
    top_explanation = EXPLANATIONS.get(top_label[0], "")

    flag_weights = {
        "They've threatened harm": 6,
        "They isolate me": 5,
        "I’ve changed my behavior out of fear": 4,
        "They monitor/follow me": 4,
        "I feel unsafe when alone with them": 6
    }
    flag_boost = sum(flag_weights.get(f, 3) for f in flags) / len(active_messages)
    composite_score = min(base_score + flag_boost, 100)
    # Dampen confidence when fewer than three messages are provided
    if len(active_messages) == 1:
        composite_score *= 0.85
    elif len(active_messages) == 2:
        composite_score *= 0.93
    composite_score = round(min(composite_score, 100), 2)

    result = f"These messages show a pattern of **{top_label[0]}** and are estimated to be {composite_score}% likely abusive."
    if top_explanation:
        result += f"\n• {top_explanation}"
    if average_darvo > 0.25:
        darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
        result += f"\n\nDARVO Score: {average_darvo} → This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
    result += generate_risk_snippet(composite_score, top_label[0])

    if undermining_count and supportive_count:
        result += "\n\n⚖️ These messages contain **conflicting emotional tones** — this may indicate mixed signals, ambivalence, or a push-pull dynamic. Use caution interpreting any one message alone."

    return result
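
# Example invocation outside the UI (hypothetical inputs):
#   analyze_composite("You always twist everything.", "", "", [])
# returns a formatted summary naming the dominant pattern, an estimated
# abuse-likelihood percentage, and a risk snippet.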

# --- Gradio Interface ---
textbox_inputs = [
    gr.Textbox(label="Message 1"),
    gr.Textbox(label="Message 2"),
    gr.Textbox(label="Message 3")
]

checkboxes = gr.CheckboxGroup(label="Contextual Flags", choices=[
    "They've threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
    "They monitor/follow me", "I feel unsafe when alone with them"
])

iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + [checkboxes],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector (Multi-Message)",
    allow_flagging="manual"  # renamed to flagging_mode in newer Gradio releases
)

if __name__ == "__main__":
    iface.launch()