SamanthaStorm commited on
Commit
e743e48
Β·
verified Β·
1 Parent(s): 161571e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +869 -0
app.py ADDED
@@ -0,0 +1,869 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ from transformers import pipeline, RobertaForSequenceClassification, RobertaTokenizer
5
+ from motif_tagging import detect_motifs
6
+ import re
7
+ import matplotlib.pyplot as plt
8
+ import io
9
+ from PIL import Image
10
+ from datetime import datetime
11
+ from transformers import pipeline as hf_pipeline # prevent name collision with gradio pipeline
12
+
13
def get_emotion_profile(text):
    """Classify *text* with the emotion pipeline and return {emotion: score}.

    Emotion names are lowercased and scores rounded to three decimal places.
    """
    results = emotion_pipeline(text)
    # A top_k=None pipeline wraps single-input output in an extra list layer
    if isinstance(results, list) and isinstance(results[0], list):
        results = results[0]
    profile = {}
    for entry in results:
        profile[entry["label"].lower()] = round(entry["score"], 3)
    return profile
# Emotion model (no retraining needed).
# top_k=None returns scores for *all* emotion labels instead of only the top
# one, which get_emotion_profile relies on; truncation guards against inputs
# longer than the model's maximum sequence length.
emotion_pipeline = hf_pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    truncation=True
)
# --- Timeline Visualization Function ---
def generate_abuse_score_chart(dates, scores, labels):
    """Plot abuse scores over time and return the chart as a PIL Image.

    Args:
        dates: list of strings; when every entry looks like YYYY-MM-DD the
            x-axis is date-based, otherwise messages are numbered 1..N.
        scores: list of numeric abuse scores (0-100), same length as dates.
        labels: unused here; kept so the caller's interface stays unchanged.

    Returns:
        PIL.Image.Image with the rendered chart.
    """
    import matplotlib.pyplot as plt
    import io
    from PIL import Image
    from datetime import datetime
    import re

    # Use real dates only when all entries parse as ISO-style dates.
    # NOTE(review): re.match only checks the prefix, so a string like
    # "2024-13-99 foo" passes the regex but makes strptime raise — confirm
    # upstream validation if arbitrary user input can reach here.
    if all(re.match(r"\d{4}-\d{2}-\d{2}", d) for d in dates):
        parsed_x = [datetime.strptime(d, "%Y-%m-%d") for d in dates]
        x_labels = [d.strftime("%Y-%m-%d") for d in parsed_x]
    else:
        parsed_x = list(range(1, len(dates) + 1))
        x_labels = [f"Message {i+1}" for i in range(len(dates))]

    fig, ax = plt.subplots(figsize=(8, 3))
    ax.plot(parsed_x, scores, marker='o', linestyle='-', color='darkred', linewidth=2)

    # Annotate each point with its percentage value just above the marker
    for x, y in zip(parsed_x, scores):
        ax.text(x, y + 2, f"{int(y)}%", ha='center', fontsize=8, color='black')

    ax.set_xticks(parsed_x)
    ax.set_xticklabels(x_labels)
    ax.set_xlabel("")  # No axis label
    ax.set_ylabel("Abuse Score (%)")
    ax.set_ylim(0, 105)
    ax.grid(True)
    plt.tight_layout()

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    # Bug fix: the figure was never closed, so repeated calls leaked
    # matplotlib figures (unbounded memory growth in a long-running app).
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
# --- Abuse Model ---
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Multilabel abuse-pattern classifier; one output logit per entry of LABELS.
model_name = "SamanthaStorm/tether-multilabel-v3"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# use_fast=False loads the slow (pure-Python) tokenizer for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# Label set the multilabel classifier was trained on.  Order matters: it must
# match the model's output logit order (scores are zipped against this list).
LABELS = [
    "recovery", "control", "gaslighting", "guilt tripping", "dismissiveness", "blame shifting",
    "nonabusive","projection", "insults", "contradictory statements", "obscure language"
]

# Per-label sigmoid-score cut-offs; a label counts as detected only when its
# score exceeds the threshold.  "nonabusive" is deliberately above 1.0 so it
# can never trigger (sigmoid output is bounded by 1).
THRESHOLDS = {
    "recovery": 0.55,
    "control": 0.25,
    "gaslighting": 0.50,
    "guilt tripping": .20,
    "dismissiveness": 0.15,
    "blame shifting": 0.30,
    "projection": 0.40,
    "insults": 0.30,
    "contradictory statements": 0.50,
    "obscure language": 0.40,
    "nonabusive": 1.5
}
# Multipliers applied when averaging label scores into the abuse score
# (see compute_abuse_score): >1 amplifies a pattern's contribution, <1
# dampens it.  "nonabusive" is near zero so it cannot raise the score.
PATTERN_WEIGHTS = {
    "recovery": 0.7,
    "control": 1.4,
    "gaslighting": 1.50,
    "guilt tripping": 1.2,
    "dismissiveness": 0.9,
    "blame shifting": 0.8,
    "projection": 0.5,
    "insults": 1.4,
    "contradictory statements": 1.0,
    "obscure language": 0.9,
    "nonabusive": 0.01
}

# Qualitative escalation risk per pattern, used for reporting.
# NOTE(review): this table uses the key "recovery phase" while LABELS and
# THRESHOLDS use "recovery" — confirm which spelling downstream lookups use.
ESCALATION_RISKS = {
    "blame shifting": "low",
    "contradictory statements": "moderate",
    "control": "high",
    "dismissiveness": "moderate",
    "gaslighting": "moderate",
    "guilt tripping": "moderate",
    "insults": "moderate",
    "obscure language": "low",
    "projection": "low",
    "recovery phase": "low"
}
# Human-readable description of each abuse-cycle stage returned by
# get_risk_stage (1=tension-building, 2=escalation, 3=reconciliation, 4=calm).
RISK_STAGE_LABELS = {
    1: "πŸŒ€ Risk Stage: Tension-Building\nThis message reflects rising emotional pressure or subtle control attempts.",
    2: "πŸ”₯ Risk Stage: Escalation\nThis message includes direct or aggressive patterns, suggesting active harm.",
    3: "🌧️ Risk Stage: Reconciliation\nThis message reflects a reset attemptβ€”apologies or emotional repair without accountability.",
    4: "🌸 Risk Stage: Calm / Honeymoon\nThis message appears supportive but may follow prior harm, minimizing it."
}
# Danger-assessment checklist shown in the UI: (question, weight) pairs.
# Weights of the questions answered "yes" are summed into the escalation score.
ESCALATION_QUESTIONS = [
    ("Partner has access to firearms or weapons", 4),
    ("Partner threatened to kill you", 3),
    ("Partner threatened you with a weapon", 3),
    ("Partner has ever choked you, even if you considered it consensual at the time", 4),
    ("Partner injured or threatened your pet(s)", 3),
    ("Partner has broken your things, punched or kicked walls, or thrown things ", 2),
    ("Partner forced or coerced you into unwanted sexual acts", 3),
    ("Partner threatened to take away your children", 2),
    ("Violence has increased in frequency or severity", 3),
    ("Partner monitors your calls/GPS/social media", 2)
]
# Pattern labels that count toward the DARVO score (Deny, Attack, Reverse
# Victim & Offender) in calculate_darvo_score.
DARVO_PATTERNS = [
    "blame shifting",   # "You're the reason this happens"
    "projection",       # "You're the abusive one"
    "deflection",       # "This isn't about that"
    "dismissiveness",   # "You're overreacting"
    "insults",          # Personal attacks that redirect attention
    "aggression",       # Escalates tone to destabilize
    "recovery phase",   # Sudden affection following aggression
    "contradictory statements"  # β€œI never said that” immediately followed by a version of what they said
]
# Canonical DARVO phrases.  calculate_darvo_score does a loose two-way
# substring match of detected motifs against this list, so exact punctuation
# (including curly apostrophes) matters — do not "normalize" these strings.
DARVO_MOTIFS = [
    "I never said that.", "You’re imagining things.", "That never happened.",
    "You’re making a big deal out of nothing.", "It was just a joke.", "You’re too sensitive.",
    "I don’t know what you’re talking about.", "You’re overreacting.", "I didn’t mean it that way.",
    "You’re twisting my words.", "You’re remembering it wrong.", "You’re always looking for something to complain about.",
    "You’re just trying to start a fight.", "I was only trying to help.", "You’re making things up.",
    "You’re blowing this out of proportion.", "You’re being paranoid.", "You’re too emotional.",
    "You’re always so dramatic.", "You’re just trying to make me look bad.",

    "You’re crazy.", "You’re the one with the problem.", "You’re always so negative.",
    "You’re just trying to control me.", "You’re the abusive one.", "You’re trying to ruin my life.",
    "You’re just jealous.", "You’re the one who needs help.", "You’re always playing the victim.",
    "You’re the one causing all the problems.", "You’re just trying to make me feel guilty.",
    "You’re the one who can’t let go of the past.", "You’re the one who’s always angry.",
    "You’re the one who’s always complaining.", "You’re the one who’s always starting arguments.",
    "You’re the one who’s always making things worse.", "You’re the one who’s always making me feel bad.",
    "You’re the one who’s always making me look like the bad guy.",
    "You’re the one who’s always making me feel like a failure.",
    "You’re the one who’s always making me feel like I’m not good enough.",

    "I can’t believe you’re doing this to me.", "You’re hurting me.",
    "You’re making me feel like a terrible person.", "You’re always blaming me for everything.",
    "You’re the one who’s abusive.", "You’re the one who’s controlling.", "You’re the one who’s manipulative.",
    "You’re the one who’s toxic.", "You’re the one who’s gaslighting me.",
    "You’re the one who’s always putting me down.", "You’re the one who’s always making me feel bad.",
    "You’re the one who’s always making me feel like I’m not good enough.",
    "You’re the one who’s always making me feel like I’m the problem.",
    "You’re the one who’s always making me feel like I’m the bad guy.",
    "You’re the one who’s always making me feel like I’m the villain.",
    "You’re the one who’s always making me feel like I’m the one who needs to change.",
    "You’re the one who’s always making me feel like I’m the one who’s wrong.",
    "You’re the one who’s always making me feel like I’m the one who’s crazy.",
    "You’re the one who’s always making me feel like I’m the one who’s abusive.",
    "You’re the one who’s always making me feel like I’m the one who’s toxic."
]
def get_emotional_tone_tag(emotions, sentiment, patterns, abuse_score):
    """Classify the emotional tone of a message into a named tag.

    Rules are checked in priority order; the first match wins.  Each rule
    combines raw emotion scores, detected abuse-pattern labels, and the
    overall sentiment classification.

    Args:
        emotions: dict of lowercase emotion name -> score (0..1), as
            produced by get_emotion_profile.
        sentiment: "undermining" or "supportive".
        patterns: list of detected abuse-pattern labels.
        abuse_score: overall abuse score (0..100); only rule 1 uses it.

    Returns:
        A tone-tag string, or None when no rule matches.
    """
    sadness = emotions.get("sadness", 0)
    joy = emotions.get("joy", 0)
    neutral = emotions.get("neutral", 0)
    disgust = emotions.get("disgust", 0)
    anger = emotions.get("anger", 0)
    fear = emotions.get("fear", 0)
    # (fix: a redundant duplicate `disgust = emotions.get("disgust", 0)` removed)

    # 1. Performative Regret
    if (
        sadness > 0.4 and
        any(p in patterns for p in ["blame shifting", "guilt tripping", "recovery phase"]) and
        (sentiment == "undermining" or abuse_score > 40)
    ):
        return "performative regret"

    # 2. Coercive Warmth
    if (
        (joy > 0.3 or sadness > 0.4) and
        any(p in patterns for p in ["control", "gaslighting"]) and
        sentiment == "undermining"
    ):
        return "coercive warmth"

    # 3. Cold Invalidation
    if (
        (neutral + disgust) > 0.5 and
        any(p in patterns for p in ["dismissiveness", "projection", "obscure language"]) and
        sentiment == "undermining"
    ):
        return "cold invalidation"

    # 4. Genuine Vulnerability
    # NOTE: the all(...) clause is vacuously true for an empty pattern list,
    # so this can fire with no detected patterns at all (sad + supportive).
    if (
        (sadness + fear) > 0.5 and
        sentiment == "supportive" and
        all(p in ["recovery phase"] for p in patterns)
    ):
        return "genuine vulnerability"

    # 5. Emotional Threat
    if (
        (anger + disgust) > 0.5 and
        any(p in patterns for p in ["control", "threat", "insults", "dismissiveness"]) and
        sentiment == "undermining"
    ):
        return "emotional threat"

    # 6. Weaponized Sadness
    if (
        sadness > 0.6 and
        any(p in patterns for p in ["guilt tripping", "projection"]) and
        sentiment == "undermining"
    ):
        return "weaponized sadness"

    # 7. Toxic Resignation
    if (
        neutral > 0.5 and
        any(p in patterns for p in ["dismissiveness", "obscure language"]) and
        sentiment == "undermining"
    ):
        return "toxic resignation"

    # 8. Aggressive Dismissal
    if (
        anger > 0.5 and
        any(p in patterns for p in ["aggression", "insults", "control"]) and
        sentiment == "undermining"
    ):
        return "aggressive dismissal"

    # 9. Deflective Hostility
    if (
        (0.2 < anger < 0.7 or 0.2 < disgust < 0.7) and
        any(p in patterns for p in ["deflection", "projection"]) and
        sentiment == "undermining"
    ):
        return "deflective hostility"

    # 10. Mocking Detachment
    if (
        (neutral + joy) > 0.5 and
        any(p in patterns for p in ["mockery", "insults", "projection"]) and
        sentiment == "undermining"
    ):
        return "mocking detachment"

    # 11. Contradictory Gaslight
    if (
        (joy + anger + sadness) > 0.5 and
        any(p in patterns for p in ["gaslighting", "contradictory statements"]) and
        sentiment == "undermining"
    ):
        return "contradictory gaslight"

    # 12. Calculated Neutrality
    if (
        neutral > 0.6 and
        any(p in patterns for p in ["obscure language", "deflection", "dismissiveness"]) and
        sentiment == "undermining"
    ):
        return "calculated neutrality"

    # 13. Forced Accountability Flip
    if (
        (anger + disgust) > 0.5 and
        any(p in patterns for p in ["blame shifting", "manipulation", "projection"]) and
        sentiment == "undermining"
    ):
        return "forced accountability flip"

    # 14. Conditional Affection
    if (
        joy > 0.4 and
        any(p in patterns for p in ["apology baiting", "control", "recovery phase"]) and
        sentiment == "undermining"
    ):
        return "conditional affection"

    # Second accountability-flip rule: same tag as #13, but also catches
    # "deflection" (which #13 does not include).
    if (
        (anger + disgust) > 0.5 and
        any(p in patterns for p in ["blame shifting", "projection", "deflection"]) and
        sentiment == "undermining"
    ):
        return "forced accountability flip"

    # Emotional Instability Fallback
    if (
        (anger + sadness + disgust) > 0.6 and
        sentiment == "undermining"
    ):
        return "emotional instability"

    return None
def detect_contradiction(message):
    """Return True when the message contains a known mixed-message contradiction.

    Each pattern pairs an affectionate/conciliatory opener with an undermining
    follow-up within 15 characters (e.g. "I love you ... I hate you").

    Fix: the original patterns only matched curly apostrophes (’), so
    hand-typed text using straight apostrophes ("I'm sorry") never matched.
    Both forms are now accepted via the [’'] character class; curly-quote
    input still matches exactly as before.
    """
    patterns = [
        (r"\b(i love you).{0,15}(i hate you|you ruin everything)", re.IGNORECASE),
        (r"\b(i[’']m sorry).{0,15}(but you|if you hadn[’']t)", re.IGNORECASE),
        (r"\b(i[’']m trying).{0,15}(you never|why do you)", re.IGNORECASE),
        (r"\b(do what you want).{0,15}(you[’']ll regret it|i always give everything)", re.IGNORECASE),
        (r"\b(i don[’']t care).{0,15}(you never think of me)", re.IGNORECASE),
        (r"\b(i guess i[’']m just).{0,15}(the bad guy|worthless|never enough)", re.IGNORECASE)
    ]
    return any(re.search(p, message, flags) for p, flags in patterns)
def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    """Estimate DARVO intensity (Deny, Attack, Reverse Victim & Offender), 0.0-1.0.

    Combines four weighted signals: the number of DARVO-associated patterns
    detected, the magnitude of sentiment worsening, the fraction of known
    DARVO motif phrases matched, and a binary contradiction flag.
    """
    # Detected patterns that belong to the DARVO-associated set
    darvo_pattern_hits = len([p for p in patterns if p.lower() in DARVO_PATTERNS])

    # Only a worsening sentiment shift contributes (negative deltas ignored)
    shift = sentiment_after - sentiment_before
    sentiment_component = shift if shift > 0.0 else 0.0

    # Loose two-way substring match of found motifs against the motif list;
    # each found motif counts at most once
    matched_motifs = 0
    for motif in motifs_found:
        motif_lc = motif.lower()
        for phrase in DARVO_MOTIFS:
            phrase_lc = phrase.lower()
            if phrase_lc in motif_lc or motif_lc in phrase_lc:
                matched_motifs += 1
                break
    motif_component = matched_motifs / max(len(DARVO_MOTIFS), 1)

    # Contradiction remains a binary signal
    contradiction_component = 1.0 if contradiction_flag else 0.0

    combined = (
        0.3 * darvo_pattern_hits
        + 0.3 * sentiment_component
        + 0.25 * motif_component
        + 0.15 * contradiction_component
    )
    return round(min(combined, 1.0), 3)
def detect_weapon_language(text):
    """Return True when the text mentions weapons or lethal violence.

    Case-insensitive substring search, so e.g. "loaded" also matches inside
    longer words — same behavior as the original keyword scan.
    """
    weapon_terms = (
        "knife", "knives", "stab", "cut you", "cutting",
        "gun", "shoot", "rifle", "firearm", "pistol",
        "bomb", "blow up", "grenade", "explode",
        "weapon", "armed", "loaded", "kill you", "take you out",
    )
    lowered = text.lower()
    for term in weapon_terms:
        if term in lowered:
            return True
    return False
def get_risk_stage(patterns, sentiment):
    """Map detected patterns (plus sentiment) onto the 4-stage abuse cycle.

    Stages: 1 = tension-building, 2 = escalation, 3 = reconciliation,
    4 = calm/honeymoon.  Defaults to stage 1 when nothing specific matches.
    """
    pattern_set = set(patterns)
    if pattern_set & {"threat", "insults"}:
        return 2
    if "recovery phase" in pattern_set:
        return 3
    if pattern_set & {"control", "guilt tripping"}:
        return 1
    if sentiment == "supportive" and pattern_set & {"projection", "dismissiveness"}:
        return 4
    return 1
def generate_risk_snippet(abuse_score, top_label, escalation_score, stage):
    """Build the user-facing risk summary text for a single message.

    Args:
        abuse_score: 0-100 abuse score for the message.
        top_label: either "pattern – score%" (en-dash separated) or a bare label.
        escalation_score: checklist-based escalation score.
        stage: abuse-cycle stage (1-4) from get_risk_stage.

    Returns:
        A multi-line markdown string with risk level, explanation, and pattern.

    Fix: the original contained ~30 lines of unreachable duplicated code after
    the first return (a second copy of the WHY_FLAGGED block); removed.
    """
    import re

    # Extract aggression percentage from labels like "aggression (45%)"
    if isinstance(top_label, str) and "aggression" in top_label.lower():
        try:
            match = re.search(r"\(?(\d+)\%?\)?", top_label)
            aggression_score = int(match.group(1)) / 100 if match else 0
        except (ValueError, AttributeError):  # narrowed from a bare `except:`
            aggression_score = 0
    else:
        aggression_score = 0

    # Revised risk logic
    if abuse_score >= 85 or escalation_score >= 16:
        risk_level = "high"
    elif abuse_score >= 60 or escalation_score >= 8 or aggression_score >= 0.25:
        risk_level = "moderate"
    elif stage == 2 and abuse_score >= 40:
        risk_level = "moderate"
    else:
        risk_level = "low"

    # maxsplit=1 guards against labels that themselves contain " – "
    if isinstance(top_label, str) and " – " in top_label:
        pattern_label, pattern_score = top_label.split(" – ", 1)
    else:
        pattern_label = str(top_label) if top_label is not None else "Unknown"
        pattern_score = ""

    WHY_FLAGGED = {
        "control": "This message may reflect efforts to restrict someone’s autonomy, even if it's framed as concern or care.",
        "gaslighting": "This message could be manipulating someone into questioning their perception or feelings.",
        "dismissiveness": "This message may include belittling, invalidating, or ignoring the other person’s experience.",
        "insults": "Direct insults often appear in escalating abusive dynamics and can erode emotional safety.",
        "threat": "This message includes threatening language, which is a strong predictor of harm.",
        "blame shifting": "This message may redirect responsibility to avoid accountability, especially during conflict.",
        "guilt tripping": "This message may induce guilt in order to control or manipulate behavior.",
        "recovery phase": "This message may be part of a tension-reset cycle, appearing kind but avoiding change.",
        "projection": "This message may involve attributing the abuser’s own behaviors to the victim.",
        "contradictory statements": "This message may contain internal contradictions used to confuse, destabilize, or deflect responsibility.",
        "obscure language": "This message may use overly formal, vague, or complex language to obscure meaning or avoid accountability.",
        "default": "This message contains language patterns that may affect safety, clarity, or emotional autonomy."
    }

    explanation = WHY_FLAGGED.get(pattern_label.lower(), WHY_FLAGGED["default"])

    base = f"\n\nπŸ›‘ Risk Level: {risk_level.capitalize()}\n"
    base += f"This message shows strong indicators of **{pattern_label}**. "

    if risk_level == "high":
        base += "The language may reflect patterns of emotional control, even when expressed in soft or caring terms.\n"
    elif risk_level == "moderate":
        base += "There are signs of emotional pressure or verbal aggression that may escalate if repeated.\n"
    else:
        base += "The message does not strongly indicate abuse, but it's important to monitor for patterns.\n"

    base += f"\nπŸ’‘ *Why this might be flagged:*\n{explanation}\n"
    base += f"\nDetected Pattern: **{pattern_label} ({pattern_score})**\n"
    base += "🧠 You can review the pattern in context. This tool highlights possible dynamicsβ€”not judgments."
    return base
def compute_abuse_score(matched_scores, sentiment):
    """Convert matched (label, score, weight) triples into a 0-100 abuse score.

    The score is a weighted mean of the model probabilities, escalated when
    multiple patterns co-occur, clamped upward by per-pattern minimum floors,
    and nudged for undermining sentiment.  Returns 0 for no matches.
    """
    if not matched_scores:
        return 0

    # Weighted mean of the label probabilities, expressed as a percentage
    weighted_sum = 0.0
    total_weight = 0.0
    for _, score, weight in matched_scores:
        weighted_sum += score * weight
        total_weight += weight
    base_score = weighted_sum / total_weight * 100

    # Co-occurring patterns escalate the score: 1.25x for 2, 1.5x for 3, ...
    multiplier = 1.0 + 0.25 * max(0, len(matched_scores) - 1)
    scaled = base_score * multiplier

    # Severe patterns impose a minimum score regardless of model confidence
    floors = {
        "threat": 70,
        "control": 40,
        "gaslighting": 30,
        "insults": 25,
        "aggression": 40,
    }
    floor = max(floors.get(label, 0) for label, _, _ in matched_scores)
    adjusted = scaled if scaled > floor else floor

    # Undermining tone nudges borderline-low scores upward
    if sentiment == "undermining" and adjusted < 50:
        adjusted += 10

    return min(adjusted, 100)
def analyze_single_message(text, thresholds):
    """Run the full single-message analysis pipeline.

    Args:
        text: the message to analyze.
        thresholds: per-label score cut-offs (typically a copy of THRESHOLDS).

    Returns:
        7-tuple: (abuse_score, threshold_labels, top_patterns,
                  {"label": sentiment}, stage, darvo_score, tone_tag)

    Fixes:
      * profanity override read emotion_profile["Anger"] but the profile keys
        are lowercased, so the override could never fire — now uses "anger";
      * the neutral-sentiment override indexed LABELS for "threat", which is
        not in LABELS and raised ValueError — unknown labels now skipped;
      * duplicated debug print block removed.
    """
    motif_hits, matched_phrases = detect_motifs(text)

    # Emotion profile; "sentiment" here is derived from anger + disgust mass
    emotion_profile = get_emotion_profile(text)
    sentiment_score = emotion_profile.get("anger", 0) + emotion_profile.get("disgust", 0)

    # Model scores: independent sigmoid per label (multilabel head)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

    # Sentiment override when the message reads neutral but critical patterns
    # clear their thresholds.  Only labels the model actually has are checked:
    # "threat" is absent from LABELS/THRESHOLDS and previously raised.
    critical = [
        l for l in ["control", "threat", "blame shifting"]
        if l in LABELS and l in thresholds
    ]
    if emotion_profile.get("neutral", 0) > 0.85 and any(
        scores[LABELS.index(l)] > thresholds[l] for l in critical
    ):
        sentiment = "undermining"
    else:
        sentiment = "undermining" if sentiment_score > 0.25 else "supportive"

    weapon_flag = detect_weapon_language(text)

    # Supportive-sounding messages need slightly stronger evidence to flag
    adjusted_thresholds = {
        k: v + 0.05 if sentiment == "supportive" else v
        for k, v in thresholds.items()
    }

    contradiction_flag = detect_contradiction(text)

    threshold_labels = [
        label for label, score in zip(LABELS, scores)
        if score > adjusted_thresholds[label]
    ]
    # Provisional tone tag (abuse score not known yet; recomputed below)
    tone_tag = get_emotional_tone_tag(emotion_profile, sentiment, threshold_labels, 0)
    motifs = [phrase for _, phrase in matched_phrases]

    darvo_score = calculate_darvo_score(
        threshold_labels,
        sentiment_before=0.0,
        sentiment_after=sentiment_score,
        motifs_found=motifs,
        contradiction_flag=contradiction_flag
    )

    top_patterns = sorted(
        [(label, score) for label, score in zip(LABELS, scores)],
        key=lambda x: x[1],
        reverse=True
    )[:2]
    # Post-threshold validation: strip recovery if it occurs with undermining sentiment
    if "recovery" in threshold_labels and tone_tag == "forced accountability flip":
        threshold_labels.remove("recovery")
        top_patterns = [p for p in top_patterns if p[0] != "recovery"]
        print("⚠️ Removing 'recovery' due to undermining sentiment (not genuine repair)")

    matched_scores = [
        (label, score, PATTERN_WEIGHTS.get(label, 1.0))
        for label, score in zip(LABELS, scores)
        if score > adjusted_thresholds[label]
    ]

    abuse_score_raw = compute_abuse_score(matched_scores, sentiment)

    # Risk stage logic
    stage = get_risk_stage(threshold_labels, sentiment) if threshold_labels else 1
    if weapon_flag and stage < 2:
        stage = 2
    if weapon_flag:
        abuse_score_raw = min(abuse_score_raw + 25, 100)

    # Cap at 95 unless a genuinely severe pattern was detected
    abuse_score = min(
        abuse_score_raw,
        100 if "threat" in threshold_labels or "control" in threshold_labels else 95
    )

    # Tag must happen after abuse score is finalized
    tone_tag = get_emotional_tone_tag(emotion_profile, sentiment, threshold_labels, abuse_score)

    # ---- Profanity + Anger Override Logic ----
    profane_words = {"fuck", "fucking", "bitch", "shit", "cunt", "ho", "asshole", "dick", "whore", "slut"}
    tokens = set(text.lower().split())
    has_profane = any(word in tokens for word in profane_words)

    # Bug fix: profile keys are lowercased by get_emotion_profile, so the old
    # .get("Anger", 0) lookup always returned 0 and disabled this override.
    anger_score = emotion_profile.get("anger", 0)
    short_text = len(tokens) <= 10
    insult_score = next((s for l, s in top_patterns if l == "insults"), 0)

    if has_profane and anger_score > 0.75 and short_text:
        print("⚠️ Profanity + Anger Override Triggered")
        if "insults" not in threshold_labels:
            threshold_labels.append("insults")
        # Force "insults" to the front of top_patterns (deduplicated)
        top_patterns = [("insults", insult_score)] + [p for p in top_patterns if p[0] != "insults"]

    # Debug output (previously printed the tone tag twice)
    print(f"Emotional Tone Tag: {tone_tag}")
    print("Emotion Profile:")
    for emotion, score in emotion_profile.items():
        print(f" {emotion.capitalize():10}: {score}")
    print("\n--- Debug Info ---")
    print(f"Text: {text}")
    print(f"Sentiment (via emotion): {sentiment} (score: {round(sentiment_score, 3)})")
    print("Abuse Pattern Scores:")
    for label, score in zip(LABELS, scores):
        passed = "βœ…" if score > adjusted_thresholds[label] else "❌"
        print(f" {label:25} β†’ {score:.3f} {passed}")
    print(f"Matched for score: {[(l, round(s, 3)) for l, s, _ in matched_scores]}")
    print(f"Abuse Score Raw: {round(abuse_score_raw, 1)}")
    print(f"Motifs: {motifs}")
    print(f"Contradiction: {contradiction_flag}")
    print("------------------\n")

    return abuse_score, threshold_labels, top_patterns, {"label": sentiment}, stage, darvo_score, tone_tag
+ def analyze_composite(msg1, date1, msg2, date2, msg3, date3, *answers_and_none):
607
+ none_selected_checked = answers_and_none[-1]
608
+ responses_checked = any(answers_and_none[:-1])
609
+ none_selected = not responses_checked and none_selected_checked
610
+
611
+ if none_selected:
612
+ escalation_score = None
613
+ risk_level = "unknown"
614
+ else:
615
+ escalation_score = sum(w for (_, w), a in zip(ESCALATION_QUESTIONS, answers_and_none[:-1]) if a)
616
+
617
+ messages = [msg1, msg2, msg3]
618
+ dates = [date1, date2, date3]
619
+ active = [(m, d) for m, d in zip(messages, dates) if m.strip()]
620
+ if not active:
621
+ return "Please enter at least one message."
622
+
623
+ # Run model on messages
624
+ results = [(analyze_single_message(m, THRESHOLDS.copy()), d) for m, d in active]
625
+ # --- Combined Abuse Escalation Scoring ---
626
+
627
+ # Extract predicted abuse labels from all messages
628
+ predicted_labels = [label for r in results for label, _ in r[0][2]]
629
+
630
+ # Categorize by severity
631
+ high = {'control'}
632
+ moderate = {
633
+ 'gaslighting', 'dismissiveness', 'obscure language',
634
+ 'insults', 'contradictory statements', 'guilt tripping'
635
+ }
636
+ low = {'blame shifting', 'projection', 'recovery phase'}
637
+
638
+ # Count severity types
639
+ counts = {'high': 0, 'moderate': 0, 'low': 0}
640
+ for label in predicted_labels:
641
+ if label in high:
642
+ counts['high'] += 1
643
+ elif label in moderate:
644
+ counts['moderate'] += 1
645
+ elif label in low:
646
+ counts['low'] += 1
647
+
648
+ # Derive abuse_risk from combinations
649
+ if counts['high'] >= 2 and counts['moderate'] >= 2:
650
+ abuse_risk = 'Critical'
651
+ elif (counts['high'] >= 2 and counts['moderate'] >= 1) or (counts['moderate'] >= 3) or (counts['high'] >= 1 and counts['moderate'] >= 2):
652
+ abuse_risk = 'High'
653
+ elif (counts['moderate'] == 2) or (counts['high'] == 1 and counts['moderate'] == 1) or (counts['moderate'] == 1 and counts['low'] >= 2) or (counts['high'] == 1 and sum(counts.values()) == 1):
654
+ abuse_risk = 'Moderate'
655
+ else:
656
+ abuse_risk = 'Low'
657
+
658
+ # Combine abuse_risk and checklist score into final risk_level
659
+ if escalation_score is not None:
660
+ if escalation_score >= 8 or abuse_risk == 'Critical':
661
+ risk_level = 'Critical'
662
+ elif escalation_score >= 5 or abuse_risk == 'High':
663
+ risk_level = 'High'
664
+ elif escalation_score >= 2 or abuse_risk == 'Moderate':
665
+ risk_level = 'Moderate'
666
+ else:
667
+ risk_level = 'Low'
668
+ abuse_scores = [r[0][0] for r in results]
669
+ top_labels = [r[0][1][0] if r[0][1] else r[0][2][0][0] for r in results]
670
+ top_scores = [r[0][2][0][1] for r in results]
671
+ sentiments = [r[0][3]['label'] for r in results]
672
+ stages = [r[0][4] for r in results]
673
+ darvo_scores = [r[0][5] for r in results]
674
+ tone_tags= [r[0][6] for r in results]
675
+ dates_used = [r[1] or "Undated" for r in results] # Store dates for future mapping
676
+ # Calculate escalation bump *after* model results exist
677
+ escalation_bump = 0
678
+ for result, _ in results:
679
+ abuse_score, threshold_labels, top_patterns, sentiment, stage, darvo_score, tone_tag = result
680
+ if darvo_score > 0.65:
681
+ escalation_bump += 3
682
+ if tone_tag in ["forced accountability flip", "emotional threat"]:
683
+ escalation_bump += 2
684
+ if abuse_score > 80:
685
+ escalation_bump += 2
686
+ if stage == 2:
687
+ escalation_bump += 3
688
+
689
+ # Now we can safely calculate hybrid_score
690
+ hybrid_score = escalation_score + escalation_bump if escalation_score is not None else 0
691
+ risk_level = (
692
+ "High" if hybrid_score >= 16 else
693
+ "Moderate" if hybrid_score >= 8 else
694
+ "Low"
695
+ )
696
+
697
+ # Now compute scores and allow override
698
+ abuse_scores = [r[0][0] for r in results]
699
+ stages = [r[0][4] for r in results]
700
+
701
+ # Post-check override (e.g. stage 2 or high abuse score forces Moderate risk)
702
+ if any(score > 70 for score in abuse_scores) or any(stage == 2 for stage in stages):
703
+ if risk_level == "Low":
704
+ risk_level = "Moderate"
705
+
706
+ for result, date in results:
707
+ assert len(result) == 7, "Unexpected output from analyze_single_message"
708
+ from collections import Counter
709
+
710
+ # --- Step 1: Pattern-based escalation scoring ---
711
+ flat_patterns = [label for r in results for (label, _) in r[0][2]]
712
+
713
+ PATTERN_RISKS = {
714
+ "blame shifting": "low",
715
+ "contradictory statements": "moderate",
716
+ "control": "high",
717
+ "dismissiveness": "moderate",
718
+ "gaslighting": "moderate",
719
+ "guilt tripping": "moderate",
720
+ "insults": "moderate",
721
+ "obscure language": "low",
722
+ "projection": "low",
723
+ " recovery phase": "low"
724
+ }
725
+
726
+ risk_counts = Counter(PATTERN_RISKS.get(p, "unknown") for p in flat_patterns)
727
+ num_critical = 0 # no "critical" tags defined in your label set
728
+ num_high = risk_counts["high"]
729
+ num_moderate = risk_counts["moderate"]
730
+ num_low = risk_counts["low"]
731
+
732
+ # Determine pattern-based escalation risk
733
+ pattern_escalation_risk = "Low"
734
+ if num_high >= 2 and num_moderate >= 2:
735
+ pattern_escalation_risk = "Critical"
736
+ elif num_high >= 2 and num_moderate >= 1:
737
+ pattern_escalation_risk = "High"
738
+ elif num_moderate >= 3:
739
+ pattern_escalation_risk = "High"
740
+ elif num_high == 1 and num_moderate >= 1:
741
+ pattern_escalation_risk = "Moderate"
742
+ elif num_moderate == 2:
743
+ pattern_escalation_risk = "Moderate"
744
+ elif num_moderate == 1 and num_low >= 2:
745
+ pattern_escalation_risk = "Moderate"
746
+ elif num_high == 1 and (num_high + num_moderate + num_low) == 1:
747
+ pattern_escalation_risk = "Moderate"
748
+
749
+ # --- Step 2: Checklist escalation logic ---
750
+ if none_selected:
751
+ escalation_score = None
752
+ checklist_escalation_risk = None
753
+ else:
754
+ escalation_score = sum(w for (_, w), a in zip(ESCALATION_QUESTIONS, answers_and_none[:-1]) if a)
755
+ checklist_escalation_risk = (
756
+ "Critical" if escalation_score >= 8 else
757
+ "High" if escalation_score >= 5 else
758
+ "Moderate" if escalation_score >= 2 else
759
+ "Low"
760
+ )
761
+
762
+ # --- Step 3: Escalation bump from DARVO, tone, abuse score, etc.
763
+ escalation_bump = 0
764
+ for result, _ in results:
765
+ abuse_score, _, _, sentiment, stage, darvo_score, tone_tag = result
766
+ if darvo_score > 0.65:
767
+ escalation_bump += 3
768
+ if tone_tag in ["forced accountability flip", "emotional threat"]:
769
+ escalation_bump += 2
770
+ if abuse_score > 80:
771
+ escalation_bump += 2
772
+ if stage == 2:
773
+ escalation_bump += 3
774
+
775
+ # --- Step 4: Final escalation risk level
776
+ def rank(risk_label):
777
+ return ["Low", "Moderate", "High", "Critical"].index(risk_label) if risk_label else 0
778
+
779
+ combined_score = rank(pattern_escalation_risk) + rank(checklist_escalation_risk) + escalation_bump
780
+
781
+ escalation_risk = (
782
+ "Critical" if combined_score >= 6 else
783
+ "High" if combined_score >= 4 else
784
+ "Moderate" if combined_score >= 2 else
785
+ "Low"
786
+ )
787
+ # --- Composite Abuse Score using compute_abuse_score ---
788
+ composite_abuse_scores = []
789
+
790
+ for result, _ in results:
791
+ _, _, top_patterns, sentiment, _, _, _ = result
792
+ matched_scores = [(label, score, PATTERN_WEIGHTS.get(label, 1.0)) for label, score in top_patterns]
793
+ final_score = compute_abuse_score(matched_scores, sentiment["label"])
794
+ composite_abuse_scores.append(final_score)
795
+
796
+ composite_abuse = int(round(sum(composite_abuse_scores) / len(composite_abuse_scores)))
797
+
798
+
799
+
800
+ most_common_stage = max(set(stages), key=stages.count)
801
+ stage_text = RISK_STAGE_LABELS[most_common_stage]
802
+
803
+ avg_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
804
+ darvo_blurb = ""
805
+ if avg_darvo > 0.25:
806
+ level = "moderate" if avg_darvo < 0.65 else "high"
807
+ darvo_blurb = f"\n\n🎭 **DARVO Score: {avg_darvo}** β†’ This indicates a **{level} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
808
+
809
+ out = f"Abuse Intensity: {composite_abuse}%\n"
810
+ out += "πŸ“Š This reflects the strength and severity of detected abuse patterns in the message(s).\n\n"
811
+
812
+ # Save this line for later use at the
813
+ if escalation_score is None:
814
+ escalation_text = "πŸ“‰ Escalation Potential: Unknown (Checklist not completed)\n"
815
+ escalation_text += "⚠️ *This section was not completed. Escalation potential is unknown.*\n"
816
+ hybrid_score = 0 # βœ… fallback so it's defined for generate_risk_snippet
817
+ else:
818
+ escalation_text = f"🧨 **Escalation Potential: {risk_level} ({escalation_score}/{sum(w for _, w in ESCALATION_QUESTIONS)})**\n"
819
+ escalation_text += "This score comes directly from the safety checklist and functions as a standalone escalation risk score.\n"
820
+ escalation_text += "It indicates how many serious risk factors are present based on your answers to the safety checklist.\n"
821
+ # Derive top_label from the strongest top_patterns across all messages
822
+ top_label = None
823
+ if results:
824
+ sorted_patterns = sorted(
825
+ [(label, score) for r in results for label, score in r[0][2]],
826
+ key=lambda x: x[1],
827
+ reverse=True
828
+ )
829
+ if sorted_patterns:
830
+ top_label = f"{sorted_patterns[0][0]} – {int(round(sorted_patterns[0][1] * 100))}%"
831
+ if top_label is None:
832
+ top_label = "Unknown – 0%"
833
+ out += generate_risk_snippet(composite_abuse, top_label, hybrid_score if escalation_score is not None else 0, most_common_stage)
834
+ out += f"\n\n{stage_text}"
835
+ out += darvo_blurb
836
+ out += "\n\n🎭 **Emotional Tones Detected:**\n"
837
+ for i, tone in enumerate(tone_tags):
838
+ label = tone if tone else "none"
839
+ out += f"β€’ Message {i+1}: *{label}*\n"
840
+ print(f"DEBUG: avg_darvo = {avg_darvo}")
841
+ pattern_labels = [r[0][2][0][0] for r in results] # top label for each message
842
+ timeline_image = generate_abuse_score_chart(dates_used, abuse_scores, pattern_labels)
843
+ out += "\n\n" + escalation_text
844
+ return out, timeline_image
845
+
846
# --- Gradio UI wiring -------------------------------------------------------
# Three (message, optional date) textbox pairs feed analyze_composite,
# followed by one checkbox per escalation-checklist question and a final
# "None of the above" opt-out box. Input order must match the parameter
# order analyze_composite expects.

message_date_pairs = []
for idx in range(3):
    msg_box = gr.Textbox(label=f"Message {idx+1}")
    date_box = gr.Textbox(label=f"Date {idx+1} (optional)", placeholder="YYYY-MM-DD")
    message_date_pairs.append((msg_box, date_box))

# Flatten to [msg1, date1, msg2, date2, msg3, date3].
textbox_inputs = [widget for pair in message_date_pairs for widget in pair]
quiz_boxes = [gr.Checkbox(label=question) for question, _ in ESCALATION_QUESTIONS]
none_box = gr.Checkbox(label="None of the above")

iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + quiz_boxes + [none_box],
    outputs=[
        gr.Textbox(label="Results"),
        gr.Image(label="Abuse Score Timeline", type="pil"),
    ],
    title="Abuse Pattern Detector + Escalation Quiz",
    allow_flagging="manual",
)

if __name__ == "__main__":
    iface.launch()