SamanthaStorm committed on
Commit
e179789
·
verified ·
1 Parent(s): 4ed85b7

Upload app (23).py

Files changed (1)
  1. app (23).py +816 -0
app (23).py ADDED
@@ -0,0 +1,816 @@
import gradio as gr
import spaces
import torch
import numpy as np
import re
import matplotlib.pyplot as plt
import io
from PIL import Image
from datetime import datetime
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from motif_tagging import detect_motifs
from torch.nn.functional import sigmoid

# ----- Models -----

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Emotion model (CPU for stability)
emotion_pipeline = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=6,
    truncation=True,
    device=-1  # force CPU usage
)

# Abuse model
model_name = "SamanthaStorm/tether-multilabel-v4"  # or your HF Hub path
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model.to(device)

# DARVO model
darvo_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-darvo-regressor-v1")
darvo_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-darvo-regressor-v1", use_fast=False)
darvo_model.eval()
darvo_model.to(device)

def get_emotion_profile(text):
    emotions = emotion_pipeline(text)
    if isinstance(emotions, list) and isinstance(emotions[0], list):
        emotions = emotions[0]
    return {e['label'].lower(): round(e['score'], 3) for e in emotions}

# --- Timeline Visualization Function ---
def generate_abuse_score_chart(dates, scores, labels):
    # Use a real time axis when every entry parses as YYYY-MM-DD; otherwise fall back to message indices
    if all(re.match(r"\d{4}-\d{2}-\d{2}", d) for d in dates):
        parsed_x = [datetime.strptime(d, "%Y-%m-%d") for d in dates]
        x_labels = [d.strftime("%Y-%m-%d") for d in parsed_x]
    else:
        parsed_x = list(range(1, len(dates) + 1))
        x_labels = [f"Message {i+1}" for i in range(len(dates))]

    fig, ax = plt.subplots(figsize=(8, 3))
    ax.plot(parsed_x, scores, marker='o', linestyle='-', color='darkred', linewidth=2)

    for x, y in zip(parsed_x, scores):
        ax.text(x, y + 2, f"{int(y)}%", ha='center', fontsize=8, color='black')

    ax.set_xticks(parsed_x)
    ax.set_xticklabels(x_labels)
    ax.set_xlabel("")  # no axis label
    ax.set_ylabel("Abuse Score (%)")
    ax.set_ylim(0, 105)
    ax.grid(True)
    plt.tight_layout()

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close(fig)  # free the figure so repeated calls don't leak memory
    buf.seek(0)
    return Image.open(buf)
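
# Usage sketch (illustrative values, not from a real run): ISO dates plot on a true
# time axis, anything else falls back to "Message N" ticks.
#   generate_abuse_score_chart(["2024-01-01", "2024-01-05"], [40, 70], ["control", "insults"])
#   -> PIL.Image of the line chart, with each point annotated as "40%" / "70%".
# Note: `labels` is accepted but not currently drawn on the chart.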


LABELS = [
    "recovery", "control", "gaslighting", "guilt tripping", "dismissiveness", "blame shifting",
    "nonabusive", "projection", "insults", "contradictory statements", "obscure language"
]

THRESHOLDS = {
    "recovery": 0.27,
    "control": 0.47,
    "gaslighting": 0.48,
    "guilt tripping": 0.56,
    "dismissiveness": 0.25,
    "blame shifting": 0.55,
    "projection": 0.59,
    "insults": 0.33,
    "contradictory statements": 0.27,
    "obscure language": 0.65,
    "nonabusive": 1.0
}
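
# Each value above is the per-label cutoff applied to the sigmoid outputs of the
# multilabel model: a label only counts as "matched" when its score clears its cutoff
# (analyze_single_message adds 0.05 to every cutoff when sentiment reads as supportive).
# "nonabusive" is set to 1.0 so it can never match on its own.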

PATTERN_WEIGHTS = {
    "recovery": 0.5,
    "control": 1.4,
    "gaslighting": 1.0,
    "guilt tripping": 0.9,
    "dismissiveness": 0.9,
    "blame shifting": 0.8,
    "projection": 0.5,
    "insults": 1.2,
    "contradictory statements": 1.0,
    "obscure language": 0.9,
    "nonabusive": 0.0
}

ESCALATION_RISKS = {
    "blame shifting": "low",
    "contradictory statements": "moderate",
    "control": "high",
    "dismissiveness": "moderate",
    "gaslighting": "moderate",
    "guilt tripping": "moderate",
    "insults": "moderate",
    "obscure language": "low",
    "projection": "low",
    "recovery phase": "low"
}

RISK_STAGE_LABELS = {
    1: "🌀 Risk Stage: Tension-Building\nThis message reflects rising emotional pressure or subtle control attempts.",
    2: "🔥 Risk Stage: Escalation\nThis message includes direct or aggressive patterns, suggesting active harm.",
    3: "🌧️ Risk Stage: Reconciliation\nThis message reflects a reset attempt—apologies or emotional repair without accountability.",
    4: "🌸 Risk Stage: Calm / Honeymoon\nThis message appears supportive but may follow prior harm, minimizing it."
}

ESCALATION_QUESTIONS = [
    ("Partner has access to firearms or weapons", 4),
    ("Partner threatened to kill you", 3),
    ("Partner threatened you with a weapon", 3),
    ("Partner has ever choked you, even if you considered it consensual at the time", 4),
    ("Partner injured or threatened your pet(s)", 3),
    ("Partner has broken your things, punched or kicked walls, or thrown things", 2),
    ("Partner forced or coerced you into unwanted sexual acts", 3),
    ("Partner threatened to take away your children", 2),
    ("Violence has increased in frequency or severity", 3),
    ("Partner monitors your calls/GPS/social media", 2)
]

def get_emotional_tone_tag(emotions, sentiment, patterns, abuse_score):
    sadness = emotions.get("sadness", 0)
    joy = emotions.get("joy", 0)
    neutral = emotions.get("neutral", 0)
    disgust = emotions.get("disgust", 0)
    anger = emotions.get("anger", 0)
    fear = emotions.get("fear", 0)

    # 1. Performative Regret
    if (
        sadness > 0.4 and
        any(p in patterns for p in ["blame shifting", "guilt tripping", "recovery phase"]) and
        (sentiment == "undermining" or abuse_score > 40)
    ):
        return "performative regret"

    # 2. Coercive Warmth
    if (
        (joy > 0.3 or sadness > 0.4) and
        any(p in patterns for p in ["control", "gaslighting"]) and
        sentiment == "undermining"
    ):
        return "coercive warmth"

    # 3. Cold Invalidation
    if (
        (neutral + disgust) > 0.5 and
        any(p in patterns for p in ["dismissiveness", "projection", "obscure language"]) and
        sentiment == "undermining"
    ):
        return "cold invalidation"

    # 4. Genuine Vulnerability
    if (
        (sadness + fear) > 0.5 and
        sentiment == "supportive" and
        all(p == "recovery phase" for p in patterns)
    ):
        return "genuine vulnerability"

    # 5. Emotional Threat
    if (
        (anger + disgust) > 0.5 and
        any(p in patterns for p in ["control", "insults", "dismissiveness"]) and
        sentiment == "undermining"
    ):
        return "emotional threat"

    # 6. Weaponized Sadness
    if (
        sadness > 0.6 and
        any(p in patterns for p in ["guilt tripping", "projection"]) and
        sentiment == "undermining"
    ):
        return "weaponized sadness"

    # 7. Toxic Resignation
    if (
        neutral > 0.5 and
        any(p in patterns for p in ["dismissiveness", "obscure language"]) and
        sentiment == "undermining"
    ):
        return "toxic resignation"

    # 8. Aggressive Dismissal
    if (
        anger > 0.5 and
        any(p in patterns for p in ["aggression", "insults", "control"]) and
        sentiment == "undermining"
    ):
        return "aggressive dismissal"

    # 9. Deflective Hostility
    if (
        (0.2 < anger < 0.7 or 0.2 < disgust < 0.7) and
        any(p in patterns for p in ["deflection", "projection"]) and
        sentiment == "undermining"
    ):
        return "deflective hostility"

    # 10. Mocking Detachment
    if (
        (neutral + joy) > 0.5 and
        any(p in patterns for p in ["mockery", "insults", "projection"]) and
        sentiment == "undermining"
    ):
        return "mocking detachment"

    # 11. Contradictory Gaslight
    if (
        (joy + anger + sadness) > 0.5 and
        any(p in patterns for p in ["gaslighting", "contradictory statements"]) and
        sentiment == "undermining"
    ):
        return "contradictory gaslight"

    # 12. Calculated Neutrality
    if (
        neutral > 0.6 and
        any(p in patterns for p in ["obscure language", "deflection", "dismissiveness"]) and
        sentiment == "undermining"
    ):
        return "calculated neutrality"

    # 13. Forced Accountability Flip
    if (
        (anger + disgust) > 0.5 and
        any(p in patterns for p in ["blame shifting", "manipulation", "projection"]) and
        sentiment == "undermining"
    ):
        return "forced accountability flip"

    # 14. Conditional Affection
    if (
        joy > 0.4 and
        any(p in patterns for p in ["apology baiting", "control", "recovery phase"]) and
        sentiment == "undermining"
    ):
        return "conditional affection"

    # Forced Accountability Flip (deflection variant)
    if (
        (anger + disgust) > 0.5 and
        any(p in patterns for p in ["blame shifting", "projection", "deflection"]) and
        sentiment == "undermining"
    ):
        return "forced accountability flip"

    # Emotional instability fallback
    if (
        (anger + sadness + disgust) > 0.6 and
        sentiment == "undermining"
    ):
        return "emotional instability"

    return None

# 🔄 DARVO score model (regression-based); loaded once at the top of the file
def predict_darvo_score(text):
    inputs = darvo_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = darvo_model(**inputs).logits
    score = sigmoid(logits).item()
    return round(score, 4)  # rounded for display/output
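
# Behavior sketch (illustrative; the exact value depends on the trained weights):
# the regressor has a single logit head, so sigmoid() maps it to a DARVO likelihood
# in [0, 1], e.g.
#   predict_darvo_score("I never did that. You're the one attacking ME.")  # -> hypothetically ~0.8
# Only the [0, 1] range is guaranteed by the sigmoid.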

def detect_weapon_language(text):
    weapon_keywords = [
        "knife", "knives", "stab", "cut you", "cutting",
        "gun", "shoot", "rifle", "firearm", "pistol",
        "bomb", "blow up", "grenade", "explode",
        "weapon", "armed", "loaded", "kill you", "take you out"
    ]
    text_lower = text.lower()
    return any(word in text_lower for word in weapon_keywords)
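
# Simple substring matching, so e.g. detect_weapon_language("If you leave I'll SHOOT")
# returns True ("shoot" after lowercasing), while misspellings or euphemisms are missed.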

def get_risk_stage(patterns, sentiment):
    if "insults" in patterns:
        return 2
    elif "recovery phase" in patterns:
        return 3
    elif "control" in patterns or "guilt tripping" in patterns:
        return 1
    elif sentiment == "supportive" and any(p in patterns for p in ["projection", "dismissiveness"]):
        return 4
    return 1

def generate_risk_snippet(abuse_score, top_label, escalation_score, stage):
    # Extract aggression score if aggression is detected
    if isinstance(top_label, str) and "aggression" in top_label.lower():
        try:
            match = re.search(r"\(?(\d+)\%?\)?", top_label)
            aggression_score = int(match.group(1)) / 100 if match else 0
        except (AttributeError, ValueError):
            aggression_score = 0
    else:
        aggression_score = 0

    # Revised risk logic
    if abuse_score >= 85 or escalation_score >= 16:
        risk_level = "high"
    elif abuse_score >= 60 or escalation_score >= 8 or aggression_score >= 0.25:
        risk_level = "moderate"
    elif stage == 2 and abuse_score >= 40:
        risk_level = "moderate"
    else:
        risk_level = "low"

    if isinstance(top_label, str) and " – " in top_label:
        pattern_label, pattern_score = top_label.split(" – ")
    else:
        pattern_label = str(top_label) if top_label is not None else "Unknown"
        pattern_score = ""

    WHY_FLAGGED = {
        "control": "This message may reflect efforts to restrict someone’s autonomy, even if it's framed as concern or care.",
        "gaslighting": "This message could be manipulating someone into questioning their perception or feelings.",
        "dismissiveness": "This message may include belittling, invalidating, or ignoring the other person’s experience.",
        "insults": "Direct insults often appear in escalating abusive dynamics and can erode emotional safety.",
        "blame shifting": "This message may redirect responsibility to avoid accountability, especially during conflict.",
        "guilt tripping": "This message may induce guilt in order to control or manipulate behavior.",
        "recovery phase": "This message may be part of a tension-reset cycle, appearing kind but avoiding change.",
        "projection": "This message may involve attributing the abuser’s own behaviors to the victim.",
        "contradictory statements": "This message may contain internal contradictions used to confuse, destabilize, or deflect responsibility.",
        "obscure language": "This message may use overly formal, vague, or complex language to obscure meaning or avoid accountability.",
        "default": "This message contains language patterns that may affect safety, clarity, or emotional autonomy."
    }

    explanation = WHY_FLAGGED.get(pattern_label.lower(), WHY_FLAGGED["default"])

    base = f"\n\n🛑 Risk Level: {risk_level.capitalize()}\n"
    base += f"This message shows strong indicators of **{pattern_label}**. "

    if risk_level == "high":
        base += "The language may reflect patterns of emotional control, even when expressed in soft or caring terms.\n"
    elif risk_level == "moderate":
        base += "There are signs of emotional pressure or verbal aggression that may escalate if repeated.\n"
    else:
        base += "The message does not strongly indicate abuse, but it's important to monitor for patterns.\n"

    base += f"\n💡 *Why this might be flagged:*\n{explanation}\n"
    base += f"\nDetected Pattern: **{pattern_label} ({pattern_score})**\n"
    base += "🧠 You can review the pattern in context. This tool highlights possible dynamics—not judgments."
    return base

# --- Detect Immediate Danger Threats ---
THREAT_MOTIFS = [
    "i'll kill you", "i’m going to hurt you", "you’re dead", "you won't survive this",
    "i’ll break your face", "i'll bash your head in", "i’ll snap your neck",
    "i’ll come over there and make you shut up", "i'll knock your teeth out",
    "you’re going to bleed", "you want me to hit you?", "i won’t hold back next time",
    "i swear to god i’ll beat you", "next time, i won’t miss", "i’ll make you scream",
    "i know where you live", "i'm outside", "i’ll be waiting", "i saw you with him",
    "you can’t hide from me", "i’m coming to get you", "i'll find you", "i know your schedule",
    "i watched you leave", "i followed you home", "you'll regret this", "you’ll be sorry",
    "you’re going to wish you hadn’t", "you brought this on yourself", "don’t push me",
    "you have no idea what i’m capable of", "you better watch yourself",
    "i don’t care what happens to you anymore", "i’ll make you suffer", "you’ll pay for this",
    "i’ll never let you go", "you’re nothing without me", "if you leave me, i’ll kill myself",
    "i'll ruin you", "i'll tell everyone what you did", "i’ll make sure everyone knows",
    "i’m going to destroy your name", "you’ll lose everyone", "i’ll expose you",
    "your friends will hate you", "i’ll post everything", "you’ll be cancelled",
    "you’ll lose everything", "i’ll take the house", "i’ll drain your account",
    "you’ll never see a dime", "you’ll be broke when i’m done", "i’ll make sure you lose your job",
    "i’ll take your kids", "i’ll make sure you have nothing", "you can’t afford to leave me",
    "don't make me do this", "you know what happens when i’m mad", "you’re forcing my hand",
    "if you just behaved, this wouldn’t happen", "this is your fault",
    "you’re making me hurt you", "i warned you", "you should have listened"
]

def compute_abuse_score(matched_scores, sentiment):
    """
    Compute an abuse score (0-100) from matched pattern scores, with conservative
    adjustments. `sentiment` is accepted for call compatibility; the sentiment
    override itself happens in analyze_single_message.
    """
    if not matched_scores:
        return 0.0

    sorted_scores = sorted(matched_scores, key=lambda x: x[1], reverse=True)
    highest_score = sorted_scores[0][1]
    num_patterns = len(matched_scores)

    # Scale down the base score more aggressively if multiple patterns are present
    if num_patterns > 1:
        highest_score *= (1 - (num_patterns - 1) * 0.2)  # reduce by 20% per additional pattern

    base_score = highest_score * 100

    # Multipliers applied when a critical pattern clears 0.5 confidence
    critical_patterns = {
        'gaslighting': 1.4,
        'guilt tripping': 1.3,
        'blame shifting': 1.2,
        'control': 1.3,
        'insults': 1.1,
        'manipulation': 1.2,
        'love bombing': 1.2,
        'emotional blackmail': 1.4,
        'dismissiveness': 1.1,
        'contradictory statements': 1.1
    }

    for label, score, _ in matched_scores:
        if label in critical_patterns and score > 0.5:
            base_score *= critical_patterns[label]

    # Mild combination multipliers
    if len(matched_scores) >= 2:
        base_score *= 1.1
    if len(matched_scores) >= 3:
        base_score *= 1.05

    # Mild high-confidence boost
    if any(score > 0.8 for _, score, _ in matched_scores):
        base_score *= 1.05

    # Minimum floors for very high-confidence patterns
    if any(score > 0.9 for _, score, _ in matched_scores):
        base_score = max(base_score, 75.0)
    elif any(score > 0.7 for _, score, _ in matched_scores):
        base_score = max(base_score, 60.0)

    return min(round(base_score, 1), 100.0)
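
# Worked example with hypothetical matched scores: [("dismissiveness", 0.6, 0.9)]
# -> single pattern, so no multi-pattern reduction: base = 0.6 * 100 = 60
# -> "dismissiveness" is critical and 0.6 > 0.5:      60 * 1.1 = 66
# -> no combination or high-confidence boosts; 0.6 <= 0.7 so no floor applies
# -> returns 66.0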

def analyze_single_message(text, thresholds):
    # No result caching here: `thresholds` is a dict (unhashable), so functools.lru_cache would fail.
    print("⚡ ENTERED analyze_single_message")
    stage = 1
    motif_hits, matched_phrases = detect_motifs(text)

    # Get emotion profile
    emotion_profile = get_emotion_profile(text)
    sentiment_score = emotion_profile.get("anger", 0) + emotion_profile.get("disgust", 0)

    # Get model scores
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).cpu().numpy()

    # Sentiment override: near-neutral messages count as undermining when coercive
    # labels come close to threshold
    if emotion_profile.get("neutral", 0) > 0.85 and any(
        scores[LABELS.index(l)] > thresholds[l] * 0.8  # scale down thresholds for neutral sentiment
        for l in ["control", "blame shifting", "insults", "guilt tripping"]
    ):
        sentiment = "undermining"
    elif sentiment_score > 0.35:
        sentiment = "undermining"
    else:
        sentiment = "supportive"

    weapon_flag = detect_weapon_language(text)

    # Raise thresholds slightly for supportive-sentiment messages
    adjusted_thresholds = {
        k: v + 0.05 if sentiment == "supportive" else v
        for k, v in thresholds.items()
    }

    darvo_score = predict_darvo_score(text)

    threshold_labels = [
        label for label, score in zip(LABELS, scores)
        if score > adjusted_thresholds[label]
    ]
    matched_scores = [
        (label, score, PATTERN_WEIGHTS.get(label, 1.0))
        for label, score in zip(LABELS, scores)
        if score > adjusted_thresholds[label]
    ]

    if not threshold_labels:
        return 0.0, [], [], {"label": sentiment}, 1, 0.0, None

    top_patterns = sorted(
        [(label, score) for label, score in zip(LABELS, scores)],
        key=lambda x: x[1],
        reverse=True
    )[:2]

    # Abuse score (computed before weapon/cap adjustments)
    abuse_score = compute_abuse_score(matched_scores, sentiment)

    if weapon_flag:
        abuse_score = min(abuse_score + 25, 100)  # weapon language bumps the score directly
        if stage < 2:
            stage = 2

    # Cap at 95 unless "control" is among the matched labels
    abuse_score = min(abuse_score, 100 if "control" in threshold_labels else 95)

    tone_tag = get_emotional_tone_tag(emotion_profile, sentiment, threshold_labels, abuse_score)

    # Remove recovery tag if tone is fake
    if "recovery" in threshold_labels and tone_tag == "forced accountability flip":
        threshold_labels.remove("recovery")
        top_patterns = [p for p in top_patterns if p[0] != "recovery"]
        print("⚠️ Removing 'recovery' due to undermining sentiment (not genuine repair)")

    # Override profanity/anger for short texts
    profane_words = {"fuck", "fucking", "bitch", "shit", "cunt", "ho", "asshole", "dick", "whore", "slut"}
    tokens = set(text.lower().split())
    has_profane = any(word in tokens for word in profane_words)
    short_text = len(tokens) <= 10
    anger_score = emotion_profile.get("anger", 0)
    if has_profane and anger_score > 0.75 and short_text:
        print("⚠️ Profanity + Anger Override Triggered")
        insult_score = next((s for l, s in top_patterns if l == "insults"), 0)
        if ("insults", insult_score) not in top_patterns:
            top_patterns = [("insults", insult_score)] + top_patterns
        if "insults" not in threshold_labels:
            threshold_labels.append("insults")

    # Debug
    print(f"Emotional Tone Tag: {tone_tag}")
    print("Emotion Profile:")
    for emotion, score in emotion_profile.items():
        print(f"  {emotion.capitalize():10}: {score}")
    print("\n--- Debug Info ---")
    print(f"Text: {text}")
    print(f"Sentiment (via emotion): {sentiment} (score: {round(sentiment_score, 3)})")
    print("Abuse Pattern Scores:")
    for label, score in zip(LABELS, scores):
        passed = "✅" if score > adjusted_thresholds[label] else "❌"
        print(f"  {label:25} → {score:.3f} {passed}")
    print(f"Matched for score: {[(l, round(s, 3)) for l, s, _ in matched_scores]}")
    print(f"Abuse Score: {round(abuse_score, 1)}")
    print("------------------\n")

    return abuse_score, threshold_labels, top_patterns, {"label": sentiment}, stage, darvo_score, tone_tag


@spaces.GPU
def analyze_composite(msg1, msg2, msg3, *answers_and_none):
    none_selected_checked = answers_and_none[-1]
    responses_checked = any(answers_and_none[:-1])
    none_selected = not responses_checked and none_selected_checked

    if none_selected:
        escalation_score = 0
        escalation_note = "Checklist completed: no danger items reported."
        escalation_completed = True
    elif responses_checked:
        escalation_score = sum(w for (_, w), a in zip(ESCALATION_QUESTIONS, answers_and_none[:-1]) if a)
        escalation_note = "Checklist completed."
        escalation_completed = True
    else:
        escalation_score = None
        escalation_note = "Checklist not completed."
        escalation_completed = False

    messages = [msg1, msg2, msg3]
    active = [(m, f"Message {i+1}") for i, m in enumerate(messages) if m.strip()]
    if not active:
        return "Please enter at least one message.", None

    # Flag any threat phrases present in the messages
    def normalize(text):
        import unicodedata
        text = text.lower().strip()
        text = unicodedata.normalize("NFKD", text)  # handles curly quotes
        text = text.replace("’", "'")  # smart to straight
        return re.sub(r"[^a-z0-9 ]", "", text)

    def detect_threat_motifs(message, motif_list):
        norm_msg = normalize(message)
        return [
            motif for motif in motif_list
            if normalize(motif) in norm_msg
        ]
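
    # Normalization example: normalize("You'll REGRET this!") == "youll regret this",
    # which contains the normalized motif "you'll regret this", so punctuation and
    # smart quotes don't block a match.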

    # Collect matches per message
    immediate_threats = [detect_threat_motifs(m, THREAT_MOTIFS) for m, _ in active]
    flat_threats = [t for sublist in immediate_threats for t in sublist]
    threat_risk = "Yes" if flat_threats else "No"
    results = [(analyze_single_message(m.lower(), THRESHOLDS.copy()), d) for m, d in active]

    abuse_scores = [r[0][0] for r in results]
    stages = [r[0][4] for r in results]
    darvo_scores = [r[0][5] for r in results]
    tone_tags = [r[0][6] for r in results]
    dates_used = [r[1] for r in results]

    predicted_labels = [label for r in results for label, _ in r[0][2]]
    high = {'control'}
    moderate = {'gaslighting', 'dismissiveness', 'obscure language', 'insults', 'contradictory statements', 'guilt tripping'}
    low = {'blame shifting', 'projection', 'recovery phase'}
    counts = {'high': 0, 'moderate': 0, 'low': 0}
    for label in predicted_labels:
        if label in high:
            counts['high'] += 1
        elif label in moderate:
            counts['moderate'] += 1
        elif label in low:
            counts['low'] += 1

    # Pattern escalation logic
    pattern_escalation_risk = "Low"
    if counts['high'] >= 2 and counts['moderate'] >= 2:
        pattern_escalation_risk = "Critical"
    elif (counts['high'] >= 2 and counts['moderate'] >= 1) or (counts['moderate'] >= 3) or (counts['high'] >= 1 and counts['moderate'] >= 2):
        pattern_escalation_risk = "High"
    elif (counts['moderate'] == 2) or (counts['high'] == 1 and counts['moderate'] == 1) or (counts['moderate'] == 1 and counts['low'] >= 2) or (counts['high'] == 1 and sum(counts.values()) == 1):
        pattern_escalation_risk = "Moderate"

    checklist_escalation_risk = "Unknown" if escalation_score is None else (
        "Critical" if escalation_score >= 20 else
        "Moderate" if escalation_score >= 10 else
        "Low"
    )

    escalation_bump = 0
    for result, _ in results:
        abuse_score, _, _, sentiment, stage, darvo_score, tone_tag = result
        if darvo_score > 0.65:
            escalation_bump += 3
        if tone_tag in ["forced accountability flip", "emotional threat"]:
            escalation_bump += 2
        if abuse_score > 80:
            escalation_bump += 2
        if stage == 2:
            escalation_bump += 3

    def rank(label):
        return {"Low": 0, "Moderate": 1, "High": 2, "Critical": 3, "Unknown": 0}.get(label, 0)

    combined_score = rank(pattern_escalation_risk) + rank(checklist_escalation_risk) + escalation_bump
    escalation_risk = (
        "Critical" if combined_score >= 6 else
        "High" if combined_score >= 4 else
        "Moderate" if combined_score >= 2 else
        "Low"
    )
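
    # Example: pattern risk "High" (2) + checklist risk "Moderate" (1) + a bump of 3
    # gives combined_score 6, which maps to "Critical".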

    # Build escalation_text and hybrid_score
    if escalation_score is None:
        escalation_text = (
            "🚫 **Escalation Potential: Unknown** (Checklist not completed)\n"
            "⚠️ This section was not completed. Escalation potential is estimated using message data only.\n"
        )
        hybrid_score = 0
    elif escalation_score == 0:
        escalation_text = (
            "✅ **Escalation Checklist Completed:** No danger items reported.\n"
            "🧭 **Escalation potential estimated from detected message patterns only.**\n"
            f"• Pattern Risk: {pattern_escalation_risk}\n"
            f"• Checklist Risk: None reported\n"
            f"• Escalation Bump: +{escalation_bump} (from DARVO, tone, intensity, etc.)"
        )
        hybrid_score = escalation_bump
    else:
        hybrid_score = escalation_score + escalation_bump
        escalation_text = (
            f"📈 **Escalation Potential: {escalation_risk} ({hybrid_score}/29)**\n"
            "📋 This score combines your safety checklist answers *and* detected high-risk behavior.\n"
            f"• Pattern Risk: {pattern_escalation_risk}\n"
            f"• Checklist Risk: {checklist_escalation_risk}\n"
            f"• Escalation Bump: +{escalation_bump} (from DARVO, tone, intensity, etc.)"
        )

    # Composite abuse score: weighted average, weighting each message by its word count
    composite_abuse_scores = []
    message_lengths = [len(m.split()) for m, _ in active]
    total_length = sum(message_lengths)

    for result, length in zip(results, message_lengths):
        abuse_score = result[0][0]
        weight = length / total_length if total_length > 0 else 1 / len(results)
        composite_abuse_scores.append(abuse_score * weight)
    composite_abuse = int(round(sum(composite_abuse_scores)))
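
    # Example: two messages of 10 and 30 words with scores 80 and 40 get weights
    # 0.25 and 0.75, so composite_abuse = round(80*0.25 + 40*0.75) = 50.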

    most_common_stage = max(set(stages), key=stages.count)
    stage_text = RISK_STAGE_LABELS[most_common_stage]

    # Safely derive a top label for each message
    top_labels = []
    for result, _ in results:
        threshold_labels = result[1]
        top_patterns = result[2]
        if threshold_labels:
            top_labels.append(threshold_labels[0])
        elif top_patterns:
            top_labels.append(top_patterns[0][0])
        else:
            top_labels.append("none")

    avg_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    darvo_blurb = ""
    if avg_darvo > 0.25:
        level = "moderate" if avg_darvo < 0.65 else "high"
        darvo_blurb = f"\n\n🎭 **DARVO Score: {avg_darvo}** → This indicates a **{level} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."

    out = f"Abuse Intensity: {composite_abuse}%\n"
    out += "📊 This reflects the strength and severity of detected abuse patterns in the message(s).\n\n"
    out += generate_risk_snippet(composite_abuse, top_labels[0], hybrid_score, most_common_stage)
    out += f"\n\n{stage_text}"
    out += darvo_blurb
    out += "\n\n🎭 **Emotional Tones Detected:**\n"
    for i, tone in enumerate(tone_tags):
        out += f"• Message {i+1}: *{tone or 'none'}*\n"

    # --- Immediate danger threats section
    if flat_threats:
        out += "\n\n🚨 **Immediate Danger Threats Detected:**\n"
        for t in set(flat_threats):
            out += f"• \"{t}\"\n"
        out += "\n⚠️ These phrases may indicate an imminent risk to physical safety."
    else:
        out += "\n\n🧩 **Immediate Danger Threats:** None explicitly detected.\n"
        out += "This does *not* rule out risk, but no direct threat phrases were matched."

    timeline_image = generate_abuse_score_chart(dates_used, abuse_scores, top_labels)
    out += "\n\n" + escalation_text
    return out, timeline_image


textbox_inputs = [gr.Textbox(label=f"Message {i+1}") for i in range(3)]
quiz_boxes = [gr.Checkbox(label=q) for q, _ in ESCALATION_QUESTIONS]
none_box = gr.Checkbox(label="None of the above")


# ─── FINAL “FORCE LAUNCH” (no guards) ────────────────────────

demo = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + quiz_boxes + [none_box],
    outputs=[
        gr.Textbox(label="Results"),
        gr.Image(label="Abuse Score Timeline", type="pil")
    ],
    title="Abuse Pattern Detector + Escalation Quiz",
    description=(
        "Enter up to three messages that concern you. "
        "For the most accurate results, include messages from a recent emotionally intense period."
    ),
    flagging_mode="manual"
)

# This single call will start the server and block,
# keeping the container alive on Spaces.
demo.launch()