SamanthaStorm committed on
Commit 9d2e492 · verified · 1 Parent(s): ddbb48e

Delete app.py

Files changed (1)
  1. app.py +0 -816
app.py DELETED
@@ -1,816 +0,0 @@
- import gradio as gr
- import spaces
- import torch
- import re
- import io
- import unicodedata
- import matplotlib.pyplot as plt
- from PIL import Image
- from datetime import datetime
- from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
- from motif_tagging import detect_motifs
- from functools import lru_cache
- from torch.nn.functional import sigmoid
-
- # ----- Models -----
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- # Emotion model (no retraining needed)
- emotion_pipeline = pipeline(
-     "text-classification",
-     model="j-hartmann/emotion-english-distilroberta-base",
-     top_k=6,
-     truncation=True,
-     device=-1  # Force CPU usage for stability
- )
-
- # Abuse model
- model_name = "SamanthaStorm/tether-multilabel-v4"
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
- tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
- model.to(device)
- model.eval()
-
- # DARVO model (regression head)
- darvo_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-darvo-regressor-v1")
- darvo_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-darvo-regressor-v1", use_fast=False)
- darvo_model.to(device)
- darvo_model.eval()
-
- def get_emotion_profile(text):
-     emotions = emotion_pipeline(text)
-     if isinstance(emotions, list) and isinstance(emotions[0], list):
-         emotions = emotions[0]
-     return {e['label'].lower(): round(e['score'], 3) for e in emotions}
-
- # --- Timeline Visualization Function ---
- def generate_abuse_score_chart(dates, scores, labels):
-     # Use real dates if every entry parses as YYYY-MM-DD; otherwise fall back to message indices
-     if all(re.match(r"\d{4}-\d{2}-\d{2}", d) for d in dates):
-         parsed_x = [datetime.strptime(d, "%Y-%m-%d") for d in dates]
-         x_labels = [d.strftime("%Y-%m-%d") for d in parsed_x]
-     else:
-         parsed_x = list(range(1, len(dates) + 1))
-         x_labels = [f"Message {i+1}" for i in range(len(dates))]
-
-     fig, ax = plt.subplots(figsize=(8, 3))
-     ax.plot(parsed_x, scores, marker='o', linestyle='-', color='darkred', linewidth=2)
-
-     for x, y in zip(parsed_x, scores):
-         ax.text(x, y + 2, f"{int(y)}%", ha='center', fontsize=8, color='black')
-
-     ax.set_xticks(parsed_x)
-     ax.set_xticklabels(x_labels)
-     ax.set_xlabel("")  # no axis label
-     ax.set_ylabel("Abuse Score (%)")
-     ax.set_ylim(0, 105)
-     ax.grid(True)
-     plt.tight_layout()
-
-     buf = io.BytesIO()
-     plt.savefig(buf, format='png')
-     plt.close(fig)  # release the figure so repeated calls don't leak memory
-     buf.seek(0)
-     return Image.open(buf)
-
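- # Output labels of the multi-label classifier. Each label gets an independent
- # sigmoid score; a label "fires" when its score clears its per-label threshold below.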
- LABELS = [
-     "recovery", "control", "gaslighting", "guilt tripping", "dismissiveness", "blame shifting",
-     "nonabusive", "projection", "insults", "contradictory statements", "obscure language"
- ]
-
- THRESHOLDS = {
-     "recovery": 0.27,
-     "control": 0.47,
-     "gaslighting": 0.48,
-     "guilt tripping": 0.56,
-     "dismissiveness": 0.25,
-     "blame shifting": 0.55,
-     "projection": 0.59,
-     "insults": 0.33,
-     "contradictory statements": 0.27,
-     "obscure language": 0.65,
-     "nonabusive": 1.0
- }
-
- PATTERN_WEIGHTS = {
-     "recovery": 0.5,
-     "control": 1.4,
-     "gaslighting": 1.0,
-     "guilt tripping": 0.9,
-     "dismissiveness": 0.9,
-     "blame shifting": 0.8,
-     "projection": 0.5,
-     "insults": 1.2,
-     "contradictory statements": 1.0,
-     "obscure language": 0.9,
-     "nonabusive": 0.0
- }
-
- ESCALATION_RISKS = {
-     "blame shifting": "low",
-     "contradictory statements": "moderate",
-     "control": "high",
-     "dismissiveness": "moderate",
-     "gaslighting": "moderate",
-     "guilt tripping": "moderate",
-     "insults": "moderate",
-     "obscure language": "low",
-     "projection": "low",
-     "recovery phase": "low"
- }
-
- RISK_STAGE_LABELS = {
-     1: "🌀 Risk Stage: Tension-Building\nThis message reflects rising emotional pressure or subtle control attempts.",
-     2: "🔥 Risk Stage: Escalation\nThis message includes direct or aggressive patterns, suggesting active harm.",
-     3: "🌧️ Risk Stage: Reconciliation\nThis message reflects a reset attempt—apologies or emotional repair without accountability.",
-     4: "🌸 Risk Stage: Calm / Honeymoon\nThis message appears supportive but may follow prior harm, minimizing it."
- }
-
- ESCALATION_QUESTIONS = [
-     ("Partner has access to firearms or weapons", 4),
-     ("Partner threatened to kill you", 3),
-     ("Partner threatened you with a weapon", 3),
-     ("Partner has ever choked you, even if you considered it consensual at the time", 4),
-     ("Partner injured or threatened your pet(s)", 3),
-     ("Partner has broken your things, punched or kicked walls, or thrown things", 2),
-     ("Partner forced or coerced you into unwanted sexual acts", 3),
-     ("Partner threatened to take away your children", 2),
-     ("Violence has increased in frequency or severity", 3),
-     ("Partner monitors your calls/GPS/social media", 2)
- ]
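-
- # Map the emotion profile, sentiment, and detected patterns to a coarse tone tag.
- # Rules are checked in order; the first match wins, and None means no tag applies.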
- def get_emotional_tone_tag(emotions, sentiment, patterns, abuse_score):
-     sadness = emotions.get("sadness", 0)
-     joy = emotions.get("joy", 0)
-     neutral = emotions.get("neutral", 0)
-     disgust = emotions.get("disgust", 0)
-     anger = emotions.get("anger", 0)
-     fear = emotions.get("fear", 0)
-
-     # 1. Performative Regret
-     if (
-         sadness > 0.4 and
-         any(p in patterns for p in ["blame shifting", "guilt tripping", "recovery phase"]) and
-         (sentiment == "undermining" or abuse_score > 40)
-     ):
-         return "performative regret"
-
-     # 2. Coercive Warmth
-     if (
-         (joy > 0.3 or sadness > 0.4) and
-         any(p in patterns for p in ["control", "gaslighting"]) and
-         sentiment == "undermining"
-     ):
-         return "coercive warmth"
-
-     # 3. Cold Invalidation
-     if (
-         (neutral + disgust) > 0.5 and
-         any(p in patterns for p in ["dismissiveness", "projection", "obscure language"]) and
-         sentiment == "undermining"
-     ):
-         return "cold invalidation"
-
-     # 4. Genuine Vulnerability
-     if (
-         (sadness + fear) > 0.5 and
-         sentiment == "supportive" and
-         all(p in ["recovery phase"] for p in patterns)
-     ):
-         return "genuine vulnerability"
-
-     # 5. Emotional Threat
-     if (
-         (anger + disgust) > 0.5 and
-         any(p in patterns for p in ["control", "insults", "dismissiveness"]) and
-         sentiment == "undermining"
-     ):
-         return "emotional threat"
-
-     # 6. Weaponized Sadness
-     if (
-         sadness > 0.6 and
-         any(p in patterns for p in ["guilt tripping", "projection"]) and
-         sentiment == "undermining"
-     ):
-         return "weaponized sadness"
-
-     # 7. Toxic Resignation
-     if (
-         neutral > 0.5 and
-         any(p in patterns for p in ["dismissiveness", "obscure language"]) and
-         sentiment == "undermining"
-     ):
-         return "toxic resignation"
-
-     # 8. Aggressive Dismissal
-     if (
-         anger > 0.5 and
-         any(p in patterns for p in ["aggression", "insults", "control"]) and
-         sentiment == "undermining"
-     ):
-         return "aggressive dismissal"
-
-     # 9. Deflective Hostility
-     if (
-         (0.2 < anger < 0.7 or 0.2 < disgust < 0.7) and
-         any(p in patterns for p in ["deflection", "projection"]) and
-         sentiment == "undermining"
-     ):
-         return "deflective hostility"
-
-     # 10. Mocking Detachment
-     if (
-         (neutral + joy) > 0.5 and
-         any(p in patterns for p in ["mockery", "insults", "projection"]) and
-         sentiment == "undermining"
-     ):
-         return "mocking detachment"
-
-     # 11. Contradictory Gaslight
-     if (
-         (joy + anger + sadness) > 0.5 and
-         any(p in patterns for p in ["gaslighting", "contradictory statements"]) and
-         sentiment == "undermining"
-     ):
-         return "contradictory gaslight"
-
-     # 12. Calculated Neutrality
-     if (
-         neutral > 0.6 and
-         any(p in patterns for p in ["obscure language", "deflection", "dismissiveness"]) and
-         sentiment == "undermining"
-     ):
-         return "calculated neutrality"
-
-     # 13. Forced Accountability Flip
-     if (
-         (anger + disgust) > 0.5 and
-         any(p in patterns for p in ["blame shifting", "manipulation", "projection", "deflection"]) and
-         sentiment == "undermining"
-     ):
-         return "forced accountability flip"
-
-     # 14. Conditional Affection
-     if (
-         joy > 0.4 and
-         any(p in patterns for p in ["apology baiting", "control", "recovery phase"]) and
-         sentiment == "undermining"
-     ):
-         return "conditional affection"
-
-     # Emotional instability fallback
-     if (
-         (anger + sadness + disgust) > 0.6 and
-         sentiment == "undermining"
-     ):
-         return "emotional instability"
-
-     return None
-
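- # The DARVO regressor emits a single logit; sigmoid maps it to a 0–1 score.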
- def predict_darvo_score(text):
-     inputs = darvo_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
-     with torch.no_grad():
-         logits = darvo_model(**inputs).logits
-     score = sigmoid(logits).item()
-     return round(score, 4)  # rounded for display/output
-
- def detect_weapon_language(text):
-     weapon_keywords = [
-         "knife", "knives", "stab", "cut you", "cutting",
-         "gun", "shoot", "rifle", "firearm", "pistol",
-         "bomb", "blow up", "grenade", "explode",
-         "weapon", "armed", "loaded", "kill you", "take you out"
-     ]
-     text_lower = text.lower()
-     return any(word in text_lower for word in weapon_keywords)
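-
- # Heuristic cycle-of-abuse stage: 1 tension-building, 2 escalation,
- # 3 reconciliation, 4 calm/honeymoon (see RISK_STAGE_LABELS above).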
- def get_risk_stage(patterns, sentiment):
-     if "insults" in patterns:
-         return 2
-     elif "recovery phase" in patterns:
-         return 3
-     elif "control" in patterns or "guilt tripping" in patterns:
-         return 1
-     elif sentiment == "supportive" and any(p in patterns for p in ["projection", "dismissiveness"]):
-         return 4
-     return 1
-
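- # Build the human-readable risk summary. top_label may look like
- # "<pattern> – <score>", in which case it is split into label and score.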
- def generate_risk_snippet(abuse_score, top_label, escalation_score, stage):
-     # Extract aggression score if aggression is detected
-     if isinstance(top_label, str) and "aggression" in top_label.lower():
-         try:
-             match = re.search(r"\(?(\d+)\%?\)?", top_label)
-             aggression_score = int(match.group(1)) / 100 if match else 0
-         except (ValueError, AttributeError):
-             aggression_score = 0
-     else:
-         aggression_score = 0
-
-     # Risk level logic
-     if abuse_score >= 85 or escalation_score >= 16:
-         risk_level = "high"
-     elif abuse_score >= 60 or escalation_score >= 8 or aggression_score >= 0.25:
-         risk_level = "moderate"
-     elif stage == 2 and abuse_score >= 40:
-         risk_level = "moderate"
-     else:
-         risk_level = "low"
-
-     if isinstance(top_label, str) and " – " in top_label:
-         pattern_label, pattern_score = top_label.split(" – ")
-     else:
-         pattern_label = str(top_label) if top_label is not None else "Unknown"
-         pattern_score = ""
-
-     WHY_FLAGGED = {
-         "control": "This message may reflect efforts to restrict someone’s autonomy, even if it's framed as concern or care.",
-         "gaslighting": "This message could be manipulating someone into questioning their perception or feelings.",
-         "dismissiveness": "This message may include belittling, invalidating, or ignoring the other person’s experience.",
-         "insults": "Direct insults often appear in escalating abusive dynamics and can erode emotional safety.",
-         "blame shifting": "This message may redirect responsibility to avoid accountability, especially during conflict.",
-         "guilt tripping": "This message may induce guilt in order to control or manipulate behavior.",
-         "recovery phase": "This message may be part of a tension-reset cycle, appearing kind but avoiding change.",
-         "projection": "This message may involve attributing the abuser’s own behaviors to the victim.",
-         "contradictory statements": "This message may contain internal contradictions used to confuse, destabilize, or deflect responsibility.",
-         "obscure language": "This message may use overly formal, vague, or complex language to obscure meaning or avoid accountability.",
-         "default": "This message contains language patterns that may affect safety, clarity, or emotional autonomy."
-     }
-
-     explanation = WHY_FLAGGED.get(pattern_label.lower(), WHY_FLAGGED["default"])
-
-     base = f"\n\n🛑 Risk Level: {risk_level.capitalize()}\n"
-     base += f"This message shows strong indicators of **{pattern_label}**. "
-
-     if risk_level == "high":
-         base += "The language may reflect patterns of emotional control, even when expressed in soft or caring terms.\n"
-     elif risk_level == "moderate":
-         base += "There are signs of emotional pressure or verbal aggression that may escalate if repeated.\n"
-     else:
-         base += "The message does not strongly indicate abuse, but it's important to monitor for patterns.\n"
-
-     base += f"\n💡 *Why this might be flagged:*\n{explanation}\n"
-     base += f"\nDetected Pattern: **{pattern_label} ({pattern_score})**\n"
-     base += "🧠 You can review the pattern in context. This tool highlights possible dynamics—not judgments."
-     return base
-
- # --- Immediate Danger Threat Phrases ---
- THREAT_MOTIFS = [
-     "i'll kill you", "i’m going to hurt you", "you’re dead", "you won't survive this",
-     "i’ll break your face", "i'll bash your head in", "i’ll snap your neck",
-     "i’ll come over there and make you shut up", "i'll knock your teeth out",
-     "you’re going to bleed", "you want me to hit you?", "i won’t hold back next time",
-     "i swear to god i’ll beat you", "next time, i won’t miss", "i’ll make you scream",
-     "i know where you live", "i'm outside", "i’ll be waiting", "i saw you with him",
-     "you can’t hide from me", "i’m coming to get you", "i'll find you", "i know your schedule",
-     "i watched you leave", "i followed you home", "you'll regret this", "you’ll be sorry",
-     "you’re going to wish you hadn’t", "you brought this on yourself", "don’t push me",
-     "you have no idea what i’m capable of", "you better watch yourself",
-     "i don’t care what happens to you anymore", "i’ll make you suffer", "you’ll pay for this",
-     "i’ll never let you go", "you’re nothing without me", "if you leave me, i’ll kill myself",
-     "i'll ruin you", "i'll tell everyone what you did", "i’ll make sure everyone knows",
-     "i’m going to destroy your name", "you’ll lose everyone", "i’ll expose you",
-     "your friends will hate you", "i’ll post everything", "you’ll be cancelled",
-     "you’ll lose everything", "i’ll take the house", "i’ll drain your account",
-     "you’ll never see a dime", "you’ll be broke when i’m done", "i’ll make sure you lose your job",
-     "i’ll take your kids", "i’ll make sure you have nothing", "you can’t afford to leave me",
-     "don't make me do this", "you know what happens when i’m mad", "you’re forcing my hand",
-     "if you just behaved, this wouldn’t happen", "this is your fault",
-     "you’re making me hurt you", "i warned you", "you should have listened"
- ]
-
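- # Abuse score: take the highest pattern score (discounted when several patterns
- # co-occur), boost for critical patterns, combinations, and high-confidence hits,
- # then floor very confident detections and cap the result at 100.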
- def compute_abuse_score(matched_scores, sentiment):
-     """
-     Compute abuse score with more conservative adjustments.
-     """
-     if not matched_scores:
-         return 0.0
-
-     sorted_scores = sorted(matched_scores, key=lambda x: x[1], reverse=True)
-     highest_score = sorted_scores[0][1]
-     num_patterns = len(matched_scores)
-
-     # Scale down the base score when multiple patterns are present
-     if num_patterns > 1:
-         highest_score *= (1 - (num_patterns - 1) * 0.2)  # reduce by 20% per additional pattern
-
-     base_score = highest_score * 100
-
-     critical_patterns = {
-         'gaslighting': 1.4,
-         'guilt tripping': 1.3,
-         'blame shifting': 1.2,
-         'control': 1.3,
-         'insults': 1.1,
-         'manipulation': 1.2,
-         'love bombing': 1.2,
-         'emotional blackmail': 1.4,
-         'dismissiveness': 1.1,
-         'contradictory statements': 1.1
-     }
-
-     for label, score, _ in matched_scores:
-         if label in critical_patterns and score > 0.5:
-             base_score *= critical_patterns[label]
-
-     # Modest combination multipliers
-     if len(matched_scores) >= 2:
-         base_score *= 1.1
-     if len(matched_scores) >= 3:
-         base_score *= 1.05
-
-     # Small boost for high-confidence hits
-     if any(score > 0.8 for _, score, _ in matched_scores):
-         base_score *= 1.05
-
-     # Minimum-score floors for very confident detections
-     if any(score > 0.9 for _, score, _ in matched_scores):
-         base_score = max(base_score, 75.0)
-     elif any(score > 0.7 for _, score, _ in matched_scores):
-         base_score = max(base_score, 60.0)
-
-     return min(round(base_score, 1), 100.0)
-
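- # Per-message analysis. thresholds_items is a hashable tuple of (label, threshold)
- # pairs so lru_cache can memoize results. Returns (abuse_score, threshold_labels,
- # top_patterns, {"label": sentiment}, stage, darvo_score, tone_tag).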
- @lru_cache(maxsize=1024)  # cache results for performance
- def analyze_single_message(text, thresholds_items):
-     print("⚡ ENTERED analyze_single_message")
-     thresholds = dict(thresholds_items)
-     stage = 1
-     motif_hits, matched_phrases = detect_motifs(text)
-
-     # Get emotion profile
-     emotion_profile = get_emotion_profile(text)
-     sentiment_score = emotion_profile.get("anger", 0) + emotion_profile.get("disgust", 0)
-
-     # Get model scores
-     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
-     with torch.no_grad():
-         outputs = model(**inputs)
-         scores = torch.sigmoid(outputs.logits.squeeze(0)).cpu().numpy()
-
-     # Sentiment override: a mostly-neutral emotion profile still counts as
-     # undermining when key patterns clear 80% of their thresholds
-     if emotion_profile.get("neutral", 0) > 0.85 and any(
-         scores[LABELS.index(l)] > thresholds[l] * 0.8
-         for l in ["control", "blame shifting", "insults", "guilt tripping"]
-     ):
-         sentiment = "undermining"
-     elif sentiment_score > 0.35:
-         sentiment = "undermining"
-     else:
-         sentiment = "supportive"
-
-     weapon_flag = detect_weapon_language(text)
-
-     adjusted_thresholds = {
-         k: v + 0.05 if sentiment == "supportive" else v
-         for k, v in thresholds.items()
-     }
-
-     darvo_score = predict_darvo_score(text)
-
-     threshold_labels = [
-         label for label, score in zip(LABELS, scores)
-         if score > adjusted_thresholds[label]
-     ]
-     matched_scores = [
-         (label, score, PATTERN_WEIGHTS.get(label, 1.0))
-         for label, score in zip(LABELS, scores)
-         if score > adjusted_thresholds[label]
-     ]
-
-     if not threshold_labels:
-         return 0.0, [], [], {"label": sentiment}, 1, 0.0, None
-
-     top_patterns = sorted(
-         [(label, score) for label, score in zip(LABELS, scores)],
-         key=lambda x: x[1],
-         reverse=True
-     )[:2]
-
-     # Abuse score (before weapon adjustment and cap)
-     abuse_score = compute_abuse_score(matched_scores, sentiment)
-
-     if weapon_flag:
-         abuse_score = min(abuse_score + 25, 100)  # apply weapon adjustment directly
-         if stage < 2:
-             stage = 2
-
-     abuse_score = min(abuse_score, 100 if "control" in threshold_labels else 95)  # cap after weapon adjustment
-
-     tone_tag = get_emotional_tone_tag(emotion_profile, sentiment, threshold_labels, abuse_score)
-
-     # Remove recovery tag if the tone suggests the repair isn't genuine
-     if "recovery" in threshold_labels and tone_tag == "forced accountability flip":
-         threshold_labels.remove("recovery")
-         top_patterns = [p for p in top_patterns if p[0] != "recovery"]
-         print("⚠️ Removing 'recovery' due to undermining sentiment (not genuine repair)")
-
-     # Override profanity/anger for short texts
-     profane_words = {"fuck", "fucking", "bitch", "shit", "cunt", "ho", "asshole", "dick", "whore", "slut"}
-     tokens = set(text.lower().split())
-     has_profane = any(word in tokens for word in profane_words)
-     short_text = len(tokens) <= 10
-     anger_score = emotion_profile.get("anger", 0)
-     if has_profane and anger_score > 0.75 and short_text:
-         print("⚠️ Profanity + Anger Override Triggered")
-         insult_score = next((s for l, s in top_patterns if l == "insults"), 0)
-         if ("insults", insult_score) not in top_patterns:
-             top_patterns = [("insults", insult_score)] + top_patterns
-         if "insults" not in threshold_labels:
-             threshold_labels.append("insults")
-
-     # Debug
-     print(f"Emotional Tone Tag: {tone_tag}")
-     print("Emotion Profile:")
-     for emotion, score in emotion_profile.items():
-         print(f"  {emotion.capitalize():10}: {score}")
-     print("\n--- Debug Info ---")
-     print(f"Text: {text}")
-     print(f"Sentiment (via emotion): {sentiment} (score: {round(sentiment_score, 3)})")
-     print("Abuse Pattern Scores:")
-     for label, score in zip(LABELS, scores):
-         passed = "✅" if score > adjusted_thresholds[label] else "❌"
-         print(f"  {label:25} → {score:.3f} {passed}")
-     print(f"Matched for score: {[(l, round(s, 3)) for l, s, _ in matched_scores]}")
-     print(f"Abuse Score: {round(abuse_score, 1)}")
-     print("------------------\n")
-
-     return abuse_score, threshold_labels, top_patterns, {"label": sentiment}, stage, darvo_score, tone_tag
-
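- # Top-level Gradio handler: analyze up to three messages plus the safety checklist,
- # then blend pattern risk, checklist risk, and per-message bumps into one report.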
- @spaces.GPU
- def analyze_composite(msg1, msg2, msg3, *answers_and_none):
-     none_selected_checked = answers_and_none[-1]
-     responses_checked = any(answers_and_none[:-1])
-     none_selected = not responses_checked and none_selected_checked
-
-     if none_selected:
-         escalation_score = 0
-         escalation_note = "Checklist completed: no danger items reported."
-         escalation_completed = True
-     elif responses_checked:
-         escalation_score = sum(w for (_, w), a in zip(ESCALATION_QUESTIONS, answers_and_none[:-1]) if a)
-         escalation_note = "Checklist completed."
-         escalation_completed = True
-     else:
-         escalation_score = None
-         escalation_note = "Checklist not completed."
-         escalation_completed = False
-
-     messages = [msg1, msg2, msg3]
-     active = [(m, f"Message {i+1}") for i, m in enumerate(messages) if m.strip()]
-     if not active:
-         return "Please enter at least one message.", None
-
-     # Flag any threat phrases present in the messages
-     def normalize(text):
-         text = text.lower().strip()
-         text = unicodedata.normalize("NFKD", text)  # handles curly quotes
-         text = text.replace("’", "'")  # smart to straight
-         return re.sub(r"[^a-z0-9 ]", "", text)
-
-     def detect_threat_motifs(message, motif_list):
-         norm_msg = normalize(message)
-         return [
-             motif for motif in motif_list
-             if normalize(motif) in norm_msg
-         ]
-
-     # Collect matches per message
-     immediate_threats = [detect_threat_motifs(m, THREAT_MOTIFS) for m, _ in active]
-     flat_threats = [t for sublist in immediate_threats for t in sublist]
-     threat_risk = "Yes" if flat_threats else "No"
-
-     results = [
-         (analyze_single_message(m.lower(), tuple(THRESHOLDS.items())), d)
-         for m, d in active
-     ]
-
-     abuse_scores = [r[0][0] for r in results]
-     stages = [r[0][4] for r in results]
-     darvo_scores = [r[0][5] for r in results]
-     tone_tags = [r[0][6] for r in results]
-     dates_used = [r[1] for r in results]
-
-     predicted_labels = [label for r in results for label, _ in r[0][2]]
-     high = {'control'}
-     moderate = {'gaslighting', 'dismissiveness', 'obscure language', 'insults', 'contradictory statements', 'guilt tripping'}
-     low = {'blame shifting', 'projection', 'recovery phase'}
-     counts = {'high': 0, 'moderate': 0, 'low': 0}
-     for label in predicted_labels:
-         if label in high:
-             counts['high'] += 1
-         elif label in moderate:
-             counts['moderate'] += 1
-         elif label in low:
-             counts['low'] += 1
-
-     # Pattern escalation logic
-     pattern_escalation_risk = "Low"
-     if counts['high'] >= 2 and counts['moderate'] >= 2:
-         pattern_escalation_risk = "Critical"
-     elif (counts['high'] >= 2 and counts['moderate'] >= 1) or (counts['moderate'] >= 3) or (counts['high'] >= 1 and counts['moderate'] >= 2):
-         pattern_escalation_risk = "High"
-     elif (counts['moderate'] == 2) or (counts['high'] == 1 and counts['moderate'] == 1) or (counts['moderate'] == 1 and counts['low'] >= 2) or (counts['high'] == 1 and sum(counts.values()) == 1):
-         pattern_escalation_risk = "Moderate"
-
-     checklist_escalation_risk = "Unknown" if escalation_score is None else (
-         "Critical" if escalation_score >= 20 else
-         "Moderate" if escalation_score >= 10 else
-         "Low"
-     )
-
-     escalation_bump = 0
-     for result, _ in results:
-         abuse_score, _, _, sentiment, stage, darvo_score, tone_tag = result
-         if darvo_score > 0.65:
-             escalation_bump += 3
-         if tone_tag in ["forced accountability flip", "emotional threat"]:
-             escalation_bump += 2
-         if abuse_score > 80:
-             escalation_bump += 2
-         if stage == 2:
-             escalation_bump += 3
-
-     def rank(label):
-         return {"Low": 0, "Moderate": 1, "High": 2, "Critical": 3, "Unknown": 0}.get(label, 0)
-
-     combined_score = rank(pattern_escalation_risk) + rank(checklist_escalation_risk) + escalation_bump
-     escalation_risk = (
-         "Critical" if combined_score >= 6 else
-         "High" if combined_score >= 4 else
-         "Moderate" if combined_score >= 2 else
-         "Low"
-     )
-
-     # Build escalation_text and hybrid_score
-     if escalation_score is None:
-         escalation_text = (
-             "🚫 **Escalation Potential: Unknown** (Checklist not completed)\n"
-             "⚠️ This section was not completed. Escalation potential is estimated using message data only.\n"
-         )
-         hybrid_score = 0
-     elif escalation_score == 0:
-         escalation_text = (
-             "✅ **Escalation Checklist Completed:** No danger items reported.\n"
-             "🧭 **Escalation potential estimated from detected message patterns only.**\n"
-             f"• Pattern Risk: {pattern_escalation_risk}\n"
-             f"• Checklist Risk: None reported\n"
-             f"• Escalation Bump: +{escalation_bump} (from DARVO, tone, intensity, etc.)"
-         )
-         hybrid_score = escalation_bump
-     else:
-         hybrid_score = escalation_score + escalation_bump
-         escalation_text = (
-             f"📈 **Escalation Potential: {escalation_risk} ({hybrid_score}/29)**\n"
-             "📋 This score combines your safety checklist answers *and* detected high-risk behavior.\n"
-             f"• Pattern Risk: {pattern_escalation_risk}\n"
-             f"• Checklist Risk: {checklist_escalation_risk}\n"
-             f"• Escalation Bump: +{escalation_bump} (from DARVO, tone, intensity, etc.)"
-         )
-
-     # Composite abuse score: weighted average, weighting each message by word count
-     composite_abuse_scores = []
-     message_lengths = [len(m.split()) for m, _ in active]
-     total_length = sum(message_lengths)
-
-     for result, length in zip(results, message_lengths):
-         abuse_score = result[0][0]
-         weight = length / total_length if total_length > 0 else 1 / len(results)
-         composite_abuse_scores.append(abuse_score * weight)
-     composite_abuse = int(round(sum(composite_abuse_scores)))
-
-     most_common_stage = max(set(stages), key=stages.count)
-     stage_text = RISK_STAGE_LABELS[most_common_stage]
-
-     # Derive the top label for each message, falling back to the highest-scoring pattern
-     top_labels = []
-     for result, _ in results:
-         threshold_labels = result[1]
-         top_patterns = result[2]
-         if threshold_labels:
-             top_labels.append(threshold_labels[0])
-         elif top_patterns:
-             top_labels.append(top_patterns[0][0])
-         else:
-             top_labels.append("none")
-
-     avg_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
-     darvo_blurb = ""
-     if avg_darvo > 0.25:
-         level = "moderate" if avg_darvo < 0.65 else "high"
-         darvo_blurb = f"\n\n🎭 **DARVO Score: {avg_darvo}** → This indicates a **{level} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
-
-     out = f"Abuse Intensity: {composite_abuse}%\n"
-     out += "📊 This reflects the strength and severity of detected abuse patterns in the message(s).\n\n"
-     out += generate_risk_snippet(composite_abuse, top_labels[0], hybrid_score, most_common_stage)
-     out += f"\n\n{stage_text}"
-     out += darvo_blurb
-     out += "\n\n🎭 **Emotional Tones Detected:**\n"
-     for i, tone in enumerate(tone_tags):
-         out += f"• Message {i+1}: *{tone or 'none'}*\n"
-
-     # Immediate danger threats section
-     if flat_threats:
-         out += "\n\n🚨 **Immediate Danger Threats Detected:**\n"
-         for t in set(flat_threats):
-             out += f"• \"{t}\"\n"
-         out += "\n⚠️ These phrases may indicate an imminent risk to physical safety."
-     else:
-         out += "\n\n🧩 **Immediate Danger Threats:** None explicitly detected.\n"
-         out += "This does *not* rule out risk, but no direct threat phrases were matched."
-
-     timeline_image = generate_abuse_score_chart(dates_used, abuse_scores, top_labels)
-     out += "\n\n" + escalation_text
-     return out, timeline_image
-
- textbox_inputs = [gr.Textbox(label=f"Message {i+1}") for i in range(3)]
- quiz_boxes = [gr.Checkbox(label=q) for q, _ in ESCALATION_QUESTIONS]
- none_box = gr.Checkbox(label="None of the above")
-
- # ─── FINAL “FORCE LAUNCH” (no guards) ────────────────────────
-
- demo = gr.Interface(
-     fn=analyze_composite,
-     inputs=textbox_inputs + quiz_boxes + [none_box],
-     outputs=[
-         gr.Textbox(label="Results"),
-         gr.Image(label="Abuse Score Timeline", type="pil")
-     ],
-     title="Abuse Pattern Detector + Escalation Quiz",
-     description=(
-         "Enter up to three messages that concern you. "
-         "For the most accurate results, include messages from a recent emotionally intense period."
-     ),
-     flagging_mode="manual"
- )
-
- # This single call will start the server and block,
- # keeping the container alive on Spaces.
- demo.launch()