SamanthaStorm commited on
Commit
cdaacf1
·
verified ·
1 Parent(s): 9af038c

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -790
app.py DELETED
@@ -1,790 +0,0 @@
1
- import gradio as gr
2
- import spaces
3
- import torch
4
- import numpy as np
5
- from transformers import pipeline, RobertaForSequenceClassification, RobertaTokenizer
6
- from motif_tagging import detect_motifs
7
- import re
8
- import matplotlib.pyplot as plt
9
- import io
10
- from PIL import Image
11
- from datetime import datetime
12
- from transformers import pipeline as hf_pipeline # prevent name collision with gradio pipeline
13
- from functools import lru_cache # Import lru_cache
14
-
15
def get_emotion_profile(text):
    """Return ``{emotion label (lower-cased): score rounded to 3 places}`` for *text*.

    The scores come from the module-level ``emotion_pipeline``.
    """
    raw = emotion_pipeline(text)
    # Unwrap one level when the pipeline hands back a list of lists.
    is_nested = isinstance(raw, list) and isinstance(raw[0], list)
    entries = raw[0] if is_nested else raw

    profile = {}
    for entry in entries:
        profile[entry['label'].lower()] = round(entry['score'], 3)
    return profile
20
# Emotion model (no retraining needed).
# Built through `hf_pipeline`, the alias under which this file imports
# `transformers.pipeline`. `get_emotion_profile` calls this object and turns
# its output into a {label: score} dict.
# NOTE(review): `top_k=6` presumably limits the output to the six
# highest-scoring labels per input - confirm against the transformers docs.
emotion_pipeline = hf_pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=6,
    truncation=True
)
27
-
28
- # --- Timeline Visualization Function ---
29
def generate_abuse_score_chart(dates, scores, labels):
    """Render abuse scores as a line chart and return it as a PIL image.

    Args:
        dates: One x-axis entry per score. When every entry is a valid
            ``YYYY-MM-DD`` string the x-axis uses real dates; otherwise the
            points are numbered ``Message 1`` .. ``Message N``.
        scores: Abuse scores in percent, one per entry of ``dates``.
        labels: Accepted for interface compatibility; the chart does not
            use it.

    Returns:
        The chart as a PIL image decoded from an in-memory PNG.
    """
    import io
    import re
    from datetime import datetime

    import matplotlib.pyplot as plt
    from PIL import Image

    # Require the *whole* string to look like a date, and fall back to plain
    # message numbering when a well-shaped string is still not a real date
    # (e.g. "2024-13-45") instead of crashing inside strptime.
    parsed_x = None
    if all(isinstance(d, str) and re.fullmatch(r"\d{4}-\d{2}-\d{2}", d) for d in dates):
        try:
            parsed_x = [datetime.strptime(d, "%Y-%m-%d") for d in dates]
        except ValueError:
            parsed_x = None

    if parsed_x is not None:
        x_labels = [d.strftime("%Y-%m-%d") for d in parsed_x]
    else:
        parsed_x = list(range(1, len(dates) + 1))
        x_labels = [f"Message {i}" for i in parsed_x]

    fig, ax = plt.subplots(figsize=(8, 3))
    try:
        ax.plot(parsed_x, scores, marker='o', linestyle='-', color='darkred', linewidth=2)

        # Print each value just above its marker.
        for x, y in zip(parsed_x, scores):
            ax.text(x, y + 2, f"{int(y)}%", ha='center', fontsize=8, color='black')

        ax.set_xticks(parsed_x)
        ax.set_xticklabels(x_labels)
        ax.set_xlabel("")  # No axis label
        ax.set_ylabel("Abuse Score (%)")
        ax.set_ylim(0, 105)
        ax.grid(True)
        fig.tight_layout()

        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # pyplot keeps every figure alive until it is closed; without this a
        # long-running server accumulates one figure per request.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)
62
-
63
-
64
- # --- Abuse Model ---
65
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
66
-
67
# Multi-label abuse-pattern classifier and its tokenizer, loaded once at import.
model_name = "SamanthaStorm/tether-multilabel-v4"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Label names, zipped positionally with the classifier's per-label scores in
# analyze_single_message.
# NOTE(review): presumably this order matches the model head's output order -
# confirm against the model config.
LABELS = [
    "recovery", "control", "gaslighting", "guilt tripping", "dismissiveness", "blame shifting",
    "nonabusive","projection", "insults", "contradictory statements", "obscure language"
]

# Per-label cut-offs: a label counts as detected when its sigmoid score is
# strictly greater than the (possibly adjusted) threshold. A sigmoid output
# cannot exceed 1.0, so "nonabusive" can never be selected.
THRESHOLDS = {
    "recovery": 0.27,
    "control": 0.47,
    "gaslighting": 0.48,
    "guilt tripping": .56,
    "dismissiveness": 0.25,
    "blame shifting": 0.55,
    "projection": 0.59,
    "insults": 0.33,
    "contradictory statements": 0.27,
    "obscure language": 0.65,
    "nonabusive": 1.0
}

# Per-label weights carried as the third element of each matched-score tuple.
# NOTE(review): compute_abuse_score unpacks that element into `_` and never
# reads it, so these values currently have no effect on the score.
PATTERN_WEIGHTS = {
    "recovery": 0.7,
    "control": 1.4,
    "gaslighting": 1.50,
    "guilt tripping": 0.9,
    "dismissiveness": 0.9,
    "blame shifting": 0.8,
    "projection": 0.5,
    "insults": 1.2,
    "contradictory statements": 1.0,
    "obscure language": 0.9,
    "nonabusive": 0.0
}

# Qualitative escalation risk per pattern.
# NOTE(review): keyed by "recovery phase" while LABELS uses "recovery", and
# no other code in this file reads this dict.
ESCALATION_RISKS = {
    "blame shifting": "low",
    "contradictory statements": "moderate",
    "control": "high",
    "dismissiveness": "moderate",
    "gaslighting": "moderate",
    "guilt tripping": "moderate",
    "insults": "moderate",
    "obscure language": "low",
    "projection": "low",
    "recovery phase": "low"
}
# User-facing text for each risk-stage number (1-4).
RISK_STAGE_LABELS = {
    1: "🌀 Risk Stage: Tension-Building\nThis message reflects rising emotional pressure or subtle control attempts.",
    2: "🔥 Risk Stage: Escalation\nThis message includes direct or aggressive patterns, suggesting active harm.",
    3: "🌧️ Risk Stage: Reconciliation\nThis message reflects a reset attempt—apologies or emotional repair without accountability.",
    4: "🌸 Risk Stage: Calm / Honeymoon\nThis message appears supportive but may follow prior harm, minimizing it."
}

# Safety checklist shown as checkboxes: (question text, weight). The weights
# sum to 29, which is the denominator printed in the escalation summary.
ESCALATION_QUESTIONS = [
    ("Partner has access to firearms or weapons", 4),
    ("Partner threatened to kill you", 3),
    ("Partner threatened you with a weapon", 3),
    ("Partner has ever choked you, even if you considered it consensual at the time", 4),
    ("Partner injured or threatened your pet(s)", 3),
    ("Partner has broken your things, punched or kicked walls, or thrown things ", 2),
    ("Partner forced or coerced you into unwanted sexual acts", 3),
    ("Partner threatened to take away your children", 2),
    ("Violence has increased in frequency or severity", 3),
    ("Partner monitors your calls/GPS/social media", 2)
]
135
def get_emotional_tone_tag(emotions, sentiment, patterns, abuse_score):
    """Pick a single emotional-tone tag for a message, or ``None``.

    The rules are evaluated in order and the first match wins.

    Args:
        emotions: Mapping of emotion name to score; missing emotions count
            as 0.
        sentiment: ``"undermining"`` or ``"supportive"``.
        patterns: Detected abuse-pattern labels. The classifier label
            ``"recovery"`` is treated as the same pattern as
            ``"recovery phase"``, which is the spelling the rules use;
            without that alias the recovery rules never match model output.
        abuse_score: Abuse score in percent (only used by rule 1).

    Returns:
        The tag string, or ``None`` when no rule applies.
    """
    sadness = emotions.get("sadness", 0)
    joy = emotions.get("joy", 0)
    neutral = emotions.get("neutral", 0)
    disgust = emotions.get("disgust", 0)
    anger = emotions.get("anger", 0)
    fear = emotions.get("fear", 0)

    detected = {"recovery phase" if p == "recovery" else p for p in patterns}
    undermining = sentiment == "undermining"

    def flagged(*names):
        """True when at least one of *names* is among the detected patterns."""
        return any(name in detected for name in names)

    # 1. Performative Regret
    if (
        sadness > 0.4
        and flagged("blame shifting", "guilt tripping", "recovery phase")
        and (undermining or abuse_score > 40)
    ):
        return "performative regret"

    # 2. Coercive Warmth
    if (joy > 0.3 or sadness > 0.4) and flagged("control", "gaslighting") and undermining:
        return "coercive warmth"

    # 3. Cold Invalidation
    if (
        (neutral + disgust) > 0.5
        and flagged("dismissiveness", "projection", "obscure language")
        and undermining
    ):
        return "cold invalidation"

    # 4. Genuine Vulnerability: every detected pattern (possibly none) must
    #    be the recovery pattern.
    if (
        (sadness + fear) > 0.5
        and sentiment == "supportive"
        and detected <= {"recovery phase"}
    ):
        return "genuine vulnerability"

    # 5. Emotional Threat
    if (anger + disgust) > 0.5 and flagged("control", "insults", "dismissiveness") and undermining:
        return "emotional threat"

    # 6. Weaponized Sadness
    if sadness > 0.6 and flagged("guilt tripping", "projection") and undermining:
        return "weaponized sadness"

    # 7. Toxic Resignation
    # NOTE(review): with non-negative scores, rule 3 already matches every
    # input that satisfies this rule (and rule 12) - confirm intended order.
    if neutral > 0.5 and flagged("dismissiveness", "obscure language") and undermining:
        return "toxic resignation"

    # 8. Aggressive Dismissal
    if anger > 0.5 and flagged("aggression", "insults", "control") and undermining:
        return "aggressive dismissal"

    # 9. Deflective Hostility
    if (
        (0.2 < anger < 0.7 or 0.2 < disgust < 0.7)
        and flagged("deflection", "projection")
        and undermining
    ):
        return "deflective hostility"

    # 10. Mocking Detachment
    if (neutral + joy) > 0.5 and flagged("mockery", "insults", "projection") and undermining:
        return "mocking detachment"

    # 11. Contradictory Gaslight
    if (
        (joy + anger + sadness) > 0.5
        and flagged("gaslighting", "contradictory statements")
        and undermining
    ):
        return "contradictory gaslight"

    # 12. Calculated Neutrality
    if (
        neutral > 0.6
        and flagged("obscure language", "deflection", "dismissiveness")
        and undermining
    ):
        return "calculated neutrality"

    # 13. Forced Accountability Flip
    if (
        (anger + disgust) > 0.5
        and flagged("blame shifting", "manipulation", "projection")
        and undermining
    ):
        return "forced accountability flip"

    # 14. Conditional Affection
    if joy > 0.4 and flagged("apology baiting", "control", "recovery phase") and undermining:
        return "conditional affection"

    # Second "forced accountability flip" rule, kept in its original position;
    # it differs from rule 13 only by accepting "deflection".
    if (
        (anger + disgust) > 0.5
        and flagged("blame shifting", "projection", "deflection")
        and undermining
    ):
        return "forced accountability flip"

    # Emotional Instability Fallback
    if (anger + sadness + disgust) > 0.6 and undermining:
        return "emotional instability"

    return None
264
- # 🔄 New DARVO score model (regression-based)
265
- from torch.nn.functional import sigmoid
266
- import torch
267
-
268
# Load your trained DARVO regressor from Hugging Face Hub.
# Model and tokenizer are loaded once at import; predict_darvo_score uses both.
darvo_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-darvo-regressor-v1")
darvo_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-darvo-regressor-v1", use_fast=False)
# Put the model in evaluation mode for inference.
darvo_model.eval()
272
-
273
def predict_darvo_score(text):
    """Return the DARVO score for *text*: sigmoid of the regressor's logit, rounded to 4 places."""
    encoded = darvo_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        model_output = darvo_model(**encoded)
    probability = sigmoid(model_output.logits).item()
    return round(probability, 4)  # Rounded for display/output
279
# Single-word weapon terms. Each must start at a word boundary and may carry a
# simple inflection, so "guns" and "shooting" match but "begun", "alarmed",
# "downloaded" and "stable" do not. Common compounds are listed explicitly.
_WEAPON_WORDS = (
    "knife", "knives", "stab", "stabbed", "stabbing", "cutting",
    "gun", "gunned", "gunpoint", "gunshot", "gunfire", "gunman", "gunmen",
    "shotgun", "handgun", "shoot", "rifle", "firearm", "pistol",
    "bomb", "grenade", "explode", "exploding",
    "weapon", "armed", "loaded",
)

# Multi-word phrases only need to *start* at a word boundary, so that
# "cut your ..." and "kill your ..." are still caught.
_WEAPON_PHRASES = ("cut you", "blow up", "kill you", "take you out")

_WEAPON_PATTERN = re.compile(
    r"\b(?:(?:%s)(?:s|es|d|ed|ing)?\b|(?:%s))"
    % (
        "|".join(map(re.escape, _WEAPON_WORDS)),
        "|".join(map(re.escape, _WEAPON_PHRASES)),
    )
)


def detect_weapon_language(text):
    """Return True when *text* mentions a weapon or a violent phrase.

    Matching is case-insensitive and anchored at word starts. The previous
    raw substring test also fired on unrelated words that merely contain a
    keyword ("begun", "alarmed", "downloaded", "stable"), which wrongly
    raised the abuse score and risk stage for harmless messages.
    """
    return _WEAPON_PATTERN.search(text.lower()) is not None
288
def get_risk_stage(patterns, sentiment):
    """Map detected patterns and sentiment to a risk-stage number (1-4).

    Args:
        patterns: Collection of detected pattern labels.
        sentiment: ``"supportive"`` or ``"undermining"``.

    Returns:
        2 when insults are present; otherwise 3 for the recovery pattern;
        otherwise 1 for control or guilt tripping; otherwise 4 for
        supportive sentiment with projection or dismissiveness; otherwise 1.
    """
    if "insults" in patterns:
        return 2
    # The classifier label is "recovery"; "recovery phase" is the older
    # spelling. Accept both so stage 3 is reachable from model output.
    if "recovery phase" in patterns or "recovery" in patterns:
        return 3
    if "control" in patterns or "guilt tripping" in patterns:
        return 1
    if sentiment == "supportive" and (
        "projection" in patterns or "dismissiveness" in patterns
    ):
        return 4
    return 1
298
-
299
def generate_risk_snippet(abuse_score, top_label, escalation_score, stage):
    """Build the user-facing risk summary paragraph.

    Args:
        abuse_score: Abuse score in percent.
        top_label: Leading pattern, either a bare label (``"control"``) or
            ``"label – score"``. Non-string values are shown via ``str()``
            and ``None`` is shown as ``"Unknown"``.
        escalation_score: Numeric escalation score; ``None`` is treated as 0
            instead of raising on comparison.
        stage: Risk-stage number; stage 2 lowers the bar for "moderate".

    Returns:
        A markdown-flavoured string with the risk level, a short explanation
        and the detected pattern.
    """
    import re

    label_text = top_label if isinstance(top_label, str) else None

    # Extract aggression score if aggression is detected
    aggression_score = 0
    if label_text is not None and "aggression" in label_text.lower():
        match = re.search(r"\(?(\d+)\%?\)?", label_text)
        if match:
            aggression_score = int(match.group(1)) / 100

    escalation_value = 0 if escalation_score is None else escalation_score

    # Revised risk logic
    if abuse_score >= 85 or escalation_value >= 16:
        risk_level = "high"
    elif abuse_score >= 60 or escalation_value >= 8 or aggression_score >= 0.25:
        risk_level = "moderate"
    elif stage == 2 and abuse_score >= 40:
        risk_level = "moderate"
    else:
        risk_level = "low"

    if label_text is not None and " – " in label_text:
        # Split on the first separator only; a label containing more than one
        # separator used to raise ValueError during unpacking.
        pattern_label, pattern_score = label_text.split(" – ", 1)
    else:
        pattern_label = str(top_label) if top_label is not None else "Unknown"
        pattern_score = ""

    WHY_FLAGGED = {
        "control": "This message may reflect efforts to restrict someone’s autonomy, even if it's framed as concern or care.",
        "gaslighting": "This message could be manipulating someone into questioning their perception or feelings.",
        "dismissiveness": "This message may include belittling, invalidating, or ignoring the other person’s experience.",
        "insults": "Direct insults often appear in escalating abusive dynamics and can erode emotional safety.",
        "blame shifting": "This message may redirect responsibility to avoid accountability, especially during conflict.",
        "guilt tripping": "This message may induce guilt in order to control or manipulate behavior.",
        "recovery phase": "This message may be part of a tension-reset cycle, appearing kind but avoiding change.",
        "projection": "This message may involve attributing the abuser’s own behaviors to the victim.",
        "contradictory statements": "This message may contain internal contradictions used to confuse, destabilize, or deflect responsibility.",
        "obscure language": "This message may use overly formal, vague, or complex language to obscure meaning or avoid accountability.",
        "default": "This message contains language patterns that may affect safety, clarity, or emotional autonomy."
    }

    # The classifier emits "recovery"; the explanation is stored under
    # "recovery phase".
    lookup_key = pattern_label.strip().lower()
    if lookup_key == "recovery":
        lookup_key = "recovery phase"
    explanation = WHY_FLAGGED.get(lookup_key, WHY_FLAGGED["default"])

    base = f"\n\n🛑 Risk Level: {risk_level.capitalize()}\n"
    base += f"This message shows strong indicators of **{pattern_label}**. "

    if risk_level == "high":
        base += "The language may reflect patterns of emotional control, even when expressed in soft or caring terms.\n"
    elif risk_level == "moderate":
        base += "There are signs of emotional pressure or verbal aggression that may escalate if repeated.\n"
    else:
        base += "The message does not strongly indicate abuse, but it's important to monitor for patterns.\n"

    base += f"\n💡 *Why this might be flagged:*\n{explanation}\n"
    if pattern_score:
        base += f"\nDetected Pattern: **{pattern_label} ({pattern_score})**\n"
    else:
        # No score available: omit the parentheses rather than printing "()".
        base += f"\nDetected Pattern: **{pattern_label}**\n"
    base += "🧠 You can review the pattern in context. This tool highlights possible dynamics—not judgments."
    return base
358
-
359
-
360
# --- Step X: Detect Immediate Danger Threats ---
# Lower-case phrases searched for in each submitted message by
# analyze_composite. Both the message and each phrase are normalised there
# (lower-cased, curly apostrophes straightened, everything except a-z, 0-9
# and spaces removed), so the mix of straight and curly apostrophes below
# does not affect matching.
THREAT_MOTIFS = [
    "i'll kill you", "i’m going to hurt you", "you’re dead", "you won't survive this",
    "i’ll break your face", "i'll bash your head in", "i’ll snap your neck",
    "i’ll come over there and make you shut up", "i'll knock your teeth out",
    "you’re going to bleed", "you want me to hit you?", "i won’t hold back next time",
    "i swear to god i’ll beat you", "next time, i won’t miss", "i’ll make you scream",
    "i know where you live", "i'm outside", "i’ll be waiting", "i saw you with him",
    "you can’t hide from me", "i’m coming to get you", "i'll find you", "i know your schedule",
    "i watched you leave", "i followed you home", "you'll regret this", "you’ll be sorry",
    "you’re going to wish you hadn’t", "you brought this on yourself", "don’t push me",
    "you have no idea what i’m capable of", "you better watch yourself",
    "i don’t care what happens to you anymore", "i’ll make you suffer", "you’ll pay for this",
    "i’ll never let you go", "you’re nothing without me", "if you leave me, i’ll kill myself",
    "i'll ruin you", "i'll tell everyone what you did", "i’ll make sure everyone knows",
    "i’m going to destroy your name", "you’ll lose everyone", "i’ll expose you",
    "your friends will hate you", "i’ll post everything", "you’ll be cancelled",
    "you’ll lose everything", "i’ll take the house", "i’ll drain your account",
    "you’ll never see a dime", "you’ll be broke when i’m done", "i’ll make sure you lose your job",
    "i’ll take your kids", "i’ll make sure you have nothing", "you can’t afford to leave me",
    "don't make me do this", "you know what happens when i’m mad", "you’re forcing my hand",
    "if you just behaved, this wouldn’t happen", "this is your fault",
    "you’re making me hurt you", "i warned you", "you should have listened"
]
384
-
385
-
386
- @spaces.GPU
387
def compute_abuse_score(matched_scores, sentiment):
    """
    Compute abuse score with more conservative adjustments.

    Args:
        matched_scores: Iterable of ``(label, score, weight)`` tuples for the
            patterns that passed their threshold. ``score`` is in 0-1;
            ``weight`` is accepted but not used.
        sentiment: Accepted for interface compatibility; not used.

    Returns:
        A percentage in the range 0.0-100.0, rounded to one decimal place.
        Returns 0.0 when nothing matched.
    """
    if not matched_scores:
        return 0.0

    num_patterns = len(matched_scores)
    highest_score = max(score for _, score, _ in matched_scores)

    # Reduce by 20% for each additional pattern. The factor is clamped at
    # zero: unclamped it turns negative from the seventh pattern on and the
    # function would report a negative percentage.
    damping = max(0.0, 1 - (num_patterns - 1) * 0.2)
    base_score = highest_score * damping * 100

    critical_patterns = {
        'gaslighting': 1.4,
        'guilt tripping': 1.3,
        'blame shifting': 1.2,
        'control': 1.3,
        'insults': 1.1,
        'manipulation': 1.2,
        'love bombing': 1.2,
        'emotional blackmail': 1.4,
        'dismissiveness': 1.1,
        'contradictory statements': 1.1
    }

    # Every confidently detected critical pattern multiplies the score.
    for label, score, _ in matched_scores:
        if label in critical_patterns and score > 0.5:
            base_score *= critical_patterns[label]

    # Combination multipliers
    if num_patterns >= 2:
        base_score *= 1.1
    if num_patterns >= 3:
        base_score *= 1.05

    # High confidence boost
    if any(score > 0.8 for _, score, _ in matched_scores):
        base_score *= 1.05

    # Minimum score when any single pattern is very confident
    if any(score > 0.9 for _, score, _ in matched_scores):
        base_score = max(base_score, 75.0)
    elif any(score > 0.7 for _, score, _ in matched_scores):
        base_score = max(base_score, 60.0)

    return max(0.0, min(round(base_score, 1), 100.0))


def get_sentiment_predictions(texts, sentiment_model, sentiment_tokenizer, batch_size=16):
    """Return the argmax class index for each text, predicted in batches.

    This helper used to be defined inside compute_abuse_score, where it was
    never called and referenced the undefined names ``tqdm`` and ``device``.
    It now lives at module level, iterates without a progress bar and takes
    the target device from the model's own parameters.
    """
    target_device = next(sentiment_model.parameters()).device
    predictions = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = sentiment_tokenizer(
            batch, return_tensors="pt", truncation=True, padding=True, max_length=128
        ).to(target_device)
        with torch.no_grad():
            logits = sentiment_model(**inputs).logits
        predictions.extend(logits.argmax(dim=-1).tolist())
    return predictions
450
@lru_cache(maxsize=1024)
def _analyze_single_message_cached(text, frozen_thresholds):
    """Cached worker behind analyze_single_message.

    ``frozen_thresholds`` is a tuple of ``(label, threshold)`` pairs so the
    arguments are hashable, which ``lru_cache`` requires.
    """
    thresholds = dict(frozen_thresholds)

    print("⚡ ENTERED analyze_single_message")
    stage = 1
    # Result is not used below; the call is retained as in the original.
    _motif_hits, _matched_phrases = detect_motifs(text)

    # Get emotion profile
    emotion_profile = get_emotion_profile(text)
    sentiment_score = emotion_profile.get("anger", 0) + emotion_profile.get("disgust", 0)

    # Get model scores. The original referenced an undefined `device`; use the
    # model's own device and bring the result back to CPU before .numpy().
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).cpu().numpy()

    # Sentiment override: a very neutral tone with control or blame shifting
    # above threshold is still classed as undermining.
    if emotion_profile.get("neutral", 0) > 0.85 and any(
        scores[LABELS.index(l)] > thresholds[l]
        for l in ["control", "blame shifting"]
    ):
        sentiment = "undermining"
    else:
        sentiment = "undermining" if sentiment_score > 0.25 else "supportive"

    weapon_flag = detect_weapon_language(text)

    # Supportive messages need a slightly higher score to be flagged.
    adjusted_thresholds = {k: v + 0.05 if sentiment == "supportive" else v for k, v in thresholds.items()}
    darvo_score = predict_darvo_score(text)

    matched_scores = [
        (label, score, PATTERN_WEIGHTS.get(label, 1.0))
        for label, score in zip(LABELS, scores)
        if score > adjusted_thresholds[label]
    ]
    threshold_labels = [label for label, _, _ in matched_scores]

    # Early exit if nothing passed, but return tone_tag as None
    if not threshold_labels:
        return 0.0, [], [], {"label": sentiment}, 1, 0.0, None

    # Two highest-scoring labels overall, whether or not they passed.
    top_patterns = sorted(
        [(label, score) for label, score in zip(LABELS, scores)],
        key=lambda x: x[1],
        reverse=True
    )[:2]

    # Abuse score
    abuse_score_raw = compute_abuse_score(matched_scores, sentiment)

    # Cap subtle insults to avoid excessive abuse score. This must run after
    # the score is computed; the original ran it first and hit an
    # UnboundLocalError whenever the condition was true.
    if (
        len(threshold_labels) == 1 and "insults" in threshold_labels
        and emotion_profile.get("neutral", 0) > 0.85
    ):
        abuse_score_raw = min(abuse_score_raw, 40)

    # Weapon adjustment
    if weapon_flag:
        abuse_score_raw = min(abuse_score_raw + 25, 100)
        if stage < 2:
            stage = 2

    abuse_score = min(abuse_score_raw, 100 if "control" in threshold_labels else 95)

    # Tone tag
    tone_tag = get_emotional_tone_tag(emotion_profile, sentiment, threshold_labels, abuse_score)

    # Remove recovery tag if tone is fake
    if "recovery" in threshold_labels and tone_tag == "forced accountability flip":
        threshold_labels.remove("recovery")
        top_patterns = [p for p in top_patterns if p[0] != "recovery"]
        print("⚠️ Removing 'recovery' due to undermining sentiment (not genuine repair)")

    # Override profanity/anger for short texts
    profane_words = {"fuck", "fucking", "bitch", "shit", "cunt", "ho", "asshole", "dick", "whore", "slut"}
    tokens = set(text.lower().split())
    has_profane = any(word in tokens for word in profane_words)
    short_text = len(tokens) <= 10
    anger_score = emotion_profile.get("anger", 0)
    if has_profane and anger_score > 0.75 and short_text:
        print("⚠️ Profanity + Anger Override Triggered")
        insult_score = next((s for l, s in top_patterns if l == "insults"), 0)
        if ("insults", insult_score) not in top_patterns:
            top_patterns = [("insults", insult_score)] + top_patterns
        if "insults" not in threshold_labels:
            threshold_labels.append("insults")

    # Debug
    # NOTE(review): this prints the raw message text to the server log -
    # confirm that is acceptable for user-submitted content.
    print(f"Emotional Tone Tag: {tone_tag}")
    print("Emotion Profile:")
    for emotion, score in emotion_profile.items():
        print(f" {emotion.capitalize():10}: {score}")
    print("\n--- Debug Info ---")
    print(f"Text: {text}")
    print(f"Sentiment (via emotion): {sentiment} (score: {round(sentiment_score, 3)})")
    print("Abuse Pattern Scores:")
    for label, score in zip(LABELS, scores):
        passed = "✅" if score > adjusted_thresholds[label] else "❌"
        print(f" {label:25} → {score:.3f} {passed}")
    print(f"Matched for score: {[(l, round(s, 3)) for l, s, _ in matched_scores]}")
    print(f"Abuse Score Raw: {round(abuse_score_raw, 1)}")
    print("------------------\n")

    return abuse_score, threshold_labels, top_patterns, {"label": sentiment}, stage, darvo_score, tone_tag


def analyze_single_message(text, thresholds):
    """Analyse one message and return its abuse metrics.

    Args:
        text: The message to analyse.
        thresholds: Mapping of pattern label to detection threshold.

    Returns:
        A 7-tuple ``(abuse_score, threshold_labels, top_patterns,
        {"label": sentiment}, stage, darvo_score, tone_tag)``. When no
        pattern passes its threshold the result is
        ``(0.0, [], [], {"label": sentiment}, 1, 0.0, None)``.

    Results are memoised per ``(text, thresholds)``. The thresholds dict is
    converted to a sorted tuple first, because passing a dict straight into
    an ``lru_cache``-wrapped function raises ``TypeError: unhashable type``.
    """
    frozen_thresholds = tuple(sorted(thresholds.items()))
    (abuse_score, threshold_labels, top_patterns, sentiment,
     stage, darvo_score, tone_tag) = _analyze_single_message_cached(text, frozen_thresholds)
    # Hand out copies so a caller mutating the result cannot corrupt the cache.
    return (
        abuse_score,
        list(threshold_labels),
        list(top_patterns),
        dict(sentiment),
        stage,
        darvo_score,
        tone_tag,
    )
560
-
561
- import spaces
562
-
563
def _score_escalation_checklist(answers_and_none):
    """Return the summed checklist weight, 0 for "none of the above", or None if left blank."""
    if not answers_and_none:
        return None
    *answers, none_checked = answers_and_none
    if any(answers):
        return sum(
            weight
            for (_, weight), ticked in zip(ESCALATION_QUESTIONS, answers)
            if ticked
        )
    return 0 if none_checked else None


def _normalize_threat_text(text):
    """Lower-case *text*, straighten apostrophes, keep only a-z/0-9/space, collapse whitespace."""
    import re
    import unicodedata

    text = unicodedata.normalize("NFKD", text.lower().strip())
    text = text.replace("’", "'")  # smart to straight
    text = re.sub(r"[^a-z0-9 ]", "", text)
    return " ".join(text.split())


def _detect_threat_motifs(message, motif_list):
    """Return the motifs from *motif_list* found in *message*.

    A motif must begin at the start of a word. The earlier plain substring
    test reported "i'm outside" for text such as "I saw him outside".
    """
    padded_message = " " + _normalize_threat_text(message)
    return [
        motif for motif in motif_list
        if (" " + _normalize_threat_text(motif)) in padded_message
    ]


def _pattern_escalation_risk(predicted_labels):
    """Classify the mix of detected labels as Low / Moderate / High / Critical."""
    high = {'control'}
    moderate = {'gaslighting', 'dismissiveness', 'obscure language', 'insults', 'contradictory statements', 'guilt tripping'}
    # The classifier label is "recovery"; "recovery phase" is kept as well.
    low = {'blame shifting', 'projection', 'recovery phase', 'recovery'}
    counts = {'high': 0, 'moderate': 0, 'low': 0}
    for label in predicted_labels:
        if label in high:
            counts['high'] += 1
        elif label in moderate:
            counts['moderate'] += 1
        elif label in low:
            counts['low'] += 1

    if counts['high'] >= 2 and counts['moderate'] >= 2:
        return "Critical"
    if (counts['high'] >= 2 and counts['moderate'] >= 1) or (counts['moderate'] >= 3) or (counts['high'] >= 1 and counts['moderate'] >= 2):
        return "High"
    if (counts['moderate'] == 2) or (counts['high'] == 1 and counts['moderate'] == 1) or (counts['moderate'] == 1 and counts['low'] >= 2) or (counts['high'] == 1 and sum(counts.values()) == 1):
        return "Moderate"
    return "Low"


@spaces.GPU
def analyze_composite(msg1, msg2, msg3, *answers_and_none):
    """Analyse up to three messages plus the safety checklist.

    Args:
        msg1, msg2, msg3: Message texts; blank or ``None`` entries are skipped.
        *answers_and_none: One boolean per entry of ``ESCALATION_QUESTIONS``
            followed by the "None of the above" boolean.

    Returns:
        ``(report_text, timeline_image)``. When no message was entered the
        result is ``("Please enter at least one message.", None)``.
    """
    escalation_score = _score_escalation_checklist(answers_and_none)

    messages = [msg1, msg2, msg3]
    active = [(m, f"Message {i+1}") for i, m in enumerate(messages) if m and m.strip()]
    if not active:
        return "Please enter at least one message.", None

    # Flag any threat phrases present in the messages. dict.fromkeys removes
    # duplicates while keeping first-seen order, so the report is stable
    # between runs (iterating a set of strings is not).
    flat_threats = list(dict.fromkeys(
        motif
        for message, _ in active
        for motif in _detect_threat_motifs(message, THREAT_MOTIFS)
    ))

    results = [(analyze_single_message(m, THRESHOLDS.copy()), d) for m, d in active]

    abuse_scores = [r[0][0] for r in results]
    stages = [r[0][4] for r in results]
    darvo_scores = [r[0][5] for r in results]
    tone_tags = [r[0][6] for r in results]
    dates_used = [r[1] for r in results]

    predicted_labels = [label for r in results for label, _ in r[0][2]]
    pattern_escalation_risk = _pattern_escalation_risk(predicted_labels)

    checklist_escalation_risk = "Unknown" if escalation_score is None else (
        "Critical" if escalation_score >= 20 else
        "Moderate" if escalation_score >= 10 else
        "Low"
    )

    # Extra escalation points derived from the per-message analysis.
    escalation_bump = 0
    for result, _ in results:
        abuse_score, _, _, _, stage, darvo_score, tone_tag = result
        if darvo_score > 0.65:
            escalation_bump += 3
        if tone_tag in ["forced accountability flip", "emotional threat"]:
            escalation_bump += 2
        if abuse_score > 80:
            escalation_bump += 2
        if stage == 2:
            escalation_bump += 3

    def rank(label):
        return {"Low": 0, "Moderate": 1, "High": 2, "Critical": 3, "Unknown": 0}.get(label, 0)

    combined_score = rank(pattern_escalation_risk) + rank(checklist_escalation_risk) + escalation_bump
    escalation_risk = (
        "Critical" if combined_score >= 6 else
        "High" if combined_score >= 4 else
        "Moderate" if combined_score >= 2 else
        "Low"
    )

    # Build escalation_text and hybrid_score
    if escalation_score is None:
        escalation_text = (
            "🚫 **Escalation Potential: Unknown** (Checklist not completed)\n"
            "⚠️ This section was not completed. Escalation potential is estimated using message data only.\n"
        )
        hybrid_score = 0
    elif escalation_score == 0:
        escalation_text = (
            "✅ **Escalation Checklist Completed:** No danger items reported.\n"
            "🧭 **Escalation potential estimated from detected message patterns only.**\n"
            f"• Pattern Risk: {pattern_escalation_risk}\n"
            f"• Checklist Risk: None reported\n"
            f"• Escalation Bump: +{escalation_bump} (from DARVO, tone, intensity, etc.)"
        )
        hybrid_score = escalation_bump
    else:
        hybrid_score = escalation_score + escalation_bump
        # The denominator is the maximum checklist total (29 for the current
        # questions). Cap the displayed value so it never reads like "35/29".
        max_score = sum(weight for _, weight in ESCALATION_QUESTIONS)
        shown_score = min(hybrid_score, max_score)
        escalation_text = (
            f"📈 **Escalation Potential: {escalation_risk} ({shown_score}/{max_score})**\n"
            "📋 This score combines your safety checklist answers *and* detected high-risk behavior.\n"
            f"• Pattern Risk: {pattern_escalation_risk}\n"
            f"• Checklist Risk: {checklist_escalation_risk}\n"
            f"• Escalation Bump: +{escalation_bump} (from DARVO, tone, intensity, etc.)"
        )

    # Composite Abuse Score: re-score each message from its top patterns and average.
    composite_abuse_scores = []
    for result, _ in results:
        _, _, top_patterns, sentiment, _, _, _ = result
        matched_scores = [(label, score, PATTERN_WEIGHTS.get(label, 1.0)) for label, score in top_patterns]
        composite_abuse_scores.append(compute_abuse_score(matched_scores, sentiment["label"]))
    composite_abuse = int(round(sum(composite_abuse_scores) / len(composite_abuse_scores)))

    most_common_stage = max(set(stages), key=stages.count)
    stage_text = RISK_STAGE_LABELS[most_common_stage]

    # Derive the top label for each message
    top_labels = []
    for result, _ in results:
        threshold_labels = result[1]
        top_patterns = result[2]
        if threshold_labels:
            top_labels.append(threshold_labels[0])
        elif top_patterns:
            top_labels.append(top_patterns[0][0])
        else:
            top_labels.append("none")

    avg_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    darvo_blurb = ""
    if avg_darvo > 0.25:
        level = "moderate" if avg_darvo < 0.65 else "high"
        darvo_blurb = f"\n\n🎭 **DARVO Score: {avg_darvo}** → This indicates a **{level} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."

    out = f"Abuse Intensity: {composite_abuse}%\n"
    out += "📊 This reflects the strength and severity of detected abuse patterns in the message(s).\n\n"
    out += generate_risk_snippet(composite_abuse, top_labels[0], hybrid_score, most_common_stage)
    out += f"\n\n{stage_text}"
    out += darvo_blurb
    out += "\n\n🎭 **Emotional Tones Detected:**\n"
    # Use each message's own "Message N" label so the numbering stays right
    # when an earlier box was left blank.
    for message_label, tone in zip(dates_used, tone_tags):
        out += f"• {message_label}: *{tone or 'none'}*\n"

    # --- Add Immediate Danger Threats section
    if flat_threats:
        out += "\n\n🚨 **Immediate Danger Threats Detected:**\n"
        for t in flat_threats:
            out += f"• \"{t}\"\n"
        out += "\n⚠️ These phrases may indicate an imminent risk to physical safety."
    else:
        out += "\n\n🧩 **Immediate Danger Threats:** None explicitly detected.\n"
        out += "This does *not* rule out risk, but no direct threat phrases were matched."

    timeline_image = generate_abuse_score_chart(dates_used, abuse_scores, top_labels)
    out += "\n\n" + escalation_text
    return out, timeline_image
764
-
765
# Gradio input widgets: three message boxes, one checkbox per checklist
# question, and a final "None of the above" checkbox. analyze_composite
# receives them positionally in exactly this order.
textbox_inputs = [gr.Textbox(label=f"Message {i+1}") for i in range(3)]
quiz_boxes = [gr.Checkbox(label=q) for q, _ in ESCALATION_QUESTIONS]
none_box = gr.Checkbox(label="None of the above")


# ─── FINAL “FORCE LAUNCH” (no guards) ────────────────────────

# The two outputs correspond to the (text, image) pair returned by
# analyze_composite.
demo = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + quiz_boxes + [none_box],
    outputs=[
        gr.Textbox(label="Results"),
        gr.Image(label="Abuse Score Timeline", type="pil")
    ],
    title="Abuse Pattern Detector + Escalation Quiz",
    description=(
        "Enter up to three messages that concern you. "
        "For the most accurate results, include messages from a recent emotionally intense period."
    ),
    flagging_mode="manual"
)
# This single call will start the server and block,
# keeping the container alive on Spaces.
# It runs at import time because there is no `if __name__ == "__main__"` guard.
demo.launch()
789
-
790
-