SamanthaStorm commited on
Commit
161571e
·
verified ·
1 Parent(s): 99e13f9

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -778
app.py DELETED
@@ -1,778 +0,0 @@
1
- import gradio as gr
2
- import torch
3
- import numpy as np
4
- from transformers import pipeline, RobertaForSequenceClassification, RobertaTokenizer
5
- from motif_tagging import detect_motifs
6
- import re
7
- import matplotlib.pyplot as plt
8
- import io
9
- from PIL import Image
10
- from datetime import datetime
11
- from transformers import pipeline as hf_pipeline # prevent name collision with gradio pipeline
12
-
13
def get_emotion_profile(text):
    """Return a dict of lowercase emotion label -> rounded score for *text*.

    Uses the module-level ``emotion_pipeline``. With ``top_k=None`` the
    pipeline wraps its results in a nested list, so unwrap one level first.
    """
    emotions = emotion_pipeline(text)
    # top_k=None returns [[{label, score}, ...]]; unwrap the outer list.
    if isinstance(emotions, list) and isinstance(emotions[0], list):
        emotions = emotions[0]
    return {e['label'].lower(): round(e['score'], 3) for e in emotions}
18
# Emotion model (no retraining needed). top_k=None returns a score for every
# emotion label instead of only the argmax, which get_emotion_profile expects.
emotion_pipeline = hf_pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    truncation=True
)
25
-
26
# --- Timeline Visualization Function ---
def generate_abuse_score_chart(dates, scores, labels):
    """Render a line chart of per-message abuse scores and return it as a PIL Image.

    Args:
        dates: per-message date strings; if every entry looks like YYYY-MM-DD
            they become the x axis, otherwise messages are numbered 1..N.
        scores: per-message abuse scores (0-100).
        labels: per-message pattern labels. NOTE(review): currently unused in
            the plot — confirm whether they were meant to annotate points.
    """
    import matplotlib.pyplot as plt
    import io
    from PIL import Image
    from datetime import datetime
    import re

    # Determine if all entries are valid dates; fall back to message indices.
    if all(re.match(r"\d{4}-\d{2}-\d{2}", d) for d in dates):
        parsed_x = [datetime.strptime(d, "%Y-%m-%d") for d in dates]
        x_labels = [d.strftime("%Y-%m-%d") for d in parsed_x]
    else:
        parsed_x = list(range(1, len(dates) + 1))
        x_labels = [f"Message {i+1}" for i in range(len(dates))]

    fig, ax = plt.subplots(figsize=(8, 3))
    ax.plot(parsed_x, scores, marker='o', linestyle='-', color='darkred', linewidth=2)

    # Annotate each point with its score as a percentage.
    for x, y in zip(parsed_x, scores):
        ax.text(x, y + 2, f"{int(y)}%", ha='center', fontsize=8, color='black')

    ax.set_xticks(parsed_x)
    ax.set_xticklabels(x_labels)
    ax.set_xlabel("")  # No axis label
    ax.set_ylabel("Abuse Score (%)")
    ax.set_ylim(0, 105)  # headroom so the point annotations are not clipped
    ax.grid(True)
    plt.tight_layout()

    # Serialize the figure to PNG in memory and hand it back as a PIL image.
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    return Image.open(buf)
60
-
61
-
62
# --- Abuse Model ---
# Multi-label abuse-pattern classifier; sigmoid over logits gives one
# independent probability per label in LABELS.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "SamanthaStorm/tether-multilabel-v3"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# use_fast=False: slow tokenizer, presumably for parity with training — TODO confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
68
-
69
# Output labels of the multi-label abuse classifier, in model head order.
LABELS = [
    "recovery", "control", "gaslighting", "guilt tripping", "dismissiveness", "blame shifting",
    "nonabusive", "projection", "insults", "contradictory statements", "obscure language"
]

# Per-label decision thresholds; a label fires when its sigmoid score exceeds
# its threshold. "nonabusive" is set above 1.0 so it can never fire.
THRESHOLDS = {
    "recovery": 0.55,
    "control": 0.25,
    "gaslighting": 0.50,
    "guilt tripping": 0.20,
    "dismissiveness": 0.15,
    "blame shifting": 0.30,
    "projection": 0.40,
    "insults": 0.30,
    "contradictory statements": 0.50,
    "obscure language": 0.40,
    "nonabusive": 1.5
}

# Relative weight of each pattern when computing the composite abuse score.
PATTERN_WEIGHTS = {
    "recovery": 0.7,
    "control": 1.4,
    "gaslighting": 1.50,
    "guilt tripping": 1.2,
    "dismissiveness": 0.9,
    "blame shifting": 0.8,
    "projection": 0.5,
    "insults": 1.4,
    "contradictory statements": 1.0,
    "obscure language": 0.9,
    "nonabusive": 0.01
}

# Human-readable descriptions of the four abuse-cycle stages.
RISK_STAGE_LABELS = {
    1: "🌀 Risk Stage: Tension-Building\nThis message reflects rising emotional pressure or subtle control attempts.",
    2: "🔥 Risk Stage: Escalation\nThis message includes direct or aggressive patterns, suggesting active harm.",
    3: "🌧️ Risk Stage: Reconciliation\nThis message reflects a reset attempt—apologies or emotional repair without accountability.",
    4: "🌸 Risk Stage: Calm / Honeymoon\nThis message appears supportive but may follow prior harm, minimizing it."
}

# Safety-checklist questions with their escalation weights (question, weight).
ESCALATION_QUESTIONS = [
    ("Partner has access to firearms or weapons", 4),
    ("Partner threatened to kill you", 3),
    ("Partner threatened you with a weapon", 3),
    ("Partner has ever choked you, even if you considered it consensual at the time", 4),
    ("Partner injured or threatened your pet(s)", 3),
    ("Partner has broken your things, punched or kicked walls, or thrown things ", 2),
    ("Partner forced or coerced you into unwanted sexual acts", 3),
    ("Partner threatened to take away your children", 2),
    ("Violence has increased in frequency or severity", 3),
    ("Partner monitors your calls/GPS/social media", 2)
]

# Pattern labels that contribute to the DARVO (Deny, Attack, Reverse Victim
# and Offender) score.
DARVO_PATTERNS = [
    "blame shifting",            # "You're the reason this happens"
    "projection",                # "You're the abusive one"
    "deflection",                # "This isn't about that"
    "dismissiveness",            # "You're overreacting"
    "insults",                   # Personal attacks that redirect attention
    "aggression",                # Escalates tone to destabilize
    "recovery phase",            # Sudden affection following aggression
    "contradictory statements"   # "I never said that" followed by a version of what they said
]

# Canonical DARVO phrases used for loose substring matching against motifs.
# FIX: the third entry of the second line was mojibake ("You??re") in the
# original file; restored to the intended curly apostrophe.
DARVO_MOTIFS = [
    "I never said that.", "You’re imagining things.", "That never happened.",
    "You’re making a big deal out of nothing.", "It was just a joke.", "You’re too sensitive.",
    "I don’t know what you’re talking about.", "You’re overreacting.", "I didn’t mean it that way.",
    "You’re twisting my words.", "You’re remembering it wrong.", "You’re always looking for something to complain about.",
    "You’re just trying to start a fight.", "I was only trying to help.", "You’re making things up.",
    "You’re blowing this out of proportion.", "You’re being paranoid.", "You’re too emotional.",
    "You’re always so dramatic.", "You’re just trying to make me look bad.",

    "You’re crazy.", "You’re the one with the problem.", "You’re always so negative.",
    "You’re just trying to control me.", "You’re the abusive one.", "You’re trying to ruin my life.",
    "You’re just jealous.", "You’re the one who needs help.", "You’re always playing the victim.",
    "You’re the one causing all the problems.", "You’re just trying to make me feel guilty.",
    "You’re the one who can’t let go of the past.", "You’re the one who’s always angry.",
    "You’re the one who’s always complaining.", "You’re the one who’s always starting arguments.",
    "You’re the one who’s always making things worse.", "You’re the one who’s always making me feel bad.",
    "You’re the one who’s always making me look like the bad guy.",
    "You’re the one who’s always making me feel like a failure.",
    "You’re the one who’s always making me feel like I’m not good enough.",

    "I can’t believe you’re doing this to me.", "You’re hurting me.",
    "You’re making me feel like a terrible person.", "You’re always blaming me for everything.",
    "You’re the one who’s abusive.", "You’re the one who’s controlling.", "You’re the one who’s manipulative.",
    "You’re the one who’s toxic.", "You’re the one who’s gaslighting me.",
    "You’re the one who’s always putting me down.", "You’re the one who’s always making me feel bad.",
    "You’re the one who’s always making me feel like I’m not good enough.",
    "You’re the one who’s always making me feel like I’m the problem.",
    "You’re the one who’s always making me feel like I’m the bad guy.",
    "You’re the one who’s always making me feel like I’m the villain.",
    "You’re the one who’s always making me feel like I’m the one who needs to change.",
    "You’re the one who’s always making me feel like I’m the one who’s wrong.",
    "You’re the one who’s always making me feel like I’m the one who’s crazy.",
    "You’re the one who’s always making me feel like I’m the one who’s abusive.",
    "You’re the one who’s always making me feel like I’m the one who’s toxic."
]
165
- def get_emotional_tone_tag(emotions, sentiment, patterns, abuse_score):
166
- sadness = emotions.get("sadness", 0)
167
- joy = emotions.get("joy", 0)
168
- neutral = emotions.get("neutral", 0)
169
- disgust = emotions.get("disgust", 0)
170
- anger = emotions.get("anger", 0)
171
- fear = emotions.get("fear", 0)
172
- disgust = emotions.get("disgust", 0)
173
-
174
- # 1. Performative Regret
175
- if (
176
- sadness > 0.4 and
177
- any(p in patterns for p in ["blame shifting", "guilt tripping", "recovery phase"]) and
178
- (sentiment == "undermining" or abuse_score > 40)
179
- ):
180
- return "performative regret"
181
-
182
- # 2. Coercive Warmth
183
- if (
184
- (joy > 0.3 or sadness > 0.4) and
185
- any(p in patterns for p in ["control", "gaslighting"]) and
186
- sentiment == "undermining"
187
- ):
188
- return "coercive warmth"
189
-
190
- # 3. Cold Invalidation
191
- if (
192
- (neutral + disgust) > 0.5 and
193
- any(p in patterns for p in ["dismissiveness", "projection", "obscure language"]) and
194
- sentiment == "undermining"
195
- ):
196
- return "cold invalidation"
197
-
198
- # 4. Genuine Vulnerability
199
- if (
200
- (sadness + fear) > 0.5 and
201
- sentiment == "supportive" and
202
- all(p in ["recovery phase"] for p in patterns)
203
- ):
204
- return "genuine vulnerability"
205
-
206
- # 5. Emotional Threat
207
- if (
208
- (anger + disgust) > 0.5 and
209
- any(p in patterns for p in ["control", "threat", "insults", "dismissiveness"]) and
210
- sentiment == "undermining"
211
- ):
212
- return "emotional threat"
213
-
214
- # 6. Weaponized Sadness
215
- if (
216
- sadness > 0.6 and
217
- any(p in patterns for p in ["guilt tripping", "projection"]) and
218
- sentiment == "undermining"
219
- ):
220
- return "weaponized sadness"
221
-
222
- # 7. Toxic Resignation
223
- if (
224
- neutral > 0.5 and
225
- any(p in patterns for p in ["dismissiveness", "obscure language"]) and
226
- sentiment == "undermining"
227
- ):
228
- return "toxic resignation"
229
- # 8. Aggressive Dismissal
230
- if (
231
- anger > 0.5 and
232
- any(p in patterns for p in ["aggression", "insults", "control"]) and
233
- sentiment == "undermining"
234
- ):
235
- return "aggressive dismissal"
236
- # 9. Deflective Hostility
237
- if (
238
- (0.2 < anger < 0.7 or 0.2 < disgust < 0.7) and
239
- any(p in patterns for p in ["deflection", "projection"]) and
240
- sentiment == "undermining"
241
- ):
242
- return "deflective hostility"
243
- # 10. Mocking Detachment
244
- if (
245
- (neutral + joy) > 0.5 and
246
- any(p in patterns for p in ["mockery", "insults", "projection"]) and
247
- sentiment == "undermining"
248
- ):
249
- return "mocking detachment"
250
- # 11. Contradictory Gaslight
251
- if (
252
- (joy + anger + sadness) > 0.5 and
253
- any(p in patterns for p in ["gaslighting", "contradictory statements"]) and
254
- sentiment == "undermining"
255
- ):
256
- return "contradictory gaslight"
257
- # 12. Calculated Neutrality
258
- if (
259
- neutral > 0.6 and
260
- any(p in patterns for p in ["obscure language", "deflection", "dismissiveness"]) and
261
- sentiment == "undermining"
262
- ):
263
- return "calculated neutrality"
264
- # 13. Forced Accountability Flip
265
- if (
266
- (anger + disgust) > 0.5 and
267
- any(p in patterns for p in ["blame shifting", "manipulation", "projection"]) and
268
- sentiment == "undermining"
269
- ):
270
- return "forced accountability flip"
271
- # 14. Conditional Affection
272
- if (
273
- joy > 0.4 and
274
- any(p in patterns for p in ["apology baiting", "control", "recovery phase"]) and
275
- sentiment == "undermining"
276
- ):
277
- return "conditional affection"
278
-
279
- if (
280
- (anger + disgust) > 0.5 and
281
- any(p in patterns for p in ["blame shifting", "projection", "deflection"]) and
282
- sentiment == "undermining"
283
- ):
284
- return "forced accountability flip"
285
-
286
- # Emotional Instability Fallback
287
- if (
288
- (anger + sadness + disgust) > 0.6 and
289
- sentiment == "undermining"
290
- ):
291
- return "emotional instability"
292
-
293
- return None
294
def detect_contradiction(message):
    """Return True when *message* contains a known self-contradicting construction.

    Each pattern pairs an affirming opener ("i love you", "i'm sorry", ...)
    with an undermining follow-up within 15 characters.
    """
    checks = (
        (r"\b(i love you).{0,15}(i hate you|you ruin everything)", re.IGNORECASE),
        (r"\b(i’m sorry).{0,15}(but you|if you hadn’t)", re.IGNORECASE),
        (r"\b(i’m trying).{0,15}(you never|why do you)", re.IGNORECASE),
        (r"\b(do what you want).{0,15}(you’ll regret it|i always give everything)", re.IGNORECASE),
        (r"\b(i don’t care).{0,15}(you never think of me)", re.IGNORECASE),
        (r"\b(i guess i’m just).{0,15}(the bad guy|worthless|never enough)", re.IGNORECASE),
    )
    for pattern, flags in checks:
        if re.search(pattern, message, flags):
            return True
    return False
304
-
305
def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    """Score (0..1) the likelihood of DARVO (Deny, Attack, Reverse Victim & Offender).

    Weighted blend of four signals: DARVO-pattern count (0.3), positive
    sentiment shift (0.3), motif-match ratio (0.25), contradiction (0.15),
    capped at 1.0 and rounded to 3 decimals.
    """
    # Count all detected DARVO-related patterns (case-insensitive membership).
    pattern_hits = sum(1 for p in patterns if p.lower() in DARVO_PATTERNS)

    # Sentiment delta; only an increase contributes.
    sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)

    # Match motifs against DARVO phrases loosely: substring in either direction.
    # Each found motif contributes at most 1 to motif_hits (any() is boolean).
    motif_hits = sum(
        any(phrase.lower() in motif.lower() or motif.lower() in phrase.lower()
            for phrase in DARVO_MOTIFS)
        for motif in motifs_found
    )
    # Normalized by the motif-dictionary size, so a single hit contributes little.
    motif_score = motif_hits / max(len(DARVO_MOTIFS), 1)

    # Contradiction is a binary 0/1 signal.
    contradiction_score = 1.0 if contradiction_flag else 0.0

    # Final DARVO score. Note pattern_hits is unnormalized: two or more
    # pattern hits alone already push the weighted sum past the 1.0 cap.
    return round(min(
        0.3 * pattern_hits +
        0.3 * sentiment_shift_score +
        0.25 * motif_score +
        0.15 * contradiction_score, 1.0
    ), 3)
330
def detect_weapon_language(text):
    """Return True when *text* mentions a weapon or lethal-threat keyword."""
    weapon_terms = (
        "knife", "knives", "stab", "cut you", "cutting",
        "gun", "shoot", "rifle", "firearm", "pistol",
        "bomb", "blow up", "grenade", "explode",
        "weapon", "armed", "loaded", "kill you", "take you out",
    )
    lowered = text.lower()
    for term in weapon_terms:
        if term in lowered:
            return True
    return False
339
def get_risk_stage(patterns, sentiment):
    """Map detected patterns (plus sentiment) to a 1-4 abuse-cycle stage.

    Checks are ordered by severity; the first match decides the stage.
    Defaults to stage 1 (tension-building).
    """
    # Stage 2: escalation — overt aggression markers.
    if "threat" in patterns or "insults" in patterns:
        return 2
    # Stage 3: reconciliation — apology/repair language.
    if "recovery phase" in patterns:
        return 3
    # Stage 1: tension-building — pressure and control attempts.
    if "control" in patterns or "guilt tripping" in patterns:
        return 1
    # Stage 4: calm/honeymoon — supportive surface with residual patterns.
    if sentiment == "supportive" and any(p in patterns for p in ("projection", "dismissiveness")):
        return 4
    return 1
349
-
350
def generate_risk_snippet(abuse_score, top_label, escalation_score, stage):
    """Build the human-readable risk summary for the results panel.

    Args:
        abuse_score: composite abuse intensity, 0-100.
        top_label: strongest pattern string, typically "label – NN%"
            (label and score are split on the " – " en-dash separator).
        escalation_score: checklist/hybrid escalation score.
        stage: abuse-cycle stage (1-4).

    Returns:
        Markdown-formatted summary string.

    FIX: the original repeated the entire tail of this function (WHY_FLAGGED
    through ``return base``) after the first ``return`` — unreachable dead
    code — which has been removed. The bare ``except:`` was narrowed to
    ``except Exception``.
    """
    import re

    # Pull an aggression percentage out of the label, e.g. "aggression (30%)".
    if isinstance(top_label, str) and "aggression" in top_label.lower():
        try:
            match = re.search(r"\(?(\d+)\%?\)?", top_label)
            aggression_score = int(match.group(1)) / 100 if match else 0
        except Exception:  # parse failure just means no aggression signal
            aggression_score = 0
    else:
        aggression_score = 0

    # Risk tiers: high / moderate / low.
    if abuse_score >= 85 or escalation_score >= 16:
        risk_level = "high"
    elif abuse_score >= 60 or escalation_score >= 8 or aggression_score >= 0.25:
        risk_level = "moderate"
    elif stage == 2 and abuse_score >= 40:
        risk_level = "moderate"
    else:
        risk_level = "low"

    # Split "label – NN%" into its parts; maxsplit=1 guards against labels
    # that themselves contain the separator.
    if isinstance(top_label, str) and " – " in top_label:
        pattern_label, pattern_score = top_label.split(" – ", 1)
    else:
        pattern_label = str(top_label) if top_label is not None else "Unknown"
        pattern_score = ""

    # Per-pattern explanation text shown under "Why this might be flagged".
    WHY_FLAGGED = {
        "control": "This message may reflect efforts to restrict someone’s autonomy, even if it's framed as concern or care.",
        "gaslighting": "This message could be manipulating someone into questioning their perception or feelings.",
        "dismissiveness": "This message may include belittling, invalidating, or ignoring the other person’s experience.",
        "insults": "Direct insults often appear in escalating abusive dynamics and can erode emotional safety.",
        "threat": "This message includes threatening language, which is a strong predictor of harm.",
        "blame shifting": "This message may redirect responsibility to avoid accountability, especially during conflict.",
        "guilt tripping": "This message may induce guilt in order to control or manipulate behavior.",
        "recovery phase": "This message may be part of a tension-reset cycle, appearing kind but avoiding change.",
        "projection": "This message may involve attributing the abuser’s own behaviors to the victim.",
        "contradictory statements": "This message may contain internal contradictions used to confuse, destabilize, or deflect responsibility.",
        "obscure language": "This message may use overly formal, vague, or complex language to obscure meaning or avoid accountability.",
        "default": "This message contains language patterns that may affect safety, clarity, or emotional autonomy."
    }

    explanation = WHY_FLAGGED.get(pattern_label.lower(), WHY_FLAGGED["default"])

    base = f"\n\n🛑 Risk Level: {risk_level.capitalize()}\n"
    base += f"This message shows strong indicators of **{pattern_label}**. "

    if risk_level == "high":
        base += "The language may reflect patterns of emotional control, even when expressed in soft or caring terms.\n"
    elif risk_level == "moderate":
        base += "There are signs of emotional pressure or verbal aggression that may escalate if repeated.\n"
    else:
        base += "The message does not strongly indicate abuse, but it's important to monitor for patterns.\n"

    base += f"\n💡 *Why this might be flagged:*\n{explanation}\n"
    base += f"\nDetected Pattern: **{pattern_label} ({pattern_score})**\n"
    base += "🧠 You can review the pattern in context. This tool highlights possible dynamics—not judgments."
    return base
441
def compute_abuse_score(matched_scores, sentiment):
    """Compute a 0-100 abuse score from matched patterns.

    Args:
        matched_scores: list of (label, score, weight) triples for every
            pattern that passed its threshold.
        sentiment: "undermining" or "supportive".

    Returns:
        0 when no patterns matched; otherwise a weighted, scaled, floored
        score capped at 100.
    """
    if not matched_scores:
        return 0

    # Weighted mean of the model scores, expressed as a percentage.
    weighted_sum = 0.0
    total_weight = 0.0
    for _label, score, weight in matched_scores:
        weighted_sum += score * weight
        total_weight += weight
    base = (weighted_sum / total_weight) * 100

    # Co-occurring patterns amplify the score: 1.25x for 2, 1.5x for 3+.
    multiplier = 1.0 + 0.25 * max(0, len(matched_scores) - 1)
    result = base * multiplier

    # Severe patterns impose a minimum score regardless of model confidence.
    severity_floors = {
        "threat": 70,
        "control": 40,
        "gaslighting": 30,
        "insults": 25,
        "aggression": 40,
    }
    result = max(result, max(severity_floors.get(lbl, 0) for lbl, _, _ in matched_scores))

    # Undermining sentiment nudges borderline scores upward.
    if sentiment == "undermining" and result < 50:
        result += 10

    return min(result, 100)
471
-
472
-
473
def analyze_single_message(text, thresholds):
    """Run the full abuse analysis pipeline on one message.

    Args:
        text: the message to analyze.
        thresholds: per-label decision thresholds (a copy of THRESHOLDS).

    Returns:
        7-tuple: (abuse_score, threshold_labels, top_patterns,
        {"label": sentiment}, stage, darvo_score, tone_tag).
    """
    motif_hits, matched_phrases = detect_motifs(text)

    # Emotion profile drives the sentiment heuristic (anger + disgust mass).
    emotion_profile = get_emotion_profile(text)
    sentiment_score = emotion_profile.get("anger", 0) + emotion_profile.get("disgust", 0)

    # Multi-label pattern scores from the fine-tuned classifier.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

    # Sentiment override: very-neutral messages that still trip critical
    # pattern thresholds are treated as undermining.
    # FIX: "threat" is not in LABELS, so LABELS.index("threat") raised
    # ValueError whenever "control" didn't short-circuit the any(); guard
    # each label with a membership check.
    if emotion_profile.get("neutral", 0) > 0.85 and any(
        scores[LABELS.index(l)] > thresholds[l]
        for l in ["control", "threat", "blame shifting"]
        if l in LABELS and l in thresholds
    ):
        sentiment = "undermining"
    else:
        sentiment = "undermining" if sentiment_score > 0.25 else "supportive"

    weapon_flag = detect_weapon_language(text)

    # Supportive-sounding messages get slightly stricter thresholds.
    adjusted_thresholds = {
        k: v + 0.05 if sentiment == "supportive" else v
        for k, v in thresholds.items()
    }

    contradiction_flag = detect_contradiction(text)

    threshold_labels = [
        label for label, score in zip(LABELS, scores)
        if score > adjusted_thresholds[label]
    ]
    # Provisional tone tag (abuse score not known yet; re-tagged below).
    tone_tag = get_emotional_tone_tag(emotion_profile, sentiment, threshold_labels, 0)
    motifs = [phrase for _, phrase in matched_phrases]

    darvo_score = calculate_darvo_score(
        threshold_labels,
        sentiment_before=0.0,
        sentiment_after=sentiment_score,
        motifs_found=motifs,
        contradiction_flag=contradiction_flag
    )

    # Two highest-scoring patterns, regardless of thresholds.
    top_patterns = sorted(
        [(label, score) for label, score in zip(LABELS, scores)],
        key=lambda x: x[1],
        reverse=True
    )[:2]

    # Post-threshold validation: strip "recovery" when the tone suggests it
    # is not genuine repair.
    if "recovery" in threshold_labels and tone_tag == "forced accountability flip":
        threshold_labels.remove("recovery")
        top_patterns = [p for p in top_patterns if p[0] != "recovery"]
        print("⚠️ Removing 'recovery' due to undermining sentiment (not genuine repair)")

    matched_scores = [
        (label, score, PATTERN_WEIGHTS.get(label, 1.0))
        for label, score in zip(LABELS, scores)
        if score > adjusted_thresholds[label]
    ]

    abuse_score_raw = compute_abuse_score(matched_scores, sentiment)

    # Risk stage; weapon language forces at least stage 2 and boosts the score.
    stage = get_risk_stage(threshold_labels, sentiment) if threshold_labels else 1
    if weapon_flag and stage < 2:
        stage = 2
    if weapon_flag:
        abuse_score_raw = min(abuse_score_raw + 25, 100)

    # Cap at 95 unless a critical pattern ("threat"/"control") fired.
    abuse_score = min(
        abuse_score_raw,
        100 if "threat" in threshold_labels or "control" in threshold_labels else 95
    )

    # Final tone tag, now that the abuse score is known.
    tone_tag = get_emotional_tone_tag(emotion_profile, sentiment, threshold_labels, abuse_score)

    # ---- Profanity + Anger Override Logic ----
    # Short, angry, profane messages are forced to surface "insults" first.
    profane_words = {"fuck", "fucking", "bitch", "shit", "cunt", "ho", "asshole", "dick", "whore", "slut"}
    tokens = set(text.lower().split())
    has_profane = any(word in tokens for word in profane_words)

    # FIX: emotion labels are lowercased by get_emotion_profile, so the old
    # lookup emotion_profile.get("Anger", 0) always returned 0 and the
    # override below could never trigger.
    anger_score = emotion_profile.get("anger", 0)
    short_text = len(tokens) <= 10
    insult_score = next((s for l, s in top_patterns if l == "insults"), 0)

    if has_profane and anger_score > 0.75 and short_text:
        print("⚠️ Profanity + Anger Override Triggered")
        top_patterns = sorted(top_patterns, key=lambda x: x[1], reverse=True)
        if top_patterns[0][0] != "insults":
            top_patterns.insert(0, ("insults", insult_score))
        if "insults" not in threshold_labels:
            threshold_labels.append("insults")
        top_patterns = [("insults", insult_score)] + [p for p in top_patterns if p[0] != "insults"]

    # Debug output (FIX: the tone-tag line was printed twice in the original).
    print(f"Emotional Tone Tag: {tone_tag}")
    print("Emotion Profile:")
    for emotion, score in emotion_profile.items():
        print(f"  {emotion.capitalize():10}: {score}")
    print("\n--- Debug Info ---")
    print(f"Text: {text}")
    print(f"Sentiment (via emotion): {sentiment} (score: {round(sentiment_score, 3)})")
    print("Abuse Pattern Scores:")
    for label, score in zip(LABELS, scores):
        passed = "✅" if score > adjusted_thresholds[label] else "❌"
        print(f"  {label:25} → {score:.3f} {passed}")
    print(f"Matched for score: {[(l, round(s, 3)) for l, s, _ in matched_scores]}")
    print(f"Abuse Score Raw: {round(abuse_score_raw, 1)}")
    print(f"Motifs: {motifs}")
    print(f"Contradiction: {contradiction_flag}")
    print("------------------\n")

    return abuse_score, threshold_labels, top_patterns, {"label": sentiment}, stage, darvo_score, tone_tag
592
-
593
def analyze_composite(msg1, date1, msg2, date2, msg3, date3, *answers_and_none):
    """Gradio handler: analyze up to three messages plus the escalation checklist.

    Args:
        msg1..msg3 / date1..date3: message texts and optional YYYY-MM-DD dates.
        *answers_and_none: the checklist checkbox booleans, in
            ESCALATION_QUESTIONS order, followed by the "None of the above" box.

    Returns:
        (markdown summary string, PIL timeline image).
        NOTE(review): when no message is entered, only a single string is
        returned although the Interface declares two outputs — confirm Gradio
        tolerates this.
    """
    # The last varargs entry is the "None of the above" checkbox.
    none_selected_checked = answers_and_none[-1]
    responses_checked = any(answers_and_none[:-1])
    none_selected = not responses_checked and none_selected_checked

    if none_selected:
        escalation_score = None
        risk_level = "unknown"
    else:
        # Sum the weights of every checked checklist item.
        escalation_score = sum(w for (_, w), a in zip(ESCALATION_QUESTIONS, answers_and_none[:-1]) if a)

    messages = [msg1, msg2, msg3]
    dates = [date1, date2, date3]
    active = [(m, d) for m, d in zip(messages, dates) if m.strip()]
    if not active:
        return "Please enter at least one message."

    # Run the per-message model on each non-empty message.
    results = [(analyze_single_message(m, THRESHOLDS.copy()), d) for m, d in active]

    # --- Combined Abuse Escalation Scoring ---
    # Collect the top-pattern labels predicted across all messages.
    predicted_labels = [label for r in results for label, _ in r[0][2]]

    # Categorize labels by severity tier.
    high = {'control'}
    moderate = {
        'gaslighting', 'dismissiveness', 'obscure language',
        'insults', 'contradictory statements', 'guilt tripping'
    }
    low = {'blame shifting', 'projection', 'recovery phase'}

    # Count how many labels fall into each tier.
    counts = {'high': 0, 'moderate': 0, 'low': 0}
    for label in predicted_labels:
        if label in high:
            counts['high'] += 1
        elif label in moderate:
            counts['moderate'] += 1
        elif label in low:
            counts['low'] += 1

    # Derive an overall abuse risk from tier combinations.
    if counts['high'] >= 2 and counts['moderate'] >= 2:
        abuse_risk = 'Critical'
    elif (counts['high'] >= 2 and counts['moderate'] >= 1) or (counts['moderate'] >= 3) or (counts['high'] >= 1 and counts['moderate'] >= 2):
        abuse_risk = 'High'
    elif (counts['moderate'] == 2) or (counts['high'] == 1 and counts['moderate'] == 1) or (counts['moderate'] == 1 and counts['low'] >= 2) or (counts['high'] == 1 and sum(counts.values()) == 1):
        abuse_risk = 'Moderate'
    else:
        abuse_risk = 'Low'

    # Combine abuse_risk and checklist score into a provisional risk level.
    # NOTE(review): this value is unconditionally overwritten by the
    # hybrid-score computation further down — confirm which is intended.
    if escalation_score is not None:
        if escalation_score >= 8 or abuse_risk == 'Critical':
            risk_level = 'Critical'
        elif escalation_score >= 5 or abuse_risk == 'High':
            risk_level = 'High'
        elif escalation_score >= 2 or abuse_risk == 'Moderate':
            risk_level = 'Moderate'
        else:
            risk_level = 'Low'

    # Unpack per-message results into parallel lists.
    abuse_scores = [r[0][0] for r in results]
    top_labels = [r[0][1][0] if r[0][1] else r[0][2][0][0] for r in results]
    top_scores = [r[0][2][0][1] for r in results]
    sentiments = [r[0][3]['label'] for r in results]
    stages = [r[0][4] for r in results]
    darvo_scores = [r[0][5] for r in results]
    tone_tags = [r[0][6] for r in results]
    dates_used = [r[1] or "Undated" for r in results]  # Store dates for future mapping

    # Calculate escalation bump *after* model results exist.
    escalation_bump = 0
    for result, _ in results:
        abuse_score, threshold_labels, top_patterns, sentiment, stage, darvo_score, tone_tag = result
        if darvo_score > 0.65:
            escalation_bump += 3
        if tone_tag in ["forced accountability flip", "emotional threat"]:
            escalation_bump += 2
        if abuse_score > 80:
            escalation_bump += 2
        if stage == 2:
            escalation_bump += 3

    # Hybrid score = checklist score + model-derived bump (0 when checklist skipped).
    hybrid_score = escalation_score + escalation_bump if escalation_score is not None else 0
    risk_level = (
        "High" if hybrid_score >= 16 else
        "Moderate" if hybrid_score >= 8 else
        "Low"
    )

    # NOTE(review): abuse_scores and stages are recomputed here although they
    # were already built above — redundant but harmless.
    abuse_scores = [r[0][0] for r in results]
    stages = [r[0][4] for r in results]

    # Post-check override: stage 2 or a high abuse score forces at least Moderate.
    if any(score > 70 for score in abuse_scores) or any(stage == 2 for stage in stages):
        if risk_level == "Low":
            risk_level = "Moderate"

    # Sanity check on the per-message tuple shape.
    for result, date in results:
        assert len(result) == 7, "Unexpected output from analyze_single_message"

    # --- Composite Abuse Score using compute_abuse_score ---
    composite_abuse_scores = []
    for result, _ in results:
        _, _, top_patterns, sentiment, _, _, _ = result
        matched_scores = [(label, score, PATTERN_WEIGHTS.get(label, 1.0)) for label, score in top_patterns]
        final_score = compute_abuse_score(matched_scores, sentiment["label"])
        composite_abuse_scores.append(final_score)

    composite_abuse = int(round(sum(composite_abuse_scores) / len(composite_abuse_scores)))

    # Most frequent stage across messages drives the stage blurb.
    most_common_stage = max(set(stages), key=stages.count)
    stage_text = RISK_STAGE_LABELS[most_common_stage]

    avg_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    darvo_blurb = ""
    if avg_darvo > 0.25:
        level = "moderate" if avg_darvo < 0.65 else "high"
        darvo_blurb = f"\n\n🎭 **DARVO Score: {avg_darvo}** → This indicates a **{level} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."

    out = f"Abuse Intensity: {composite_abuse}%\n"
    out += "📊 This reflects the strength and severity of detected abuse patterns in the message(s).\n\n"

    # Escalation text is built here but appended at the very end of the output.
    if escalation_score is None:
        escalation_text = "📉 Escalation Potential: Unknown (Checklist not completed)\n"
        escalation_text += "⚠️ *This section was not completed. Escalation potential is unknown.*\n"
        hybrid_score = 0  # fallback so it's defined for generate_risk_snippet
    else:
        escalation_text = f"🧨 **Escalation Potential: {risk_level} ({escalation_score}/{sum(w for _, w in ESCALATION_QUESTIONS)})**\n"
        escalation_text += "This score comes directly from the safety checklist and functions as a standalone escalation risk score.\n"
        escalation_text += "It indicates how many serious risk factors are present based on your answers to the safety checklist.\n"

    # Derive top_label from the strongest pattern across all messages.
    top_label = None
    if results:
        sorted_patterns = sorted(
            [(label, score) for r in results for label, score in r[0][2]],
            key=lambda x: x[1],
            reverse=True
        )
        if sorted_patterns:
            top_label = f"{sorted_patterns[0][0]} – {int(round(sorted_patterns[0][1] * 100))}%"
    if top_label is None:
        top_label = "Unknown – 0%"

    out += generate_risk_snippet(composite_abuse, top_label, hybrid_score if escalation_score is not None else 0, most_common_stage)
    out += f"\n\n{stage_text}"
    out += darvo_blurb
    out += "\n\n🎭 **Emotional Tones Detected:**\n"
    for i, tone in enumerate(tone_tags):
        label = tone if tone else "none"
        out += f"• Message {i+1}: *{label}*\n"
    print(f"DEBUG: avg_darvo = {avg_darvo}")

    # Timeline image uses the top label of each message.
    pattern_labels = [r[0][2][0][0] for r in results]
    timeline_image = generate_abuse_score_chart(dates_used, abuse_scores, pattern_labels)
    out += "\n\n" + escalation_text
    return out, timeline_image
754
-
755
# Three (message, optional date) textbox pairs for the UI.
message_date_pairs = [
    (
        gr.Textbox(label=f"Message {i+1}"),
        gr.Textbox(label=f"Date {i+1} (optional)", placeholder="YYYY-MM-DD")
    )
    for i in range(3)
]
# Flatten to the msg1, date1, msg2, date2, msg3, date3 order that
# analyze_composite expects.
textbox_inputs = [item for pair in message_date_pairs for item in pair]
# One checkbox per escalation question, followed by the opt-out box; these
# arrive in analyze_composite's *answers_and_none in the same order.
quiz_boxes = [gr.Checkbox(label=q) for q, _ in ESCALATION_QUESTIONS]
none_box = gr.Checkbox(label="None of the above")

iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + quiz_boxes + [none_box],
    outputs=[
        gr.Textbox(label="Results"),
        gr.Image(label="Abuse Score Timeline", type="pil")
    ],
    title="Abuse Pattern Detector + Escalation Quiz",
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases
    # (replaced by flagging_mode) — confirm against the pinned version.
    allow_flagging="manual"
)

if __name__ == "__main__":
    iface.launch()