SamanthaStorm committed
Commit 6153eb8 · verified · 1 parent: 9133d54

Update app.py

Files changed (1): app.py (+15 -55)
app.py CHANGED
@@ -4,6 +4,7 @@ import numpy as np
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from transformers import RobertaForSequenceClassification, RobertaTokenizer
 from motif_tagging import detect_motifs
+import re
 
 # custom fine-tuned sentiment model
 sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
@@ -51,8 +52,8 @@ EXPLANATIONS = {
 }
 
 PATTERN_WEIGHTS = {
-    "gaslighting": 1.3, "mockery": 1.2, "control": 1.2, "dismissiveness": 0.8, "blame_shifting": 0.8,
-    "contradictory_statements": 0.75,
+    "gaslighting": 1.3, "control": 1.2, "dismissiveness": 0.8, "blame shifting": 0.8,
+    "contradictory statements": 0.75
 }
 
 # --- DARVO Detection Tools ---
@@ -64,39 +65,29 @@ DARVO_MOTIFS = [
     "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
     "you’re attacking me", "i’m done trying", "i’m the only one who cares"
 ]
-import re
 
 def detect_contradiction(message):
     contradiction_flag = False
     contradiction_phrases = [
-        # Emotional flip-flops
         (r"\b(i love you).{0,15}(i hate you|you ruin everything)", re.IGNORECASE),
         (r"\b(i’m sorry).{0,15}(but you|if you hadn’t)", re.IGNORECASE),
         (r"\b(i’m trying).{0,15}(you never|why do you)", re.IGNORECASE),
-        # Control + helplessness
         (r"\b(do what you want).{0,15}(you’ll regret it|i always give everything)", re.IGNORECASE),
         (r"\b(i don’t care).{0,15}(you never think of me)", re.IGNORECASE),
-        # Passive aggression or self-victimization switch
         (r"\b(i guess i’m just).{0,15}(the bad guy|worthless|never enough)", re.IGNORECASE),
     ]
-
     for pattern, flags in contradiction_phrases:
         if re.search(pattern, message, flags):
             contradiction_flag = True
             break
-
     return contradiction_flag
-contradiction_flag = detect_contradiction(text)
-
+
 def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
     pattern_hits = len([p.lower() for p in patterns if p.lower() in DARVO_PATTERNS])
     pattern_score = pattern_hits / len(DARVO_PATTERNS)
-
     sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)
-
     motif_hits = len([m.lower() for m in motifs_found if m.lower() in DARVO_MOTIFS])
     motif_score = motif_hits / len(DARVO_MOTIFS)
-
     contradiction_score = 1.0 if contradiction_flag else 0.0
     darvo_score = (
         0.3 * pattern_score +
@@ -104,7 +95,6 @@ def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_fo
         0.2 * motif_score +
         0.2 * contradiction_score
     )
-
     return round(min(darvo_score, 1.0), 3)
 
 def custom_sentiment(text):
@@ -122,40 +112,23 @@ def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1
     base_score *= flag_multiplier
     return min(base_score, 100.0)
 
-def interpret_abuse_level(score):
-    if score > 80:
-        return "Extreme / High Risk"
-    elif score > 60:
-        return "Severe / Harmful Pattern Present"
-    elif score > 40:
-        return "Likely Abuse"
-    elif score > 20:
-        return "Mild Concern"
-    return "Very Low / Likely Safe"
-
 def analyze_single_message(text, thresholds, motif_flags):
     motif_hits, matched_phrases = detect_motifs(text)
     sentiment = custom_sentiment(text)
     sentiment_score = sentiment["score"] if sentiment["label"] == "undermining" else 0.0
-
     adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()} if sentiment['label'] == "undermining" else thresholds.copy()
-
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     with torch.no_grad():
         outputs = model(**inputs)
     scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
-
     threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
     phrase_labels = [label for label, _ in matched_phrases]
     pattern_labels_used = list(set(threshold_labels + phrase_labels))
-
     abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
-
     top_patterns = sorted([(label, score) for label, score in zip(LABELS, scores)], key=lambda x: x[1], reverse=True)[:2]
-
     motif_phrases = [text for _, text in matched_phrases]
-    darvo_score = calculate_darvo_score(pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag=False)
-
+    contradiction_flag = detect_contradiction(text)
+    darvo_score = calculate_darvo_score(pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag)
     return abuse_level, pattern_labels_used, top_patterns, darvo_score
 
 def analyze_composite(msg1, msg2, msg3, flags):
@@ -164,49 +137,36 @@ def analyze_composite(msg1, msg2, msg3, flags):
     active_messages = [m for m in messages if m.strip()]
     if not active_messages:
         return "Please enter at least one message."
-
     results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
     abuse_scores = [r[0] for r in results]
     darvo_scores = [r[3] for r in results]
     average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
-    print(f"Average DARVO Score: {average_darvo}")
-
     base_score = sum(abuse_scores) / len(abuse_scores)
     label_sets = [[label for label, _ in r[2]] for r in results]
     label_counts = {label: sum(label in s for s in label_sets) for label in set().union(*label_sets)}
     top_label = max(label_counts.items(), key=lambda x: x[1])
     top_explanation = EXPLANATIONS.get(top_label[0], "")
-
-    # Adjust flag-based weight relative to number of messages
     danger_weight = 5
     flag_weights = {
-        "They've threatened harm": 6,
-        "They isolate me": 5,
-        "I’ve changed my behavior out of fear": 4,
-        "They monitor/follow me": 4,
-        "I feel unsafe when alone with them": 6
-    }
+        "They've threatened harm": 6,
+        "They isolate me": 5,
+        "I’ve changed my behavior out of fear": 4,
+        "They monitor/follow me": 4,
+        "I feel unsafe when alone with them": 6
+    }
     flag_boost = sum(flag_weights.get(f, 3) for f in flags) / len(active_messages)
     composite_score = min(base_score + flag_boost, 100)
-    # Apply message count dampening AFTER base and flag boost
     if len(active_messages) == 1:
-        composite_score *= 0.85  # 15% reduction for 1 message
+        composite_score *= 0.85
     elif len(active_messages) == 2:
-        composite_score *= 0.93  # 7% reduction for 2 messages
-
-    composite_score = round(min(composite_score, 100), 2)  # re-cap just in case
-
-    # Include pattern explanations
+        composite_score *= 0.93
+    composite_score = round(min(composite_score, 100), 2)
     result = f"These messages show a pattern of **{top_label[0]}** and are estimated to be {composite_score}% likely abusive."
-
     if top_explanation:
         result += f"\n• {top_explanation}"
-
-    # Show DARVO score
     if average_darvo > 0.25:
         darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
         result += f"\n\nDARVO Score: {average_darvo} → This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
-
     return result
 
 textbox_inputs = [
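
A note on the contradiction check above: each regex pairs an affirming phrase with an undermining phrase that follows within 15 characters. A minimal self-contained sketch of that behavior; the helper name and test strings are invented, and only two of the six patterns are included:

import re

# Two of the contradiction patterns from detect_contradiction: an affirming
# phrase followed within 15 characters by an undermining one.
CONTRADICTION_PATTERNS = [
    (r"\b(i love you).{0,15}(i hate you|you ruin everything)", re.IGNORECASE),
    (r"\b(i’m sorry).{0,15}(but you|if you hadn’t)", re.IGNORECASE),
]

def has_contradiction(message):  # hypothetical stand-in for detect_contradiction
    return any(re.search(pattern, message, flags)
               for pattern, flags in CONTRADICTION_PATTERNS)

print(has_contradiction("i love you but i hate you"))  # True
print(has_contradiction("i love you"))                 # False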
 
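The DARVO score itself is a weighted blend of four component scores, capped at 1.0. A worked example with invented component values; note the 0.3 weight on the sentiment-shift term is an assumption, since that line falls between the two hunks shown above, chosen so the four weights sum to 1.0:

# Illustrative component values; in app.py these come from pattern matches,
# the sentiment shift, motif hits, and detect_contradiction().
pattern_score = 2 / 6         # e.g. 2 of 6 DARVO patterns matched (counts assumed)
sentiment_shift_score = 0.4   # max(0.0, sentiment_after - sentiment_before)
motif_score = 1 / 20          # e.g. 1 of 20 DARVO motifs matched (counts assumed)
contradiction_score = 1.0     # detect_contradiction() fired

darvo_score = (
    0.3 * pattern_score +
    0.3 * sentiment_shift_score +  # assumed weight, not visible in the diff
    0.2 * motif_score +
    0.2 * contradiction_score
)
print(round(min(darvo_score, 1.0), 3))  # 0.43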
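In analyze_single_message, an "undermining" sentiment label lowers every per-label threshold by 20%, so abuse patterns trip more easily in hostile contexts. A sketch with made-up threshold values:

# Hypothetical per-label thresholds; the real values live elsewhere in app.py.
thresholds = {"gaslighting": 0.25, "control": 0.30, "dismissiveness": 0.40}
sentiment_label = "undermining"  # assumed output of custom_sentiment()

adjusted_thresholds = (
    {k: v * 0.8 for k, v in thresholds.items()}
    if sentiment_label == "undermining"
    else thresholds.copy()
)
print(adjusted_thresholds["gaslighting"])  # 0.2 (was 0.25)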
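In analyze_composite, checked danger flags add a boost scaled by message count, and the result is then dampened for short runs (15% off for one message, 7% for two), presumably because fewer messages give weaker evidence. A sketch with invented inputs:

# Invented inputs: one message, one checked danger flag.
base_score = 55.0
flag_weights = {"They've threatened harm": 6, "They isolate me": 5}
flags = ["They've threatened harm"]
n_messages = 1

flag_boost = sum(flag_weights.get(f, 3) for f in flags) / n_messages  # 6.0
composite_score = min(base_score + flag_boost, 100)                   # 61.0

# Dampening multipliers from the diff above.
if n_messages == 1:
    composite_score *= 0.85
elif n_messages == 2:
    composite_score *= 0.93
print(round(min(composite_score, 100), 2))  # 51.85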