SamanthaStorm committed on
Commit
1e3558a
·
verified ·
1 Parent(s): 09ebcdf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -5
app.py CHANGED
@@ -55,6 +55,35 @@ PATTERN_WEIGHTS = {
55
  "contradictory_statements": 0.75,
56
  }
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  def custom_sentiment(text):
59
  inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
60
  with torch.no_grad():
@@ -84,17 +113,27 @@ def interpret_abuse_level(score):
84
  def analyze_single_message(text, thresholds, motif_flags):
85
  motif_hits, matched_phrases = detect_motifs(text)
86
  sentiment = custom_sentiment(text)
 
 
87
  adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()} if sentiment['label'] == "undermining" else thresholds.copy()
 
88
  inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
89
  with torch.no_grad():
90
  outputs = model(**inputs)
91
  scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
 
92
  threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
93
  phrase_labels = [label for label, _ in matched_phrases]
94
  pattern_labels_used = list(set(threshold_labels + phrase_labels))
 
95
  abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
 
96
  top_patterns = sorted([(label, score) for label, score in zip(LABELS, scores)], key=lambda x: x[1], reverse=True)[:2]
97
- return abuse_level, pattern_labels_used, top_patterns
 
 
 
 
98
 
99
  def analyze_composite(msg1, msg2, msg3, flags):
100
  thresholds = THRESHOLDS
@@ -104,7 +143,10 @@ def analyze_composite(msg1, msg2, msg3, flags):
104
  return "Please enter at least one message."
105
 
106
  results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
107
- abuse_scores = [r[0] for r in results]
 
 
 
108
 
109
  base_score = sum(abuse_scores) / len(abuse_scores)
110
  label_sets = [[label for label, _ in r[2]] for r in results]
@@ -131,11 +173,19 @@ def analyze_composite(msg1, msg2, msg3, flags):
131
 
132
  composite_score = round(min(composite_score, 100), 2) # re-cap just in case
133
 
134
- result = f"These messages show patterns of {', '.join(label for label, _ in top_labels)} and are estimated to be {composite_score}% likely abusive."
 
 
135
  for expl in top_explanations:
136
  if expl:
137
- result += f"\n• {expl}"
138
- return result
 
 
 
 
 
 
139
 
140
  textbox_inputs = [
141
  gr.Textbox(label="Message 1"),
 
55
  "contradictory_statements": 0.75,
56
  }
57
 
# --- DARVO Detection Tools ---
# Pattern labels that count toward the DARVO pattern component.
DARVO_PATTERNS = {
    "blame shifting", "projection", "mockery", "dismissiveness", "deflection", "guilt tripping"
}
# Phrases that count toward the DARVO motif component (written with curly apostrophes).
DARVO_MOTIFS = [
    "i guess i’m the bad guy", "after everything i’ve done", "you always twist everything",
    "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
    "you’re attacking me", "i’m done trying", "i’m the only one who cares"
]


def _normalize_darvo_text(value):
    """Return *value* lower-cased with apostrophes, underscores and spacing unified."""
    lowered = (
        str(value)
        .lower()
        .replace("\u2019", "'")
        .replace("\u2018", "'")
        .replace("_", " ")
    )
    return " ".join(lowered.split())


def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    """Combine four signals into a single DARVO score between 0.0 and 1.0.

    The score is a weighted sum of:
      * 0.3 x the share of ``DARVO_PATTERNS`` present in ``patterns``
      * 0.3 x the rise from ``sentiment_before`` to ``sentiment_after``
      * 0.2 x the share of ``DARVO_MOTIFS`` present in ``motifs_found``
      * 0.2 x 1.0 when ``contradiction_flag`` is truthy, else 0.0

    Args:
        patterns: Iterable of pattern labels. Matching ignores case, treats
            underscores as spaces and ignores surrounding/repeated whitespace.
        sentiment_before: Numeric sentiment value used as the baseline.
        sentiment_after: Numeric sentiment value compared to the baseline.
        motifs_found: Iterable of phrases. Matching ignores case and treats
            straight and curly apostrophes as the same character.
        contradiction_flag: Whether a contradiction was detected.

    Returns:
        The weighted score, capped at 1.0 and rounded to three decimals.
    """
    pattern_keys = {_normalize_darvo_text(p) for p in DARVO_PATTERNS}
    motif_keys = {_normalize_darvo_text(m) for m in DARVO_MOTIFS}

    # Count distinct hits so a repeated label or phrase cannot push a
    # component past its intended 0..1 range.
    pattern_hits = {_normalize_darvo_text(p) for p in (patterns or ())} & pattern_keys
    motif_hits = {_normalize_darvo_text(m) for m in (motifs_found or ())} & motif_keys

    # Guard the divisions so empty constant tables yield 0.0 instead of raising.
    pattern_score = len(pattern_hits) / len(pattern_keys) if pattern_keys else 0.0
    motif_score = len(motif_hits) / len(motif_keys) if motif_keys else 0.0

    # Only an increase counts; clamp to 1.0 so this term keeps its 0.3 weight.
    sentiment_shift_score = min(1.0, max(0.0, sentiment_after - sentiment_before))

    contradiction_score = 1.0 if contradiction_flag else 0.0

    darvo_score = (
        0.3 * pattern_score +
        0.3 * sentiment_shift_score +
        0.2 * motif_score +
        0.2 * contradiction_score
    )
    return round(min(darvo_score, 1.0), 3)
86
+
87
  def custom_sentiment(text):
88
  inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
89
  with torch.no_grad():
 
def analyze_single_message(text, thresholds, motif_flags):
    """Score one message and report its abuse level, patterns and DARVO score.

    Args:
        text: The message to analyze.
        thresholds: Mapping of label -> cutoff; it is not modified here.
        motif_flags: Accepted for the caller's signature; not read in this
            function.

    Returns:
        A 4-tuple ``(abuse_level, pattern_labels_used, top_patterns,
        darvo_score)`` where ``top_patterns`` holds the two highest-scoring
        ``(label, score)`` pairs.
    """
    motif_hits, matched_phrases = detect_motifs(text)
    sentiment = custom_sentiment(text)

    # An "undermining" sentiment both feeds the DARVO score and lowers every
    # cutoff to 80% of its configured value.
    if sentiment["label"] == "undermining":
        sentiment_score = sentiment["score"]
        adjusted_thresholds = {name: cutoff * 0.8 for name, cutoff in thresholds.items()}
    else:
        sentiment_score = 0.0
        adjusted_thresholds = thresholds.copy()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

    # Labels come from two sources: model scores above their cutoff, and
    # labels attached to matched motif phrases.
    threshold_labels = []
    for label, score in zip(LABELS, scores):
        if score > adjusted_thresholds[label]:
            threshold_labels.append(label)
    phrase_labels = [pair[0] for pair in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)

    ranked = sorted(zip(LABELS, scores), key=lambda pair: pair[1], reverse=True)
    top_patterns = ranked[:2]

    # No earlier message is available here, so the sentiment baseline is 0.0.
    motif_phrases = [phrase for _, phrase in matched_phrases]
    darvo_score = calculate_darvo_score(
        pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag=False
    )

    return abuse_level, pattern_labels_used, top_patterns, darvo_score
137
 
138
  def analyze_composite(msg1, msg2, msg3, flags):
139
  thresholds = THRESHOLDS
 
143
  return "Please enter at least one message."
144
 
145
  results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
146
+ abuse_scores = [r[0] for r in results]
147
+ darvo_scores = [r[3] for r in results]
148
+ average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
149
+ print(f"Average DARVO Score: {average_darvo}")
150
 
151
  base_score = sum(abuse_scores) / len(abuse_scores)
152
  label_sets = [[label for label, _ in r[2]] for r in results]
 
173
 
174
  composite_score = round(min(composite_score, 100), 2) # re-cap just in case
175
 
176
+ result = f"These messages show patterns of {', '.join(label for label, _ in top_labels)} and are estimated to be {composite_score}% likely abusive."
177
+
178
+ # Include pattern explanations
179
  for expl in top_explanations:
180
  if expl:
181
+ result += f"\n• {expl}"
182
+
183
+ # Show DARVO score
184
+ if average_darvo > 0.25:
185
+ darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
186
+ result += f"\n\nDARVO Score: {average_darvo} → This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
187
+
188
+ return result
189
 
190
  textbox_inputs = [
191
  gr.Textbox(label="Message 1"),