Spaces:

SamanthaStorm
/

Tether

Running on Zero

App Files Files Community

SamanthaStorm committed on Apr 16

Commit

1e3558a

verified ·

1 Parent(s): 09ebcdf

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -5

app.py CHANGED Viewed

@@ -55,6 +55,35 @@ PATTERN_WEIGHTS = {
     "contradictory_statements": 0.75,
 }
 def custom_sentiment(text):
     inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     with torch.no_grad():
@@ -84,17 +113,27 @@ def interpret_abuse_level(score):
 def analyze_single_message(text, thresholds, motif_flags):
     motif_hits, matched_phrases = detect_motifs(text)
     sentiment = custom_sentiment(text)
     adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()} if sentiment['label'] == "undermining" else thresholds.copy()
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     with torch.no_grad():
         outputs = model(**inputs)
     scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
     threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
     phrase_labels = [label for label, _ in matched_phrases]
     pattern_labels_used = list(set(threshold_labels + phrase_labels))
     abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
     top_patterns = sorted([(label, score) for label, score in zip(LABELS, scores)], key=lambda x: x[1], reverse=True)[:2]
-    return abuse_level, pattern_labels_used, top_patterns
 def analyze_composite(msg1, msg2, msg3, flags):
     thresholds = THRESHOLDS
@@ -104,7 +143,10 @@ def analyze_composite(msg1, msg2, msg3, flags):
         return "Please enter at least one message."
     results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
-    abuse_scores = [r[0] for r in results]
     base_score = sum(abuse_scores) / len(abuse_scores)
     label_sets = [[label for label, _ in r[2]] for r in results]
@@ -131,11 +173,19 @@ def analyze_composite(msg1, msg2, msg3, flags):
     composite_score = round(min(composite_score, 100), 2)  # re-cap just in case
-    result = f"These messages show patterns of {', '.join(label for label, _ in top_labels)} and are estimated to be {composite_score}% likely abusive."
     for expl in top_explanations:
         if expl:
-            result += f"\n• {expl}"
-    return result
 textbox_inputs = [
     gr.Textbox(label="Message 1"),

     "contradictory_statements": 0.75,
 }
# --- DARVO Detection Tools ---
# Pattern labels that count towards the DARVO pattern component.
DARVO_PATTERNS = {
    "blame shifting", "projection", "mockery", "dismissiveness", "deflection", "guilt tripping"
}
# Phrases that count towards the DARVO motif component.
# NOTE: these entries use the typographic apostrophe (’), not the ASCII one (').
DARVO_MOTIFS = [
    "i guess i’m the bad guy", "after everything i’ve done", "you always twist everything",
    "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
    "you’re attacking me", "i’m done trying", "i’m the only one who cares"
]


def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    """Combine four signals into a single DARVO score in the range [0.0, 1.0].

    The score is a weighted sum of:
      * 0.3 x fraction of DARVO_PATTERNS present in ``patterns``
      * 0.3 x increase from ``sentiment_before`` to ``sentiment_after``,
        clamped to [0, 1] (a decrease contributes nothing)
      * 0.2 x fraction of DARVO_MOTIFS present in ``motifs_found``
      * 0.2 x 1.0 when ``contradiction_flag`` is truthy, else 0.0

    Args:
        patterns: iterable of pattern labels (or None). Matching is
            case-insensitive and treats "_" as a space, so "blame_shifting"
            matches "blame shifting".
        sentiment_before: numeric sentiment value before the message.
        sentiment_after: numeric sentiment value after the message.
        motifs_found: iterable of matched phrases (or None). Matching is
            case-insensitive and treats ' and ’ as the same character.
        contradiction_flag: whether a contradiction was detected.

    Returns:
        The score rounded to three decimals, never above 1.0.
    """
    # Distinct labels only: a repeated label must not push the fraction past 1.
    # Underscores are folded to spaces because labels elsewhere appear to be
    # written snake_case (assumption - confirm against the label list).
    seen_patterns = {
        str(p).lower().replace("_", " ").strip() for p in (patterns or ())
    }
    pattern_score = len(seen_patterns & DARVO_PATTERNS) / len(DARVO_PATTERNS)

    # The weights below assume every component lies in [0, 1]; clamp both ends.
    sentiment_shift_score = min(1.0, max(0.0, sentiment_after - sentiment_before))

    # Fold the ASCII apostrophe to the typographic one used in DARVO_MOTIFS so
    # "i'm the villain" and "i’m the villain" are treated alike.
    seen_motifs = {
        str(m).lower().replace("'", "’").strip() for m in (motifs_found or ())
    }
    motif_score = len(seen_motifs.intersection(DARVO_MOTIFS)) / len(DARVO_MOTIFS)

    contradiction_score = 1.0 if contradiction_flag else 0.0

    darvo_score = (
        0.3 * pattern_score +
        0.3 * sentiment_shift_score +
        0.2 * motif_score +
        0.2 * contradiction_score
    )
    return round(min(darvo_score, 1.0), 3)
 def custom_sentiment(text):
     inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     with torch.no_grad():
 def analyze_single_message(text, thresholds, motif_flags):
     """Analyse one message and return its abuse level, labels, top patterns and DARVO score.

     Steps, in order:
       1. Run ``detect_motifs`` and ``custom_sentiment`` on ``text``.
       2. If the sentiment label is "undermining", use its score as the
          sentiment signal and scale every threshold by 0.8; otherwise the
          signal is 0.0 and a copy of ``thresholds`` is used unchanged.
       3. Run the classifier, apply a sigmoid to its logits, and keep every
          label whose score exceeds its (adjusted) threshold.
       4. Merge those labels with the labels of the matched motif phrases.

     ``motif_flags`` is accepted but not read inside this function.

     Returns:
         ``(abuse_level, pattern_labels_used, top_patterns, darvo_score)`` where
         ``top_patterns`` holds the two highest-scoring ``(label, score)`` pairs.
     """
     motif_hits, matched_phrases = detect_motifs(text)
     sentiment = custom_sentiment(text)

     if sentiment["label"] == "undermining":
         sentiment_score = sentiment["score"]
         adjusted_thresholds = {name: limit * 0.8 for name, limit in thresholds.items()}
     else:
         sentiment_score = 0.0
         adjusted_thresholds = thresholds.copy()

     encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     with torch.no_grad():
         outputs = model(**encoded)
     scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

     threshold_labels = []
     for label, score in zip(LABELS, scores):
         if score > adjusted_thresholds[label]:
             threshold_labels.append(label)

     # matched_phrases is unpacked as (label, phrase) pairs.
     phrase_labels = [phrase_label for phrase_label, _ in matched_phrases]
     motif_phrases = [phrase for _, phrase in matched_phrases]

     pattern_labels_used = list(set(threshold_labels + phrase_labels))
     abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)

     ranked = sorted(zip(LABELS, scores), key=lambda pair: pair[1], reverse=True)
     top_patterns = ranked[:2]

     # No "before" sentiment is available for a single message, hence 0.0.
     darvo_score = calculate_darvo_score(
         pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag=False
     )
     return abuse_level, pattern_labels_used, top_patterns, darvo_score
 def analyze_composite(msg1, msg2, msg3, flags):
     thresholds = THRESHOLDS
         return "Please enter at least one message."
     results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
+    abuse_scores = [r[0] for r in results
+    darvo_scores = [r[3] for r in results]
+    average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
+    print(f"Average DARVO Score: {average_darvo}")
     base_score = sum(abuse_scores) / len(abuse_scores)
     label_sets = [[label for label, _ in r[2]] for r in results]
     composite_score = round(min(composite_score, 100), 2)  # re-cap just in case
+ result = f"These messages show patterns of {', '.join(label for label, _ in top_labels)} and are estimated to be {composite_score}% likely abusive."
+# Include pattern explanations
     for expl in top_explanations:
         if expl:
+        result += f"\n• {expl}"
+# Show DARVO score
+    if average_darvo > 0.25:
+        darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
+        result += f"\n\nDARVO Score: {average_darvo} → This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
+return result
 textbox_inputs = [
     gr.Textbox(label="Message 1"),