Spaces:

SamanthaStorm
/

Tether

Running on Zero

App Files Files Community

SamanthaStorm committed on Apr 16

Commit

6153eb8

verified ·

1 Parent(s): 9133d54

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -55

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import numpy as np
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from transformers import RobertaForSequenceClassification, RobertaTokenizer
 from motif_tagging import detect_motifs
 # custom fine-tuned sentiment model
 sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
@@ -51,8 +52,8 @@ EXPLANATIONS = {
 }
 PATTERN_WEIGHTS = {
-    "gaslighting": 1.3, "mockery": 1.2, "control": 1.2, "dismissiveness": 0.8, "blame_shifting": 0.8,
-    "contradictory_statements": 0.75,
 }
 # --- DARVO Detection Tools ---
@@ -64,39 +65,29 @@ DARVO_MOTIFS = [
     "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
     "you’re attacking me", "i’m done trying", "i’m the only one who cares"
 ]
-import re
 def detect_contradiction(message):
     contradiction_flag = False
     contradiction_phrases = [
-        # Emotional flip-flops
         (r"\b(i love you).{0,15}(i hate you|you ruin everything)", re.IGNORECASE),
         (r"\b(i’m sorry).{0,15}(but you|if you hadn’t)", re.IGNORECASE),
         (r"\b(i’m trying).{0,15}(you never|why do you)", re.IGNORECASE),
-        # Control + helplessness
         (r"\b(do what you want).{0,15}(you’ll regret it|i always give everything)", re.IGNORECASE),
         (r"\b(i don’t care).{0,15}(you never think of me)", re.IGNORECASE),
-        # Passive aggression or self-victimization switch
         (r"\b(i guess i’m just).{0,15}(the bad guy|worthless|never enough)", re.IGNORECASE),
     ]
     for pattern, flags in contradiction_phrases:
         if re.search(pattern, message, flags):
             contradiction_flag = True
             break
     return contradiction_flag
-    contradiction_flag = detect_contradiction(text)
 def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
     pattern_hits = len([p.lower() for p in patterns if p.lower() in DARVO_PATTERNS])
     pattern_score = pattern_hits / len(DARVO_PATTERNS)
     sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)
     motif_hits = len([m.lower() for m in motifs_found if m.lower() in DARVO_MOTIFS])
     motif_score = motif_hits / len(DARVO_MOTIFS)
     contradiction_score = 1.0 if contradiction_flag else 0.0
     darvo_score = (
         0.3 * pattern_score +
@@ -104,7 +95,6 @@ def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_fo
         0.2 * motif_score +
         0.2 * contradiction_score
     )
     return round(min(darvo_score, 1.0), 3)
 def custom_sentiment(text):
@@ -122,40 +112,23 @@ def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1
     base_score *= flag_multiplier
     return min(base_score, 100.0)
-def interpret_abuse_level(score):
-    if score > 80:
-        return "Extreme / High Risk"
-    elif score > 60:
-        return "Severe / Harmful Pattern Present"
-    elif score > 40:
-        return "Likely Abuse"
-    elif score > 20:
-        return "Mild Concern"
-    return "Very Low / Likely Safe"
 def analyze_single_message(text, thresholds, motif_flags):
     motif_hits, matched_phrases = detect_motifs(text)
     sentiment = custom_sentiment(text)
     sentiment_score = sentiment["score"] if sentiment["label"] == "undermining" else 0.0
     adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()} if sentiment['label'] == "undermining" else thresholds.copy()
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     with torch.no_grad():
         outputs = model(**inputs)
     scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
     threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
     phrase_labels = [label for label, _ in matched_phrases]
     pattern_labels_used = list(set(threshold_labels + phrase_labels))
     abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
     top_patterns = sorted([(label, score) for label, score in zip(LABELS, scores)], key=lambda x: x[1], reverse=True)[:2]
     motif_phrases = [text for _, text in matched_phrases]
-    darvo_score = calculate_darvo_score(pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag=False)
     return abuse_level, pattern_labels_used, top_patterns, darvo_score
 def analyze_composite(msg1, msg2, msg3, flags):
@@ -164,49 +137,36 @@ def analyze_composite(msg1, msg2, msg3, flags):
     active_messages = [m for m in messages if m.strip()]
     if not active_messages:
         return "Please enter at least one message."
     results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
     abuse_scores = [r[0] for r in results]
     darvo_scores = [r[3] for r in results]
     average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
-    print(f"Average DARVO Score: {average_darvo}")
     base_score = sum(abuse_scores) / len(abuse_scores)
     label_sets = [[label for label, _ in r[2]] for r in results]
     label_counts = {label: sum(label in s for s in label_sets) for label in set().union(*label_sets)}
     top_label = max(label_counts.items(), key=lambda x: x[1])
     top_explanation = EXPLANATIONS.get(top_label[0], "")
-    # Adjust flag-based weight relative to number of messages
     danger_weight = 5
     flag_weights = {
-    "They've threatened harm": 6,
-    "They isolate me": 5,
-    "I’ve changed my behavior out of fear": 4,
-    "They monitor/follow me": 4,
-    "I feel unsafe when alone with them": 6
-}
     flag_boost = sum(flag_weights.get(f, 3) for f in flags) / len(active_messages)
     composite_score = min(base_score + flag_boost, 100)
-  # Apply message count dampening AFTER base and flag boost
     if len(active_messages) == 1:
-        composite_score *= 0.85  # 15% reduction for 1 message
     elif len(active_messages) == 2:
-        composite_score *= 0.93  # 7% reduction for 2 messages
-    composite_score = round(min(composite_score, 100), 2)  # re-cap just in case
-    # Include pattern explanations
     result = f"These messages show a pattern of **{top_label[0]}** and are estimated to be {composite_score}% likely abusive."
     if top_explanation:
         result += f"\n• {top_explanation}"
-# Show DARVO score
     if average_darvo > 0.25:
         darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
         result += f"\n\nDARVO Score: {average_darvo} → This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
     return result
 textbox_inputs = [

 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from transformers import RobertaForSequenceClassification, RobertaTokenizer
 from motif_tagging import detect_motifs
+import re
 # custom fine-tuned sentiment model
 sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
 }
 PATTERN_WEIGHTS = {
+    "gaslighting": 1.3, "control": 1.2, "dismissiveness": 0.8, "blame shifting": 0.8,
+    "contradictory statements": 0.75
 }
 # --- DARVO Detection Tools ---
     "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
     "you’re attacking me", "i’m done trying", "i’m the only one who cares"
 ]
 def detect_contradiction(message):
     """Return True if *message* matches any of the contradiction regexes.

     Every rule is an opening phrase followed, within at most 15 characters
     (newlines excluded, since DOTALL is not set), by one of several
     follow-up phrases. All rules are matched case-insensitively.
     """
     # (regex, flags) pairs; the regex text must stay exactly as written,
     # including the typographic apostrophes.
     rules = (
         (r"\b(i love you).{0,15}(i hate you|you ruin everything)", re.IGNORECASE),
         (r"\b(i’m sorry).{0,15}(but you|if you hadn’t)", re.IGNORECASE),
         (r"\b(i’m trying).{0,15}(you never|why do you)", re.IGNORECASE),
         (r"\b(do what you want).{0,15}(you’ll regret it|i always give everything)", re.IGNORECASE),
         (r"\b(i don’t care).{0,15}(you never think of me)", re.IGNORECASE),
         (r"\b(i guess i’m just).{0,15}(the bad guy|worthless|never enough)", re.IGNORECASE),
     )
     # any() stops at the first rule that matches, like the early break did.
     return any(re.search(regex, message, mode) is not None for regex, mode in rules)
 def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
     """Combine pattern, motif and contradiction signals into one score.

     Args:
         patterns: iterable of label strings; each is lowercased and counted
             when it is a member of DARVO_PATTERNS.
         sentiment_before: number subtracted from ``sentiment_after``.
         sentiment_after: number compared against ``sentiment_before``.
         motifs_found: iterable of phrase strings; each is lowercased and
             counted when it is a member of DARVO_MOTIFS.
         contradiction_flag: truthy to add the contradiction component.

     Returns:
         The weighted sum, capped at 1.0 and rounded to 3 decimal places.
     """
     # Hits are counted per input item, so a repeated label counts repeatedly.
     pattern_hits = len([p.lower() for p in patterns if p.lower() in DARVO_PATTERNS])
     # Normalised by the size of DARVO_PATTERNS (defined outside this block).
     pattern_score = pattern_hits / len(DARVO_PATTERNS)
     # Only an increase counts; a decrease is clamped to 0.0.
     sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)
     motif_hits = len([m.lower() for m in motifs_found if m.lower() in DARVO_MOTIFS])
     motif_score = motif_hits / len(DARVO_MOTIFS)
     contradiction_score = 1.0 if contradiction_flag else 0.0
     # NOTE(review): sentiment_shift_score is computed above but is not part
     # of the sum as shown here. This text looks like a diff excerpt that may
     # omit a line at this point -- confirm against the full file.
     darvo_score = (
         0.3 * pattern_score +
         0.2 * motif_score +
         0.2 * contradiction_score
     )
     return round(min(darvo_score, 1.0), 3)
 def custom_sentiment(text):
     base_score *= flag_multiplier
     return min(base_score, 100.0)
 def analyze_single_message(text, thresholds, motif_flags):
     motif_hits, matched_phrases = detect_motifs(text)
     sentiment = custom_sentiment(text)
     sentiment_score = sentiment["score"] if sentiment["label"] == "undermining" else 0.0
     adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()} if sentiment['label'] == "undermining" else thresholds.copy()
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     with torch.no_grad():
         outputs = model(**inputs)
     scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
     threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
     phrase_labels = [label for label, _ in matched_phrases]
     pattern_labels_used = list(set(threshold_labels + phrase_labels))
     abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
     top_patterns = sorted([(label, score) for label, score in zip(LABELS, scores)], key=lambda x: x[1], reverse=True)[:2]
     motif_phrases = [text for _, text in matched_phrases]
+    contradiction_flag = detect_contradiction(text)
+    darvo_score = calculate_darvo_score(pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag)
     return abuse_level, pattern_labels_used, top_patterns, darvo_score
 def analyze_composite(msg1, msg2, msg3, flags):
     active_messages = [m for m in messages if m.strip()]
     if not active_messages:
         return "Please enter at least one message."
     results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
     abuse_scores = [r[0] for r in results]
     darvo_scores = [r[3] for r in results]
     average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
     base_score = sum(abuse_scores) / len(abuse_scores)
     label_sets = [[label for label, _ in r[2]] for r in results]
     label_counts = {label: sum(label in s for s in label_sets) for label in set().union(*label_sets)}
     top_label = max(label_counts.items(), key=lambda x: x[1])
     top_explanation = EXPLANATIONS.get(top_label[0], "")
     danger_weight = 5
     flag_weights = {
+        "They've threatened harm": 6,
+        "They isolate me": 5,
+        "I’ve changed my behavior out of fear": 4,
+        "They monitor/follow me": 4,
+        "I feel unsafe when alone with them": 6
+    }
     flag_boost = sum(flag_weights.get(f, 3) for f in flags) / len(active_messages)
     composite_score = min(base_score + flag_boost, 100)
     if len(active_messages) == 1:
+        composite_score *= 0.85
     elif len(active_messages) == 2:
+        composite_score *= 0.93
+    composite_score = round(min(composite_score, 100), 2)
     result = f"These messages show a pattern of **{top_label[0]}** and are estimated to be {composite_score}% likely abusive."
     if top_explanation:
         result += f"\n• {top_explanation}"
     if average_darvo > 0.25:
         darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
         result += f"\n\nDARVO Score: {average_darvo} → This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
     return result
 textbox_inputs = [