Spaces:
Running
on
Zero
Running
on
Zero
File size: 9,209 Bytes
d6e219c f1948f2 e185e86 e032990 b54664e 0ff864f ec5f81e 4dccd71 e185e86 e032990 37dfdf9 e032990 a9d4250 e185e86 aeed86a e185e86 b602114 aeed86a e185e86 aeed86a e185e86 aeed86a e185e86 b602114 e185e86 23bb2d2 1e3558a 4472a1d 73582bd b98a1ee e032990 4472a1d ec5f81e 38e8859 dcb0de6 ec5f81e 43095bd 23bb2d2 e032990 73582bd 43095bd e032990 38e8859 1e3558a e032990 1e3558a a6c0cf2 2dda625 e032990 1e3558a ec5f81e 1e3558a e032990 1e3558a ec5f81e 1e3558a 2dda625 e185e86 e032990 ec5f81e e032990 ec5f81e 8475dea 1e3558a a5405aa d88c331 036dae9 a5405aa ec5f81e 036dae9 2dc9dfb fbd7c8e b602114 09ebcdf 847586e 1e3558a 09ebcdf 95573fe 1e3558a ec5f81e e032990 a28ef35 ab8c96f a6c0cf2 ec5f81e e032990 a6c0cf2 e032990 ab8c96f 4292d1b 2dda625 cbd8c88 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs
# custom fine-tuned sentiment model
# Binary classifier (index 0 = "supportive", 1 = "undermining" per custom_sentiment's
# label_map below). Downloaded from the HuggingFace Hub on import.
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")
# Load abuse pattern model
# Multi-label RoBERTa head: one sigmoid score per entry of LABELS (see below).
# NOTE(review): trust_remote_code=True executes repo-provided code on load — confirm
# the Hub repo is trusted before deploying.
model_name ="SamanthaStorm/autotrain-jlpi4-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Output labels of the multi-label abuse-pattern model, in logit order.
LABELS = [
    "blame shifting", "contradictory statements", "control", "dismissiveness",
    "gaslighting", "guilt tripping", "insults", "obscure language",
    "projection", "recovery phase", "threat"
]

# Per-label sigmoid-score cutoffs; a pattern "fires" only above its threshold.
THRESHOLDS = {
    "blame shifting": 0.3,
    "contradictory statements": 0.32,
    "control": 0.40,
    "dismissiveness": 0.45,
    "gaslighting": 0.30,
    "guilt tripping": 0.20,
    "insults": 0.34,
    "obscure language": 0.25,
    "projection": 0.35,
    "recovery phase": 0.25,
    "threat": 0.25
}

PATTERN_LABELS = LABELS

# Plain-language descriptions surfaced to the user for the top patterns.
EXPLANATIONS = {
    "blame shifting": "Blame-shifting is when one person redirects responsibility onto someone else to avoid accountability.",
    "contradictory statements": "Contradictory statements confuse the listener by flipping positions or denying previous claims.",
    "control": "Control restricts another person’s autonomy through coercion, manipulation, or threats.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings, needs, or opinions.",
    "gaslighting": "Gaslighting involves making someone question their own reality, memory, or perceptions.",
    "guilt tripping": "Guilt-tripping uses guilt to manipulate someone’s actions or decisions.",
    "insults": "Insults are derogatory or demeaning remarks meant to shame, belittle, or hurt someone.",
    "obscure language": "Obscure language manipulates through complexity, vagueness, or superiority to confuse the other person.",
    "projection": "Projection accuses someone else of the very behaviors or intentions the speaker is exhibiting.",
    "recovery phase": "Recovery phase statements attempt to soothe or reset tension without acknowledging harm or change.",
    "threat": "Threats use fear of harm (physical, emotional, or relational) to control or intimidate someone."
}

# Multipliers applied to a label's raw score when computing the abuse level.
# BUGFIX: keys must match LABELS exactly (space-separated), otherwise
# PATTERN_WEIGHTS.get(label, 1.0) never finds them and the weighting is a no-op.
# "mockery" has no counterpart in LABELS; kept in case phrase-derived labels use it.
PATTERN_WEIGHTS = {
    "gaslighting": 1.3,
    "mockery": 1.2,
    "control": 1.2,
    "dismissiveness": 0.8,
    "blame shifting": 0.8,
    "contradictory statements": 0.75,
}

# --- DARVO Detection Tools ---
# Pattern labels that count toward the DARVO (Deny, Attack, Reverse Victim
# and Offender) score. "mockery"/"deflection" may arrive via motif phrase labels.
DARVO_PATTERNS = {
    "blame shifting", "projection", "mockery", "dismissiveness", "deflection", "guilt tripping"
}

# Literal phrases (lowercase) characteristic of DARVO narrative reversal.
DARVO_MOTIFS = [
    "i guess i’m the bad guy", "after everything i’ve done", "you always twist everything",
    "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
    "you’re attacking me", "i’m done trying", "i’m the only one who cares"
]
def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    """Estimate DARVO likelihood on a 0.0–1.0 scale.

    Combines four weighted signals: fraction of detected patterns that are
    DARVO-typical (30%), any upward shift toward undermining sentiment (30%),
    fraction of known DARVO motif phrases matched (20%), and a binary
    contradiction flag (20%). Result is capped at 1.0 and rounded to 3 places.
    """
    darvo_pattern_count = sum(1 for p in patterns if p.lower() in DARVO_PATTERNS)
    darvo_motif_count = sum(1 for m in motifs_found if m.lower() in DARVO_MOTIFS)
    # Only a worsening (more undermining) sentiment shift contributes.
    shift = max(0.0, sentiment_after - sentiment_before)
    score = (
        0.3 * (darvo_pattern_count / len(DARVO_PATTERNS))
        + 0.3 * shift
        + 0.2 * (darvo_motif_count / len(DARVO_MOTIFS))
        + 0.2 * (1.0 if contradiction_flag else 0.0)
    )
    return round(min(score, 1.0), 3)
def custom_sentiment(text):
    """Classify *text* with the fine-tuned sentiment model.

    Returns a dict with "label" ("supportive" for class 0, "undermining" for
    class 1) and "score" (the softmax probability of the chosen class).
    """
    encoded = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = sentiment_model(**encoded).logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    best = torch.argmax(probabilities).item()
    names = {0: "supportive", 1: "undermining"}
    return {"label": names[best], "score": probabilities[0][best].item()}
def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1.0):
    """Return an abuse level in [0.0, 100.0].

    Labels whose score exceeds their threshold are weighted via PATTERN_WEIGHTS
    (default weight 1.0), averaged, scaled to percent, then multiplied by
    flag_multiplier and capped at 100. Returns 0.0 when nothing fires.
    NOTE(review): motif_hits is currently ignored — kept for interface
    compatibility with callers.
    """
    firing = [
        score * PATTERN_WEIGHTS.get(label, 1.0)
        for label, score in zip(LABELS, scores)
        if score > thresholds[label]
    ]
    if not firing:
        return 0.0
    level = round(np.mean(firing) * 100, 2) * flag_multiplier
    return min(level, 100.0)
def interpret_abuse_level(score):
    """Map a numeric abuse score (0–100) to a human-readable risk tier.

    Each tier applies when the score is strictly greater than its floor;
    anything at or below 20 is considered safe.
    """
    tiers = (
        (80, "Extreme / High Risk"),
        (60, "Severe / Harmful Pattern Present"),
        (40, "Likely Abuse"),
        (20, "Mild Concern"),
    )
    for floor, description in tiers:
        if score > floor:
            return description
    return "Very Low / Likely Safe"
def analyze_single_message(text, thresholds, motif_flags):
    """Run the full single-message pipeline.

    Steps: motif phrase detection, sentiment classification (thresholds are
    relaxed by 20% when sentiment is "undermining"), multi-label abuse-pattern
    scoring, and DARVO scoring.

    Returns (abuse_level, pattern_labels_used, top_patterns, darvo_score) where
    top_patterns is the two highest-scoring (label, score) pairs.
    """
    motif_hits, matched_phrases = detect_motifs(text)

    sentiment = custom_sentiment(text)
    undermining = sentiment["label"] == "undermining"
    sentiment_score = sentiment["score"] if undermining else 0.0
    # Undermining tone lowers every threshold so patterns fire more readily.
    if undermining:
        adjusted_thresholds = {label: cutoff * 0.8 for label, cutoff in thresholds.items()}
    else:
        adjusted_thresholds = thresholds.copy()

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**encoded).logits
    scores = torch.sigmoid(logits.squeeze(0)).numpy()

    # Union of model-detected labels and phrase-matched labels.
    fired = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
    from_phrases = [label for label, _ in matched_phrases]
    pattern_labels_used = list(set(fired + from_phrases))

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
    top_patterns = sorted(zip(LABELS, scores), key=lambda pair: pair[1], reverse=True)[:2]

    motif_phrases = [phrase for _, phrase in matched_phrases]
    darvo_score = calculate_darvo_score(
        pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag=False
    )
    return abuse_level, pattern_labels_used, top_patterns, darvo_score
def analyze_composite(msg1, msg2, msg3, flags):
    """Analyze up to three messages plus contextual danger flags.

    Non-empty messages are scored individually via analyze_single_message; the
    composite percentage is the mean abuse level plus a flag-based boost,
    dampened when fewer than three messages were supplied. Returns a formatted
    multi-line report string (or a prompt when no message was entered).
    """
    thresholds = THRESHOLDS
    messages = [msg1, msg2, msg3]
    active_messages = [m for m in messages if m.strip()]
    if not active_messages:
        return "Please enter at least one message."
    results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
    abuse_scores = [r[0] for r in results]
    darvo_scores = [r[3] for r in results]
    average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    print(f"Average DARVO Score: {average_darvo}")
    base_score = sum(abuse_scores) / len(abuse_scores)
    # r[2] holds each message's two highest-scoring (label, score) pairs;
    # count how often each label appears across messages and keep the top two.
    label_sets = [[label for label, _ in r[2]] for r in results]
    label_counts = {label: sum(label in s for s in label_sets) for label in set().union(*label_sets)}
    top_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)[:2]
    top_explanations = [EXPLANATIONS.get(label, "") for label, _ in top_labels]
    # Contextual flags boost the score; dividing by the message count keeps the
    # boost proportional. Unknown flags default to weight 3.
    # (Removed unused local `danger_weight`.)
    flag_weights = {
        "They've threatened harm": 6,
        "They isolate me": 5,
        "I’ve changed my behavior out of fear": 4,
        "They monitor/follow me": 4,
        "I feel unsafe when alone with them": 6
    }
    # `flags or []` guards against a None value from the UI layer.
    flag_boost = sum(flag_weights.get(f, 3) for f in (flags or [])) / len(active_messages)
    composite_score = min(base_score + flag_boost, 100)
    # Apply message count dampening AFTER base and flag boost
    if len(active_messages) == 1:
        composite_score *= 0.85  # 15% reduction for 1 message
    elif len(active_messages) == 2:
        composite_score *= 0.93  # 7% reduction for 2 messages
    composite_score = round(min(composite_score, 100), 2)  # re-cap just in case
    result = f"These messages show patterns of {', '.join(label for label, _ in top_labels)} and are estimated to be {composite_score}% likely abusive."
    # Include pattern explanations
    for expl in top_explanations:
        if expl:
            result += f"\n• {expl}"
    # Show DARVO score
    if average_darvo > 0.25:
        darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
        result += f"\n\nDARVO Score: {average_darvo} → This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
    return result
# Three free-text inputs; blank ones are skipped by analyze_composite.
textbox_inputs = [
gr.Textbox(label="Message 1"),
gr.Textbox(label="Message 2"),
gr.Textbox(label="Message 3")
]
# Choices must match the keys of flag_weights in analyze_composite exactly,
# or the default weight (3) is used instead.
checkboxes = gr.CheckboxGroup(label="Contextual Flags", choices=[
"They've threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
"They monitor/follow me", "I feel unsafe when alone with them"
])
# Wire inputs → analyze_composite → single text output.
iface = gr.Interface(
fn=analyze_composite,
inputs=textbox_inputs + [checkboxes],
outputs=gr.Textbox(label="Results"),
title="Abuse Pattern Detector (Multi-Message)",
allow_flagging="manual"
)
# Launch the Gradio app only when run as a script (not on import).
# BUGFIX: removed stray trailing "|" (copy/paste residue) that made this line
# a syntax error.
if __name__ == "__main__":
    iface.launch()