Spaces:
Running
Running
File size: 11,463 Bytes
d6e219c f1948f2 2f6ac5d e032990 b54664e 6153eb8 0ff864f cb6b46c e185e86 2f6ac5d cb6b46c 2f6ac5d cb6b46c e032990 a9d4250 e185e86 cb6b46c e185e86 cb6b46c e185e86 2f6ac5d cb6b46c 2f6ac5d aeed86a e185e86 cb6b46c e185e86 d33c30b cb6b46c d33c30b cb6b46c d33c30b 1e3558a cb6b46c 1e3558a 2f6ac5d 1e3558a cb6b46c 1e3558a 21fd350 cb6b46c 6153eb8 1e3558a cb6b46c 1e3558a 4472a1d cb6b46c 4472a1d ec5f81e cb6b46c 43095bd e032990 cb6b46c b883fe8 e185e86 cb6b46c ec5f81e cb6b46c e032990 cb6b46c e032990 a28ef35 ab8c96f cb6b46c ab8c96f 4292d1b cb6b46c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 |
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs
import re
# --- Sentiment Model: T5-based Emotion Classifier ---
# Both objects are created once at import time and reused by every call to
# custom_sentiment(); from_pretrained() resolves the checkpoint by its hub id.
sentiment_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")
sentiment_model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-emotion")
# Maps the emotion word decoded from the sentiment model to one of the two
# coarse sentiment labels used by the rest of the pipeline. custom_sentiment()
# falls back to "undermining" for any emotion string that is not a key here.
EMOTION_TO_SENTIMENT = {
    "joy": "supportive",
    "love": "supportive",
    "surprise": "supportive",
    "neutral": "supportive",
    "sadness": "undermining",
    "anger": "undermining",
    "fear": "undermining",
    "disgust": "undermining",
    "shame": "undermining",
    "guilt": "undermining",
}
# --- Abuse Detection Model ---
# Multi-label classifier whose per-label logits are passed through a sigmoid
# in analyze_single_message() and paired positionally with LABELS.
# NOTE(review): trust_remote_code=True is passed to concrete Roberta classes;
# confirm against the transformers docs whether it has any effect outside the
# Auto* classes, and drop it if not -- it should never be needed to run
# repository-supplied code for a stock architecture.
model_name = "SamanthaStorm/autotrain-jlpi4-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pattern names, zipped positionally against the classifier's score vector.
# presumably this order must match the model's output indices -- verify
# against the checkpoint's label mapping before reordering.
LABELS = [
    "blame shifting", "contradictory statements", "control", "dismissiveness",
    "gaslighting", "guilt tripping", "insults", "obscure language",
    "projection", "recovery phase", "threat",
]
# Per-label score cut-offs: a label counts as detected only when its sigmoid
# score is strictly greater than the value here (after any scaling applied by
# analyze_single_message()).
THRESHOLDS = {
    "blame shifting": 0.3,
    "contradictory statements": 0.32,
    "control": 0.48,
    "dismissiveness": 0.45,
    "gaslighting": 0.30,
    "guilt tripping": 0.20,
    "insults": 0.34,
    "obscure language": 0.25,
    "projection": 0.35,
    "recovery phase": 0.25,
    "threat": 0.25,
}
# Multipliers applied to a label's score in calculate_abuse_level(); labels
# that are not listed here are weighted 1.0 there.
PATTERN_WEIGHTS = {
    "gaslighting": 1.3,
    "control": 1.2,
    "dismissiveness": 0.8,
    "blame shifting": 0.8,
    "contradictory statements": 0.75,
}
# One-sentence, user-facing description of each pattern; analyze_composite()
# appends the entry for the top pattern to its result text. The typographic
# apostrophes inside the sentences are intentional display text.
EXPLANATIONS = {
    "blame shifting": "Blame-shifting is when one person redirects responsibility onto someone else to avoid accountability.",
    "contradictory statements": "Contradictory statements confuse the listener by flipping positions or denying previous claims.",
    "control": "Control restricts another person’s autonomy through coercion, manipulation, or threats.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings, needs, or opinions.",
    "gaslighting": "Gaslighting involves making someone question their own reality, memory, or perceptions.",
    "guilt tripping": "Guilt-tripping uses guilt to manipulate someone’s actions or decisions.",
    "insults": "Insults are derogatory or demeaning remarks meant to shame, belittle, or hurt someone.",
    "obscure language": "Obscure language manipulates through complexity, vagueness, or superiority to confuse the other person.",
    "projection": "Projection accuses someone else of the very behaviors or intentions the speaker is exhibiting.",
    "recovery phase": "Recovery phase statements attempt to soothe or reset tension without acknowledging harm or change.",
    "threat": "Threats use fear of harm (physical, emotional, or relational) to control or intimidate someone.",
}
# Risk-level copy shown to the user. Each value is a (title, summary, advice)
# tuple that generate_risk_snippet() unpacks in that order.
RISK_SNIPPETS = {
    "low": (
        "🟢 Risk Level: Low",
        "The language patterns here do not strongly indicate abuse.",
        "Continue to check in with yourself and notice how you feel in response to repeated patterns.",
    ),
    "moderate": (
        "⚠️ Risk Level: Moderate to High",
        "This language includes control, guilt, or reversal tactics.",
        "These patterns often lead to emotional confusion and reduced self-trust. Document these messages or talk with someone safe.",
    ),
    "high": (
        "🛑 Risk Level: High",
        "Language includes threats or coercive control, which are strong indicators of escalation.",
        "Consider creating a safety plan or contacting a support line. Trust your sense of unease.",
    ),
}
def generate_risk_snippet(abuse_score, top_label):
    """Return the risk-level paragraph appended to the analysis result.

    Args:
        abuse_score: Composite score on a 0-100 scale.
        top_label: Name of the dominant pattern, echoed in the summary line.

    Returns:
        A string starting with two newlines, containing the title, the
        summary (with the pattern name) and the advice line for the level:
        "high" at 85 and above, "moderate" at 60 and above, otherwise "low".
    """
    if abuse_score >= 85:
        risk_level = "high"
    elif abuse_score >= 60:
        risk_level = "moderate"
    else:
        risk_level = "low"

    title, summary, advice = RISK_SNIPPETS[risk_level]
    return f"\n\n{title}\n{summary} (Pattern: {top_label})\n💡 {advice}"
# --- DARVO Detection ---
# Pattern labels that count toward the pattern component of the DARVO score.
DARVO_PATTERNS = {
    "blame shifting", "projection", "dismissiveness", "guilt tripping", "contradictory statements",
}

# Lower-case phrases that count toward the motif component of the DARVO score.
DARVO_MOTIFS = [
    "i guess i’m the bad guy", "after everything i’ve done", "you always twist everything",
    "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
    "you’re attacking me", "i’m done trying", "i’m the only one who cares",
]
def detect_contradiction(message):
    """Report whether *message* contains a known self-contradicting phrase pair.

    Each pattern is an opening phrase followed, within at most 15 characters
    on the same line, by a phrase that reverses it (for example "i love you"
    ... "i hate you"). Matching is case-insensitive.

    Args:
        message: The text to scan. Empty or None yields False.

    Returns:
        True if any pattern matches, otherwise False.
    """
    if not message:
        return False

    # Contractions accept both the typographic (’) and the plain (') apostrophe;
    # keyboards and phones produce either, and matching only one silently
    # misses the other.
    contradiction_patterns = (
        r"\b(i love you).{0,15}(i hate you|you ruin everything)",
        r"\b(i[’']m sorry).{0,15}(but you|if you hadn[’']t)",
        r"\b(i[’']m trying).{0,15}(you never|why do you)",
        r"\b(do what you want).{0,15}(you[’']ll regret it|i always give everything)",
        r"\b(i don[’']t care).{0,15}(you never think of me)",
        r"\b(i guess i[’']m just).{0,15}(the bad guy|worthless|never enough)",
    )
    return any(
        re.search(pattern, message, re.IGNORECASE)
        for pattern in contradiction_patterns
    )
def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    """Combine four signals into a DARVO score between 0.0 and 1.0.

    The score is a weighted sum: 0.30 pattern share, 0.30 sentiment shift,
    0.25 motif share and 0.15 contradiction flag.

    Args:
        patterns: Iterable of detected pattern labels (any case).
        sentiment_before: Numeric sentiment value before the message.
        sentiment_after: Numeric sentiment value after the message; only an
            increase over ``sentiment_before`` contributes.
        motifs_found: Iterable of matched motif phrases (any case).
        contradiction_flag: Whether a contradiction was detected.

    Returns:
        The score rounded to three decimals and capped at 1.0.
    """
    def _normalize(phrase):
        # Compare motifs independently of case and of which apostrophe
        # character (’ or ') the phrase was written with.
        return phrase.lower().replace("’", "'")

    # Each component is clamped to [0, 1] so duplicates in the inputs cannot
    # push one signal beyond the share its weight is meant to give it.
    pattern_hits = sum(1 for p in patterns if p.lower() in DARVO_PATTERNS)
    pattern_score = min(1.0, pattern_hits / len(DARVO_PATTERNS))

    sentiment_shift_score = min(1.0, max(0.0, sentiment_after - sentiment_before))

    known_motifs = {_normalize(m) for m in DARVO_MOTIFS}
    motif_hits = sum(1 for m in motifs_found if _normalize(m) in known_motifs)
    motif_score = min(1.0, motif_hits / len(DARVO_MOTIFS))

    contradiction_score = 1.0 if contradiction_flag else 0.0

    darvo_score = (
        0.3 * pattern_score
        + 0.3 * sentiment_shift_score
        + 0.25 * motif_score
        + 0.15 * contradiction_score
    )
    return round(min(darvo_score, 1.0), 3)
def custom_sentiment(text):
    """Classify *text* as "supportive" or "undermining" via the emotion model.

    The text is prefixed with "emotion: ", passed through the T5 model's
    generate() call, and the decoded output is treated as an emotion word.

    Args:
        text: The message to classify.

    Returns:
        A dict with two keys: "emotion" (the decoded, stripped, lower-cased
        model output) and "label" (its EMOTION_TO_SENTIMENT entry, or
        "undermining" when the emotion is not a known key).
    """
    input_ids = sentiment_tokenizer(f"emotion: {text}", return_tensors="pt").input_ids
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = sentiment_model.generate(input_ids)

    emotion = sentiment_tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower()
    sentiment = EMOTION_TO_SENTIMENT.get(emotion, "undermining")
    return {"label": sentiment, "emotion": emotion}
def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1.0):
    """Turn per-label scores into a single 0-100 abuse level.

    Only labels whose score is strictly above their threshold contribute.
    Each contributing score is multiplied by its PATTERN_WEIGHTS entry
    (1.0 if absent); the mean of those values is scaled to a percentage and
    multiplied by ``flag_multiplier``.

    Args:
        scores: Sequence of scores, paired positionally with LABELS.
        thresholds: Mapping of label -> cut-off score.
        motif_hits: Accepted for interface compatibility; not used.
        flag_multiplier: Factor applied to the percentage before capping.

    Returns:
        A plain float rounded to two decimals, capped at 100.0, or 0.0 when
        no label exceeds its threshold.
    """
    weighted_scores = [
        score * PATTERN_WEIGHTS.get(label, 1.0)
        for label, score in zip(LABELS, scores)
        if score > thresholds[label]
    ]
    if not weighted_scores:
        return 0.0

    # Round after applying the multiplier so the returned value really has two
    # decimals, and convert the NumPy scalar to a built-in float.
    level = float(np.mean(weighted_scores)) * 100 * flag_multiplier
    return min(round(level, 2), 100.0)
def analyze_single_message(text, thresholds, motif_flags):
    """Run motif, sentiment and classifier analysis on one message.

    Args:
        text: The message to analyse.
        thresholds: Mapping of label -> cut-off score. When the message's
            sentiment is "undermining" a copy scaled by 0.8 is used, which
            makes every label easier to trigger.
        motif_flags: Accepted for interface compatibility; not used.

    Returns:
        A 5-tuple ``(abuse_level, pattern_labels_used, top_patterns,
        darvo_score, sentiment)`` where ``pattern_labels_used`` lists the
        labels found by threshold or by motif phrase, ``top_patterns`` holds
        the two highest-scoring ``(label, score)`` pairs, and ``sentiment``
        is the dict returned by custom_sentiment().
    """
    # detect_motifs is imported from motif_tagging; the unpacking below assumes
    # matched_phrases is an iterable of (label, phrase) pairs -- TODO confirm.
    motif_hits, matched_phrases = detect_motifs(text)

    sentiment = custom_sentiment(text)
    is_undermining = sentiment["label"] == "undermining"
    sentiment_score = 0.5 if is_undermining else 0.0
    print(f"Detected emotion: {sentiment['emotion']} → sentiment: {sentiment['label']}")

    if is_undermining:
        adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()}
    else:
        adjusted_thresholds = thresholds.copy()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Independent sigmoid per label: the labels are not mutually exclusive.
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

    threshold_labels = [
        label for label, score in zip(LABELS, scores)
        if score > adjusted_thresholds[label]
    ]
    phrase_labels = [label for label, _ in matched_phrases]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # returned list is the same from run to run (a set's order is not).
    pattern_labels_used = list(dict.fromkeys(threshold_labels + phrase_labels))

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
    top_patterns = sorted(zip(LABELS, scores), key=lambda pair: pair[1], reverse=True)[:2]

    motif_phrases = [phrase for _, phrase in matched_phrases]
    contradiction_flag = detect_contradiction(text)
    darvo_score = calculate_darvo_score(
        pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag
    )

    return abuse_level, pattern_labels_used, top_patterns, darvo_score, sentiment
def analyze_composite(msg1, msg2, msg3, flags):
    """Analyse up to three messages together and build the result text.

    Args:
        msg1, msg2, msg3: Message texts; blank or None entries are skipped.
        flags: Contextual flag strings selected by the user (may be empty or
            None). Each known flag adds its weight to the score; unknown
            flags add 3.

    Returns:
        A formatted string with the dominant pattern, the composite score,
        an optional DARVO paragraph, the risk-level paragraph and, when the
        messages mix supportive and undermining sentiment, a caution note.
        If no message has content, a prompt asking for input is returned.
    """
    thresholds = THRESHOLDS.copy()
    flags = flags or []

    active_messages = [m for m in (msg1, msg2, msg3) if m and m.strip()]
    if not active_messages:
        return "Please enter at least one message."

    results = []
    for m in active_messages:
        result = analyze_single_message(m, thresholds, flags)
        print(f"Message: {m}")
        print(f"Sentiment result: {result[4]}")
        results.append(result)

    sentiment_labels = [r[4]["label"] for r in results]
    undermining_count = sentiment_labels.count("undermining")
    supportive_count = sentiment_labels.count("supportive")
    mixed_sentiment = bool(undermining_count and supportive_count)
    # NOTE(review): an earlier revision rescaled `thresholds` here (x0.9 when
    # undermining messages outnumber supportive ones, x0.95 when both occur),
    # but every message had already been analysed, so the rescaled values were
    # never read. If that adjustment is wanted, sentiment must be computed in a
    # pass before analyze_single_message() is called.
    if mixed_sentiment:
        print("⚖️ Detected conflicting sentiment across messages.")

    abuse_scores = [r[0] for r in results]
    darvo_scores = [r[3] for r in results]
    average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    base_score = sum(abuse_scores) / len(abuse_scores)

    # Dominant pattern: the label appearing in the most messages' top-2 lists.
    # Ties are broken by the summed score so the choice does not depend on
    # set/hash ordering.
    label_counts = {}
    label_score_totals = {}
    for r in results:
        for label, score in r[2]:
            label_counts[label] = label_counts.get(label, 0) + 1
            label_score_totals[label] = label_score_totals.get(label, 0.0) + float(score)
    top_pattern = max(
        label_counts,
        key=lambda label: (label_counts[label], label_score_totals[label]),
    )
    top_explanation = EXPLANATIONS.get(top_pattern, "")

    # Keys use the plain apostrophe; incoming flags are normalised to it before
    # lookup because the UI choices are written with the typographic one.
    flag_weights = {
        "They've threatened harm": 6,
        "They isolate me": 5,
        "I've changed my behavior out of fear": 4,
        "They monitor/follow me": 4,
        "I feel unsafe when alone with them": 6,
    }
    flag_boost = sum(
        flag_weights.get(f.replace("’", "'"), 3) for f in flags
    ) / len(active_messages)

    composite_score = min(base_score + flag_boost, 100)
    # Fewer messages means less evidence, so the score is discounted.
    if len(active_messages) == 1:
        composite_score *= 0.85
    elif len(active_messages) == 2:
        composite_score *= 0.93
    composite_score = round(min(composite_score, 100), 2)

    result = f"These messages show a pattern of **{top_pattern}** and are estimated to be {composite_score}% likely abusive."
    if top_explanation:
        result += f"\n• {top_explanation}"

    if average_darvo > 0.25:
        darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
        result += f"\n\nDARVO Score: {average_darvo} → This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."

    result += generate_risk_snippet(composite_score, top_pattern)

    if mixed_sentiment:
        result += "\n\n⚖️ These messages contain **conflicting emotional tones** — this may indicate mixed signals, ambivalence, or a push-pull dynamic. Use caution interpreting any one message alone."

    return result
# --- Gradio UI ---
# Three free-text message boxes followed by the contextual-flag checkboxes;
# the inputs are passed to analyze_composite() in this order.
textbox_inputs = [
    gr.Textbox(label="Message 1"),
    gr.Textbox(label="Message 2"),
    gr.Textbox(label="Message 3"),
]

checkboxes = gr.CheckboxGroup(
    label="Contextual Flags",
    choices=[
        "They’ve threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
        "They monitor/follow me", "I feel unsafe when alone with them",
    ],
)

iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + [checkboxes],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector (Multi-Message)",
    allow_flagging="manual",
)
# Start the web UI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()