Update app.py

app.py CHANGED
@@ -6,275 +6,252 @@ from transformers import RobertaForSequenceClassification, RobertaTokenizer
[removed lines from the previous version are not rendered in this view; the updated hunk follows]

from motif_tagging import detect_motifs
import re

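# NOTE (editor's assumption): the code below also relies on gr (gradio), torch, np (numpy),
# AutoTokenizer, and AutoModelForSeq2SeqLM being imported earlier in app.py, above the
# context shown in this hunk.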
# --- Sentiment Model: T5-based Emotion Classifier ---

sentiment_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")
sentiment_model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-emotion")

EMOTION_TO_SENTIMENT = {
    "joy": "supportive",
    "love": "supportive",
    "surprise": "supportive",
    "neutral": "supportive",
    "sadness": "undermining",
    "anger": "undermining",
    "fear": "undermining",
    "disgust": "undermining",
    "shame": "undermining",
    "guilt": "undermining"
}
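
# Illustrative note: EMOTION_TO_SENTIMENT collapses the T5 emotion label into a binary
# signal, e.g. "anger" -> "undermining" and "joy" -> "supportive"; emotions missing from
# the map are treated as "undermining" by custom_sentiment() below.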

# --- Abuse Detection Model ---

model_name = "SamanthaStorm/autotrain-jlpi4-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)

LABELS = [
    "blame shifting", "contradictory statements", "control", "dismissiveness",
    "gaslighting", "guilt tripping", "insults", "obscure language",
    "projection", "recovery phase", "threat"
]

THRESHOLDS = {
    "blame shifting": 0.3,
    "contradictory statements": 0.32,
    "control": 0.48,
    "dismissiveness": 0.45,
    "gaslighting": 0.30,
    "guilt tripping": 0.20,
    "insults": 0.34,
    "obscure language": 0.25,
    "projection": 0.35,
    "recovery phase": 0.25,
    "threat": 0.25
}

PATTERN_WEIGHTS = {
    "gaslighting": 1.3,
    "control": 1.2,
    "dismissiveness": 0.8,
    "blame shifting": 0.8,
    "contradictory statements": 0.75
}

EXPLANATIONS = {
    "blame shifting": "Blame-shifting is when one person redirects responsibility onto someone else to avoid accountability.",
    "contradictory statements": "Contradictory statements confuse the listener by flipping positions or denying previous claims.",
    "control": "Control restricts another person's autonomy through coercion, manipulation, or threats.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person's feelings, needs, or opinions.",
    "gaslighting": "Gaslighting involves making someone question their own reality, memory, or perceptions.",
    "guilt tripping": "Guilt-tripping uses guilt to manipulate someone's actions or decisions.",
    "insults": "Insults are derogatory or demeaning remarks meant to shame, belittle, or hurt someone.",
    "obscure language": "Obscure language manipulates through complexity, vagueness, or superiority to confuse the other person.",
    "projection": "Projection accuses someone else of the very behaviors or intentions the speaker is exhibiting.",
    "recovery phase": "Recovery phase statements attempt to soothe or reset tension without acknowledging harm or change.",
    "threat": "Threats use fear of harm (physical, emotional, or relational) to control or intimidate someone."
}

RISK_SNIPPETS = {
    "low": (
        "🟢 Risk Level: Low",
        "The language patterns here do not strongly indicate abuse.",
        "Continue to check in with yourself and notice how you feel in response to repeated patterns."
    ),
    "moderate": (
        "⚠️ Risk Level: Moderate to High",
        "This language includes control, guilt, or reversal tactics.",
        "These patterns often lead to emotional confusion and reduced self-trust. Document these messages or talk with someone safe."
    ),
    "high": (
        "🔴 Risk Level: High",
        "Language includes threats or coercive control, which are strong indicators of escalation.",
        "Consider creating a safety plan or contacting a support line. Trust your sense of unease."
    )
}

def generate_risk_snippet(abuse_score, top_label):
    if abuse_score >= 85:
        risk_level = "high"
    elif abuse_score >= 60:
        risk_level = "moderate"
    else:
        risk_level = "low"
    title, summary, advice = RISK_SNIPPETS[risk_level]
    return f"\n\n{title}\n{summary} (Pattern: {top_label})\n💡 {advice}"
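
# Illustrative example (based on the bands above): a composite score of 72.5 with top pattern
# "control" falls in the 60-85 range, so generate_risk_snippet(72.5, "control") returns the
# "moderate" snippet with "(Pattern: control)" appended to its summary line.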

# --- DARVO Detection ---

DARVO_PATTERNS = {
    "blame shifting", "projection", "dismissiveness", "guilt tripping", "contradictory statements"
}

DARVO_MOTIFS = [
    "i guess i'm the bad guy", "after everything i've done", "you always twist everything",
    "so now it's all my fault", "i'm the villain", "i'm always wrong", "you never listen",
    "you're attacking me", "i'm done trying", "i'm the only one who cares"
]

def detect_contradiction(message):
    contradiction_phrases = [
        (r"\b(i love you).{0,15}(i hate you|you ruin everything)", re.IGNORECASE),
        (r"\b(i'm sorry).{0,15}(but you|if you hadn't)", re.IGNORECASE),
        (r"\b(i'm trying).{0,15}(you never|why do you)", re.IGNORECASE),
        (r"\b(do what you want).{0,15}(you'll regret it|i always give everything)", re.IGNORECASE),
        (r"\b(i don't care).{0,15}(you never think of me)", re.IGNORECASE),
        (r"\b(i guess i'm just).{0,15}(the bad guy|worthless|never enough)", re.IGNORECASE),
    ]
    return any(re.search(pattern, message, flags) for pattern, flags in contradiction_phrases)
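
# Illustrative example: detect_contradiction("I'm sorry, but you always start it") returns True,
# because "i'm sorry" is followed within 15 characters by "but you" (second pattern above);
# a message with no such pairing returns False.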

def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    pattern_hits = len([p.lower() for p in patterns if p.lower() in DARVO_PATTERNS])
    pattern_score = pattern_hits / len(DARVO_PATTERNS)
    sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)
    motif_hits = len([m.lower() for m in motifs_found if m.lower() in DARVO_MOTIFS])
    motif_score = motif_hits / len(DARVO_MOTIFS)
    contradiction_score = 1.0 if contradiction_flag else 0.0
    darvo_score = (
        0.3 * pattern_score +
        0.3 * sentiment_shift_score +
        0.25 * motif_score +
        0.15 * contradiction_score
    )
    return round(min(darvo_score, 1.0), 3)
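
# Worked example of the weighting above: with patterns ["projection", "blame shifting"]
# (2 of 5 DARVO patterns -> 0.4), a sentiment shift from 0.0 to 0.5, no DARVO motifs, and a
# detected contradiction, the score is 0.3*0.4 + 0.3*0.5 + 0.25*0.0 + 0.15*1.0 = 0.42.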

def custom_sentiment(text):
    input_ids = sentiment_tokenizer(f"emotion: {text}", return_tensors="pt").input_ids
    with torch.no_grad():
        outputs = sentiment_model.generate(input_ids)
    emotion = sentiment_tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower()
    sentiment = EMOTION_TO_SENTIMENT.get(emotion, "undermining")
    return {"label": sentiment, "emotion": emotion}
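
# Note: the T5 model generates an emotion word as text (e.g. "anger"), which is lowercased and
# mapped through EMOTION_TO_SENTIMENT; any emotion not listed there defaults to "undermining".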

def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1.0):
    weighted_scores = [score * PATTERN_WEIGHTS.get(label, 1.0) for label, score in zip(LABELS, scores) if score > thresholds[label]]
    base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
    base_score *= flag_multiplier
    return min(base_score, 100.0)
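
# Worked example: if only "gaslighting" (score 0.5, weight 1.3) and "control" (score 0.6,
# weight 1.2) clear their thresholds, weighted_scores is [0.65, 0.72] and the base score is
# their mean * 100 = 68.5 before flag_multiplier. Note that motif_hits is currently unused.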

def analyze_single_message(text, thresholds, motif_flags):
    motif_hits, matched_phrases = detect_motifs(text)
    sentiment = custom_sentiment(text)
    sentiment_score = 0.5 if sentiment["label"] == "undermining" else 0.0
    print(f"Detected emotion: {sentiment['emotion']} -> sentiment: {sentiment['label']}")

    adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()} if sentiment["label"] == "undermining" else thresholds.copy()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
    threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
    phrase_labels = [label for label, _ in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))
    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
    top_patterns = sorted([(label, score) for label, score in zip(LABELS, scores)], key=lambda x: x[1], reverse=True)[:2]
    motif_phrases = [text for _, text in matched_phrases]
    contradiction_flag = detect_contradiction(text)
    darvo_score = calculate_darvo_score(pattern_labels_used, 0.0, sentiment_score, motif_phrases, contradiction_flag)
    return abuse_level, pattern_labels_used, top_patterns, darvo_score, sentiment
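
# The tuple returned above is (abuse_level, pattern_labels_used, top_patterns, darvo_score,
# sentiment); analyze_composite below indexes it positionally, e.g. result[4] is the sentiment
# dict and r[2] the two highest-scoring (label, score) pairs.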

def analyze_composite(msg1, msg2, msg3, flags):
    thresholds = THRESHOLDS.copy()
    messages = [msg1, msg2, msg3]
    active_messages = [m for m in messages if m.strip()]
    if not active_messages:
        return "Please enter at least one message."

    results = []
    sentiment_labels = []
    sentiment_score_total = 0.0
    for m in active_messages:
        result = analyze_single_message(m, thresholds, flags)
        print(f"Message: {m}")
        print(f"Sentiment result: {result[4]}")
        results.append(result)
        sentiment_labels.append(result[4]["label"])
        if result[4]["label"] == "undermining":
            sentiment_score_total += 0.5

    undermining_count = sentiment_labels.count("undermining")
    supportive_count = sentiment_labels.count("supportive")
    if undermining_count > supportive_count:
        thresholds = {k: v * 0.9 for k, v in thresholds.items()}
    elif undermining_count and supportive_count:
        thresholds = {k: v * 0.95 for k, v in thresholds.items()}
        print("⚖️ Detected conflicting sentiment across messages.")

    abuse_scores = [r[0] for r in results]
    darvo_scores = [r[3] for r in results]
    average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    base_score = sum(abuse_scores) / len(abuse_scores)
    label_sets = [[label for label, _ in r[2]] for r in results]
    label_counts = {label: sum(label in s for s in label_sets) for label in set().union(*label_sets)}
    top_label = max(label_counts.items(), key=lambda x: x[1])
    top_explanation = EXPLANATIONS.get(top_label[0], "")
    flag_weights = {
        "They've threatened harm": 6,
        "They isolate me": 5,
        "I've changed my behavior out of fear": 4,
        "They monitor/follow me": 4,
        "I feel unsafe when alone with them": 6
    }
    flag_boost = sum(flag_weights.get(f, 3) for f in flags) / len(active_messages)
    composite_score = min(base_score + flag_boost, 100)
    if len(active_messages) == 1:
        composite_score *= 0.85
    elif len(active_messages) == 2:
        composite_score *= 0.93
    composite_score = round(min(composite_score, 100), 2)

    result = f"These messages show a pattern of **{top_label[0]}** and are estimated to be {composite_score}% likely abusive."
    if top_explanation:
        result += f"\n• {top_explanation}"
    if average_darvo > 0.25:
        darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
        result += f"\n\nDARVO Score: {average_darvo} - This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
    result += generate_risk_snippet(composite_score, top_label[0])
    if undermining_count and supportive_count:
        result += "\n\n⚖️ These messages contain **conflicting emotional tones** - this may indicate mixed signals, ambivalence, or a push-pull dynamic. Use caution interpreting any one message alone."
    return result
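
# Scoring notes (from the logic above): each checked contextual flag adds its weight
# (default 3) divided by the number of messages, and the combined score is damped for
# short inputs (*0.85 for one message, *0.93 for two) before being capped at 100.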

textbox_inputs = [
    gr.Textbox(label="Message 1"),
    gr.Textbox(label="Message 2"),
    gr.Textbox(label="Message 3")
]

checkboxes = gr.CheckboxGroup(label="Contextual Flags", choices=[
    "They've threatened harm", "They isolate me", "I've changed my behavior out of fear",
    "They monitor/follow me", "I feel unsafe when alone with them"
])

iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + [checkboxes],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector (Multi-Message)",
    allow_flagging="manual"
)
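
# NOTE (editorial): allow_flagging="manual" is the older gr.Interface argument; recent Gradio
# releases may warn about it and expect flagging_mode instead.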

if __name__ == "__main__":
    iface.launch()