Update app.py
app.py (CHANGED)
@@ -5,50 +5,48 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# custom fine-tuned sentiment model
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")

# Load abuse pattern model
model_name = "SamanthaStorm/abuse-pattern-detector-v2"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)

LABELS = [
    "gaslighting", "mockery", "dismissiveness", "control", "guilt_tripping", "apology_baiting", "blame_shifting", "projection",
    "contradictory_statements", "manipulation", "deflection", "insults", "obscure_formal", "recovery_phase", "non_abusive",
    "suicidal_threat", "physical_threat", "extreme_control"
]

THRESHOLDS = {
    "gaslighting": 0.25, "mockery": 0.15, "dismissiveness": 0.30, "control": 0.43, "guilt_tripping": 0.19,
    "apology_baiting": 0.45, "blame_shifting": 0.23, "projection": 0.50, "contradictory_statements": 0.25,
    "manipulation": 0.25, "deflection": 0.30, "insults": 0.34, "obscure_formal": 0.25, "recovery_phase": 0.25,
    "non_abusive": 2.0, "suicidal_threat": 0.45, "physical_threat": 0.02, "extreme_control": 0.36
}

PATTERN_LABELS = LABELS[:15]
DANGER_LABELS = LABELS[15:18]

EXPLANATIONS = {
    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
    "insults": "Insults are derogatory remarks aimed at degrading someone.",
    "apology_baiting": "Apology-baiting manipulates victims into apologizing for abuser's behavior.",
    "deflection": "Deflection avoids accountability by redirecting blame.",
    "control": "Control restricts autonomy through manipulation or coercion.",
    "extreme_control": "Extreme control dominates decisions and behaviors entirely.",
    "physical_threat": "Physical threats signal risk of bodily harm.",
    "suicidal_threat": "Suicidal threats manipulate others using self-harm threats.",
    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someone’s actions.",
    "manipulation": "Manipulation deceives to influence or control outcomes.",
    "non_abusive": "Non-abusive language is respectful and free of coercion.",
    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority."
}

def custom_sentiment(text):

@@ -68,74 +66,78 @@ def calculate_abuse_level(scores, thresholds):
    return round(np.mean(triggered_scores) * 100, 2) if triggered_scores else 0.0

def interpret_abuse_level(score):
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    return "Very Low / Likely Safe"

def analyze_messages(input_text, risk_flags):
    input_text = input_text.strip()
    if not input_text:
        return "Please enter a message for analysis."

    sentiment = custom_sentiment(input_text)
    sentiment_label = sentiment['label']
    sentiment_score = sentiment['score']

    adjusted_thresholds = {k: v * 0.8 for k, v in THRESHOLDS.items()} if sentiment_label == "undermining" else THRESHOLDS.copy()

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

    pattern_count = sum(score > adjusted_thresholds[label] for label, score in zip(PATTERN_LABELS, scores[:15]))
    danger_flag_count = sum(score > adjusted_thresholds[label] for label, score in zip(DANGER_LABELS, scores[15:18]))

    contextual_flags = risk_flags if risk_flags else []
    if len(contextual_flags) >= 2:
        danger_flag_count += 1

    critical_flags = ["They've threatened harm", "They monitor/follow me", "I feel unsafe when alone with them"]
    high_risk_context = any(flag in contextual_flags for flag in critical_flags)

    non_abusive_score = scores[LABELS.index('non_abusive')]
    if non_abusive_score > adjusted_thresholds['non_abusive']:
        return "This message is classified as non-abusive."

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds)
    abuse_description = interpret_abuse_level(abuse_level)

    if danger_flag_count >= 2:
        resources = "Immediate assistance recommended. Please seek professional help or contact emergency services."
    else:
        resources = "For more information on abuse patterns, consider reaching out to support groups or professional counselors."

    scored_patterns = [
        (label, score) for label, score in zip(PATTERN_LABELS, scores[:15]) if label != "non_abusive"
    ]
    top_patterns = sorted(scored_patterns, key=lambda x: x[1], reverse=True)[:2]

    top_pattern_explanations = "\n".join([
        f"• {label.replace('_', ' ').title()}: {EXPLANATIONS.get(label, 'No explanation available.')}"
        for label, _ in top_patterns
    ])

    result = (
        f"Abuse Risk Score: {abuse_level}% – {abuse_description}\n\n"
        f"Most Likely Patterns:\n{top_pattern_explanations}\n\n"
        f"⚠️ Critical Danger Flags Detected: {danger_flag_count} of 3\n"
        "Resources: " + resources + "\n\n"
        f"Sentiment: {sentiment_label.title()} (Confidence: {sentiment_score*100:.2f}%)"
    )

    if contextual_flags:
        result += "\n\n⚠️ You indicated the following:\n" + "\n".join([f"• {flag}" for flag in contextual_flags])
    if high_risk_context:
        result += "\n\n🚨 These responses suggest a high-risk situation. Consider seeking immediate help or safety planning resources."

    return result

iface = gr.Interface(
    fn=analyze_messages,

@@ -148,8 +150,8 @@ iface = gr.Interface(
    ],
    outputs=[gr.Textbox(label="Analysis Result")],
    title="Abuse Pattern Detector",
    live=True
)

if __name__ == "__main__":
    iface.queue().launch()
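Because launch is now guarded by the if __name__ == "__main__": block, app.py can be imported without starting the Gradio server, which allows a quick local smoke test of analyze_messages. The sketch below is hypothetical, not part of this commit: it assumes the Space's dependencies (gradio, torch, transformers, numpy) are installed and that the elided imports at the top of app.py provide torch, numpy as np, and gradio as gr; the sample message is made up, and the risk flag string is one of the critical_flags values defined in the diff.

    # hypothetical smoke test, run from the Space's directory
    # (importing app downloads both models from the Hub, so network access is needed)
    from app import analyze_messages

    sample_text = "You always twist my words. None of this is my fault."
    sample_flags = ["They monitor/follow me"]  # matches an entry in critical_flags

    # prints the same report string the Gradio textbox would display
    print(analyze_messages(sample_text, sample_flags))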