Update app.py
app.py CHANGED
@@ -5,50 +5,48 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from transformers import RobertaForSequenceClassification, RobertaTokenizer
 
 # custom fine-tuned sentiment model
 sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
 sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")
 
 # Load abuse pattern model
 model_name = "SamanthaStorm/abuse-pattern-detector-v2"
 model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
 tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
 LABELS = [
+    "gaslighting", "mockery", "dismissiveness", "control", "guilt_tripping", "apology_baiting", "blame_shifting", "projection",
+    "contradictory_statements", "manipulation", "deflection", "insults", "obscure_formal", "recovery_phase", "non_abusive",
+    "suicidal_threat", "physical_threat", "extreme_control"
 ]
 
 THRESHOLDS = {
+    "gaslighting": 0.25, "mockery": 0.15, "dismissiveness": 0.30, "control": 0.43, "guilt_tripping": 0.19,
+    "apology_baiting": 0.45, "blame_shifting": 0.23, "projection": 0.50, "contradictory_statements": 0.25,
+    "manipulation": 0.25, "deflection": 0.30, "insults": 0.34, "obscure_formal": 0.25, "recovery_phase": 0.25,
+    "non_abusive": 2.0, "suicidal_threat": 0.45, "physical_threat": 0.02, "extreme_control": 0.36
 }
 
 PATTERN_LABELS = LABELS[:15]
 DANGER_LABELS = LABELS[15:18]
 
 EXPLANATIONS = {
+    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
+    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
+    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
+    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
+    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
+    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
+    "insults": "Insults are derogatory remarks aimed at degrading someone.",
+    "apology_baiting": "Apology-baiting manipulates victims into apologizing for abuser's behavior.",
+    "deflection": "Deflection avoids accountability by redirecting blame.",
+    "control": "Control restricts autonomy through manipulation or coercion.",
+    "extreme_control": "Extreme control dominates decisions and behaviors entirely.",
+    "physical_threat": "Physical threats signal risk of bodily harm.",
+    "suicidal_threat": "Suicidal threats manipulate others using self-harm threats.",
+    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someone’s actions.",
+    "manipulation": "Manipulation deceives to influence or control outcomes.",
+    "non_abusive": "Non-abusive language is respectful and free of coercion.",
+    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority."
 }
 
 def custom_sentiment(text):
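
Note: the hunk above stops at the definition line of custom_sentiment, so its body is not part of this diff. A minimal sketch of what such a helper might look like, assuming it classifies the text with the tether-sentiment model loaded above and returns a dict with 'label' and 'score' keys (the shape analyze_messages expects below), with the label names taken from that model's config and "undermining" being the one checked later:

import torch

def custom_sentiment(text):
    # Hypothetical sketch; not part of this commit.
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1).squeeze(0)
    idx = int(torch.argmax(probs))
    label = sentiment_model.config.id2label[idx]  # e.g. "undermining" vs. a non-undermining class
    return {"label": label, "score": float(probs[idx])}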

@@ -68,74 +66,78 @@ def calculate_abuse_level(scores, thresholds):
     return round(np.mean(triggered_scores) * 100, 2) if triggered_scores else 0.0
 
 def interpret_abuse_level(score):
+    if score > 80:
+        return "Extreme / High Risk"
+    elif score > 60:
+        return "Severe / Harmful Pattern Present"
+    elif score > 40:
+        return "Likely Abuse"
+    elif score > 20:
+        return "Mild Concern"
     return "Very Low / Likely Safe"
 
 def analyze_messages(input_text, risk_flags):
     input_text = input_text.strip()
+    if not input_text:
         return "Please enter a message for analysis."
 
+    sentiment = custom_sentiment(input_text)
+    sentiment_label = sentiment['label']
+    sentiment_score = sentiment['score']
 
+    adjusted_thresholds = {k: v * 0.8 for k, v in THRESHOLDS.items()} if sentiment_label == "undermining" else THRESHOLDS.copy()
 
+    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
 
+    pattern_count = sum(score > adjusted_thresholds[label] for label, score in zip(PATTERN_LABELS, scores[:15]))
+    danger_flag_count = sum(score > adjusted_thresholds[label] for label, score in zip(DANGER_LABELS, scores[15:18]))
 
+    contextual_flags = risk_flags if risk_flags else []
+    if len(contextual_flags) >= 2:
+        danger_flag_count += 1
 
+    critical_flags = ["They've threatened harm", "They monitor/follow me", "I feel unsafe when alone with them"]
+    high_risk_context = any(flag in contextual_flags for flag in critical_flags)
 
+    non_abusive_score = scores[LABELS.index('non_abusive')]
+    if non_abusive_score > adjusted_thresholds['non_abusive']:
+        return "This message is classified as non-abusive."
 
+    abuse_level = calculate_abuse_level(scores, adjusted_thresholds)
+    abuse_description = interpret_abuse_level(abuse_level)
 
+    if danger_flag_count >= 2:
+        resources = "Immediate assistance recommended. Please seek professional help or contact emergency services."
+    else:
+        resources = "For more information on abuse patterns, consider reaching out to support groups or professional counselors."
 
+    scored_patterns = [
+        (label, score) for label, score in zip(PATTERN_LABELS, scores[:15]) if label != "non_abusive"
+    ]
+    top_patterns = sorted(scored_patterns, key=lambda x: x[1], reverse=True)[:2]
+
+    top_pattern_explanations = "\n".join([
+        f"• {label.replace('_', ' ').title()}: {EXPLANATIONS.get(label, 'No explanation available.')}"
+        for label, _ in top_patterns
+    ])
+
+    result = (
+        f"Abuse Risk Score: {abuse_level}% – {abuse_description}\n\n"
+        f"Most Likely Patterns:\n{top_pattern_explanations}\n\n"
+        f"⚠️ Critical Danger Flags Detected: {danger_flag_count} of 3\n"
+        "Resources: " + resources + "\n\n"
+        f"Sentiment: {sentiment_label.title()} (Confidence: {sentiment_score*100:.2f}%)"
+    )
 
+    if contextual_flags:
+        result += "\n\n⚠️ You indicated the following:\n" + "\n".join([f"• {flag}" for flag in contextual_flags])
+    if high_risk_context:
+        result += "\n\n🚨 These responses suggest a high-risk situation. Consider seeking immediate help or safety planning resources."
 
+    return result
 
 iface = gr.Interface(
     fn=analyze_messages,

@@ -148,8 +150,8 @@ iface = gr.Interface(
     ],
     outputs=[gr.Textbox(label="Analysis Result")],
     title="Abuse Pattern Detector",
+    live=True
 )
 
+if __name__ == "__main__":
     iface.queue().launch()
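
Note: the body of calculate_abuse_level also sits outside the changed hunks; only its final return (new line 66) appears as context. A sketch consistent with that line, assuming triggered_scores gathers the per-label scores that clear their thresholds (whether non_abusive is excluded there is a guess, not something this diff shows):

import numpy as np

def calculate_abuse_level(scores, thresholds):
    # Hypothetical reconstruction; only the return statement below is confirmed by the diff.
    triggered_scores = [
        score for label, score in zip(LABELS, scores)
        if label != "non_abusive" and score > thresholds[label]
    ]
    return round(np.mean(triggered_scores) * 100, 2) if triggered_scores else 0.0

With the pieces above in place, the handler can be smoke-tested locally before relaunching the Space, e.g. print(analyze_messages("This is all your fault, you always ruin everything.", ["They monitor/follow me"])).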