import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs
from abuse_type_mapping import determine_abuse_type
# Load custom fine-tuned sentiment model
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")
# Load abuse pattern model
model_name = "SamanthaStorm/abuse-pattern-detector-v2"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)
LABELS = [
    "gaslighting", "mockery", "dismissiveness", "control", "guilt_tripping", "apology_baiting", "blame_shifting", "projection",
    "contradictory_statements", "manipulation", "deflection", "insults", "obscure_formal", "recovery_phase", "non_abusive",
    "suicidal_threat", "physical_threat", "extreme_control"
]
THRESHOLDS = {
    "gaslighting": 0.25, "mockery": 0.15, "dismissiveness": 0.30, "control": 0.43, "guilt_tripping": 0.19,
    "apology_baiting": 0.45, "blame_shifting": 0.23, "projection": 0.50, "contradictory_statements": 0.25,
    "manipulation": 0.25, "deflection": 0.30, "insults": 0.34, "obscure_formal": 0.25, "recovery_phase": 0.25,
    "non_abusive": 2.0, "suicidal_threat": 0.45, "physical_threat": 0.02, "extreme_control": 0.36
}
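# The first 15 entries in LABELS are behavioral pattern labels; the last 3 are
# acute danger labels. The slices below depend on that ordering.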
PATTERN_LABELS = LABELS[:15]
DANGER_LABELS = LABELS[15:18]
EXPLANATIONS = {
    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
    "insults": "Insults are derogatory remarks aimed at degrading someone.",
    "apology_baiting": "Apology-baiting manipulates victims into apologizing for the abuser's behavior.",
    "deflection": "Deflection avoids accountability by redirecting blame.",
    "control": "Control restricts autonomy through manipulation or coercion.",
    "extreme_control": "Extreme control dominates decisions and behaviors entirely.",
    "physical_threat": "Physical threats signal risk of bodily harm.",
    "suicidal_threat": "Suicidal threats manipulate others using self-harm threats.",
    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someone’s actions.",
    "manipulation": "Manipulation deceives to influence or control outcomes.",
    "non_abusive": "Non-abusive language is respectful and free of coercion.",
    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority."
}

def custom_sentiment(text):
    """Classify a message as supportive or undermining with the fine-tuned sentiment model."""
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    label_idx = torch.argmax(probs).item()
    label_map = {0: "supportive", 1: "undermining"}
    label = label_map[label_idx]
    score = probs[0][label_idx].item()
    return {"label": label, "score": score}
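
# Illustrative usage sketch (hypothetical inputs; actual scores depend on the
# SamanthaStorm/tether-sentiment weights):
#   custom_sentiment("I'm here for you, take your time")
#   # -> {"label": "supportive", "score": <softmax probability>}
#   custom_sentiment("You always ruin everything")
#   # -> {"label": "undermining", "score": <softmax probability>}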

def calculate_abuse_level(scores, thresholds, motif_hits=None):
    """Average the label scores that exceed their thresholds; floor the result at 75% for critical motif hits."""
    triggered_scores = [
        score for label, score in zip(LABELS, scores) if score > thresholds[label]
    ]
    base_score = round(np.mean(triggered_scores) * 100, 2) if triggered_scores else 0.0
    motif_hits = motif_hits or []
    if any(label in motif_hits for label in {"physical_threat", "suicidal_threat", "extreme_control"}):
        base_score = max(base_score, 75.0)
    return base_score
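
# Worked example with hypothetical scores (not model output): if only two labels
# clear their thresholds, at 0.40 and 0.60, the base score is
# round(np.mean([0.40, 0.60]) * 100, 2) == 50.0; a matched "physical_threat"
# motif then raises it to max(50.0, 75.0) == 75.0.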

def interpret_abuse_level(score):
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    return "Very Low / Likely Safe"
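
# For example, interpret_abuse_level(75.0) falls in the 60-80 band and returns
# "Severe / Harmful Pattern Present".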

def analyze_messages(input_text, risk_flags):
    input_text = input_text.strip()
    if not input_text:
        return "Please enter a message for analysis."

    # Lexical motif matches plus user-selected contextual risk flags
    motif_flags, matched_phrases = detect_motifs(input_text)
    risk_flags = list(set(risk_flags + motif_flags)) if risk_flags else motif_flags

    # Sentiment pass: an undermining tone lowers the pattern thresholds by 20%
    sentiment = custom_sentiment(input_text)
    sentiment_label = sentiment['label']
    sentiment_score = sentiment['score']
    adjusted_thresholds = {k: v * 0.8 for k, v in THRESHOLDS.items()} if sentiment_label == "undermining" else THRESHOLDS.copy()

    # Multi-label abuse pattern scores from the RoBERTa model
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

    pattern_labels_used = list(set(
        [label for label, score in zip(PATTERN_LABELS, scores[:15]) if score > adjusted_thresholds[label]] +
        [label for label, _ in matched_phrases]
    ))

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits=[label for label, _ in matched_phrases])
    abuse_description = interpret_abuse_level(abuse_level)
    abuse_type, abuser_profile, advice = determine_abuse_type(pattern_labels_used)

    # Danger assessment: model danger labels plus contextual checkbox flags
    danger_flag_count = sum(score > adjusted_thresholds[label] for label, score in zip(DANGER_LABELS, scores[15:18]))
    contextual_flags = risk_flags if risk_flags else []
    if len(contextual_flags) >= 2:
        danger_flag_count += 1

    critical_flags = ["They've threatened harm", "They monitor/follow me", "I feel unsafe when alone with them"]
    high_risk_context = any(flag in contextual_flags for flag in critical_flags)

    non_abusive_score = scores[LABELS.index('non_abusive')]
    non_abusive_confident = non_abusive_score > adjusted_thresholds['non_abusive']
    if non_abusive_confident and danger_flag_count == 0 and not matched_phrases:
        return "This message is classified as non-abusive."

    scored_patterns = [
        (label, score) for label, score in zip(PATTERN_LABELS, scores[:15]) if label != "non_abusive"
    ]

    # High-risk motif matches override the model's top-2 pattern ranking
    override_labels = {"physical_threat", "suicidal_threat", "extreme_control"}
    override_matches = [label for label, _ in matched_phrases if label in override_labels]
    if override_matches:
        top_patterns = [(label, 1.0) for label in override_matches]
    else:
        top_patterns = sorted(scored_patterns, key=lambda x: x[1], reverse=True)[:2]

    top_pattern_explanations = "\n".join([
        f"• {label.replace('_', ' ').title()}: {EXPLANATIONS.get(label, 'No explanation available.')}"
        for label, _ in top_patterns
    ])

    if danger_flag_count >= 2:
        resources = "Immediate assistance recommended. Please seek professional help or contact emergency services."
    else:
        resources = "For more information on abuse patterns, consider reaching out to support groups or professional counselors."

    # Build the user-facing report
    result = f"Abuse Risk Score: {abuse_level}% – {abuse_description}\n\n"
    if abuse_level >= 15:
        result += f"Most Likely Patterns:\n{top_pattern_explanations}\n\n"
    result += f"⚠️ Critical Danger Flags Detected: {danger_flag_count} of 3\n"
    result += f"Resources: {resources}\n"
    result += f"🧠 Sentiment: {sentiment_label.title()} (Confidence: {sentiment_score*100:.2f}%)\n"

    if contextual_flags:
        result += "\n\n⚠️ You indicated the following:\n" + "\n".join([f"• {flag.replace('_', ' ').title()}" for flag in contextual_flags])
    if high_risk_context:
        result += "\n\n🚨 These responses suggest a high-risk situation. Consider seeking immediate help or safety planning resources."
    if matched_phrases:
        result += "\n\n🚨 Detected High-Risk Phrases:\n"
        for label, phrase in matched_phrases:
            phrase_clean = phrase.replace('"', "'").strip()
            result += f"• {label.replace('_', ' ').title()}: “{phrase_clean}”\n"
    if abuse_type:
        result += f"\n\n🧠 Likely Abuse Type: {abuse_type}"
        result += f"\n🧠 Abuser Profile: {abuser_profile}"
        result += f"\n📘 Safety Tip: {advice}"
    return result

iface = gr.Interface(
    fn=analyze_messages,
    inputs=[
        gr.Textbox(lines=10, placeholder="Enter message here..."),
        gr.CheckboxGroup(label="Do any of these apply to your situation?", choices=[
            "They've threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
            "They monitor/follow me", "I feel unsafe when alone with them"
        ])
    ],
    outputs=[gr.Textbox(label="Analysis Result")],
    title="Abuse Pattern Detector",
    live=True
)

if __name__ == "__main__":
    iface.queue().launch()
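
# Usage note: `python app.py` launches the Gradio interface. For a quick check
# without the UI, the core function can be called directly, for example
#   print(analyze_messages("You never listen and this is all your fault.", []))
# which returns the formatted report string instead of rendering it in the app.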