import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs
from abuse_type_mapping import determine_abuse_type
# Load custom fine-tuned sentiment model
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")
# Load abuse pattern model
model_name = "SamanthaStorm/autotrain-c1un8-p8vzo"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)
LABELS = [
    "gaslighting", "mockery", "dismissiveness", "control", "guilt_tripping", "apology_baiting", "blame_shifting", "projection",
    "contradictory_statements", "manipulation", "deflection", "insults", "obscure_formal", "recovery_phase", "non_abusive",
    "suicidal_threat", "physical_threat", "extreme_control"
]
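# Per-label decision thresholds applied to the sigmoid scores; "non_abusive" is set above 1.0 so it can never fire on its own.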
THRESHOLDS = {
    "gaslighting": 0.25, "mockery": 0.15, "dismissiveness": 0.45, "control": 0.43, "guilt_tripping": 0.15,
    "apology_baiting": 0.2, "blame_shifting": 0.23, "projection": 0.50, "contradictory_statements": 0.25,
    "manipulation": 0.25, "deflection": 0.30, "insults": 0.34, "obscure_formal": 0.25, "recovery_phase": 0.25,
    "non_abusive": 2.0, "suicidal_threat": 0.45, "physical_threat": 0.02, "extreme_control": 0.30
}
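# The first 15 labels are abuse patterns; the last 3 are high-danger indicators.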
PATTERN_LABELS = LABELS[:15]
DANGER_LABELS = LABELS[15:18]
EXPLANATIONS = {
    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
    "insults": "Insults are derogatory remarks aimed at degrading someone.",
    "apology_baiting": "Apology-baiting manipulates victims into apologizing for the abuser's behavior.",
    "deflection": "Deflection avoids accountability by redirecting blame.",
    "control": "Control restricts autonomy through manipulation or coercion.",
    "extreme_control": "Extreme control dominates decisions and behaviors entirely.",
    "physical_threat": "Physical threats signal risk of bodily harm.",
    "suicidal_threat": "Suicidal threats manipulate others using self-harm threats.",
    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someone’s actions.",
    "manipulation": "Manipulation deceives to influence or control outcomes.",
    "non_abusive": "Non-abusive language is respectful and free of coercion.",
    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority."
}
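# Multipliers applied to label scores when computing the overall abuse level; labels not listed default to 1.0.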
PATTERN_WEIGHTS = {
    "physical_threat": 1.5,
    "suicidal_threat": 1.4,
    "extreme_control": 1.5,
    "gaslighting": 1.3,
    "control": 1.2,
    "dismissiveness": 0.8,
    "non_abusive": 0.0
}
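# Classify a message as "supportive" or "undermining" using the fine-tuned sentiment model.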
def custom_sentiment(text):
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    label_idx = torch.argmax(probs).item()
    label_map = {0: "supportive", 1: "undermining"}
    label = label_map[label_idx]
    score = probs[0][label_idx].item()
    return {"label": label, "score": score}
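# Average the weighted scores of labels that clear their thresholds (as a percentage); danger motif hits floor the score at 75.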
def calculate_abuse_level(scores, thresholds, motif_hits=None):
    weighted_scores = []
    for label, score in zip(LABELS, scores):
        if score > thresholds[label]:
            weight = PATTERN_WEIGHTS.get(label, 1.0)
            weighted_scores.append(score * weight)
    base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
    motif_hits = motif_hits or []
    if any(label in motif_hits for label in DANGER_LABELS):
        base_score = max(base_score, 75.0)
    return base_score
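# Map a numeric abuse level to a human-readable risk description.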
def interpret_abuse_level(score):
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    return "Very Low / Likely Safe"
def analyze_single_message(text, contextual_flags):
    motif_flags, matched_phrases = detect_motifs(text)
    risk_flags = list(set(contextual_flags + motif_flags)) if contextual_flags else motif_flags
    sentiment_result = custom_sentiment(text)
    sentiment_label = sentiment_result["label"]
    sentiment_score = sentiment_result["score"]
    # Undermining sentiment lowers every threshold by 20% so patterns are easier to flag
    thresholds = {k: v * 0.8 for k, v in THRESHOLDS.items()} if sentiment_label == "undermining" else THRESHOLDS.copy()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
    threshold_labels = [label for label, score in zip(PATTERN_LABELS, scores[:15]) if score > thresholds[label]]
    phrase_labels = [label for label, _ in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))
    abuse_level = calculate_abuse_level(scores, thresholds, motif_hits=[label for label, _ in matched_phrases])
    abuse_description = interpret_abuse_level(abuse_level)
    return {
        "text": text,
        "score": abuse_level,
        "summary": abuse_description,
        "sentiment": f"{sentiment_label} ({sentiment_score*100:.2f}%)",
        "top_labels": pattern_labels_used[:2],
        "matched_phrases": matched_phrases,
        "flags": risk_flags  # user-selected contextual flags combined with detected motif flags
    }
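# Analyze up to three messages and build per-message summaries plus a composite (mean) abuse score.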
def analyze_composite(msg1, msg2, msg3, flags):
    results = [analyze_single_message(t, flags) if t.strip() else None for t in [msg1, msg2, msg3]]
    scored = [r['score'] for r in results if r is not None]
    composite_score = round(np.mean(scored), 2) if scored else 0.0
    # Always return one string per message box so Gradio receives exactly four output values
    outputs = [
        f"Score: {r['score']}% – {r['summary']}\nSentiment: {r['sentiment']}\nFlags: {', '.join(r['flags']) if r['flags'] else 'None'}\nLabels: {', '.join(r['top_labels'])}" if r else "No message provided."
        for r in results
    ]
    return outputs + [f"Composite Abuse Score: {composite_score}%"]
iface = gr.Interface(
    fn=analyze_composite,
    inputs=[
        gr.Textbox(label="Message 1"),
        gr.Textbox(label="Message 2"),
        gr.Textbox(label="Message 3"),
        gr.CheckboxGroup(label="Contextual Flags", choices=[
            "They've threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
            "They monitor/follow me", "I feel unsafe when alone with them"
        ])
    ],
    outputs=[
        gr.Textbox(label="Message 1 Result"),
        gr.Textbox(label="Message 2 Result"),
        gr.Textbox(label="Message 3 Result"),
        gr.Textbox(label="Composite Score")
    ],
    title="Abuse Pattern Detector (Multi-Message)",
    flagging_mode="manual"
)
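# Example of calling the analysis directly (bypassing the UI), e.g. for a quick smoke test; the message text is illustrative:
#   analyze_composite("You never listen to me.", "", "", ["They isolate me"])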
if __name__ == "__main__":
    iface.queue().launch()