Tether / app.py
SamanthaStorm's picture
Update app.py
ff45556 verified
raw
history blame
10.4 kB
import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs
from abuse_type_mapping import determine_abuse_type
# --- Model loading ----------------------------------------------------------
# Both checkpoints are loaded once, when this module is imported.

# Fine-tuned sentiment classifier used by custom_sentiment().
_SENTIMENT_REPO = "SamanthaStorm/tether-sentiment"
sentiment_model = AutoModelForSequenceClassification.from_pretrained(_SENTIMENT_REPO)
sentiment_tokenizer = AutoTokenizer.from_pretrained(_SENTIMENT_REPO)

# Multi-label abuse-pattern classifier used by analyze_messages().
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally; confirm it is actually required for this RoBERTa checkpoint.
model_name = "SamanthaStorm/abuse-pattern-detector-v2"
model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    trust_remote_code=True,
)
tokenizer = RobertaTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Label names, paired positionally with the abuse-pattern model's per-label
# scores (see the zip(LABELS, scores) calls further down).
LABELS = [
    # Behavioural pattern labels (indices 0-14).
    "gaslighting",
    "mockery",
    "dismissiveness",
    "control",
    "guilt_tripping",
    "apology_baiting",
    "blame_shifting",
    "projection",
    "contradictory_statements",
    "manipulation",
    "deflection",
    "insults",
    "obscure_formal",
    "recovery_phase",
    "non_abusive",
    # Danger labels (indices 15-17).
    "suicidal_threat",
    "physical_threat",
    "extreme_control",
]

# Minimum score a label must exceed to count as detected.
THRESHOLDS = {
    "gaslighting": 0.25,
    "mockery": 0.15,
    "dismissiveness": 0.45,
    "control": 0.43,
    "guilt_tripping": 0.15,
    "apology_baiting": 0.2,
    "blame_shifting": 0.23,
    "projection": 0.50,
    "contradictory_statements": 0.25,
    "manipulation": 0.25,
    "deflection": 0.30,
    "insults": 0.34,
    "obscure_formal": 0.25,
    "recovery_phase": 0.25,
    # 2.0 is above anything a sigmoid can produce, so this label can never
    # clear its threshold.
    # NOTE(review): confirm this is intentional; it makes the "non-abusive"
    # early returns in analyze_messages unreachable.
    "non_abusive": 2.0,
    "suicidal_threat": 0.45,
    "physical_threat": 0.02,
    "extreme_control": 0.30,
}

# First fifteen labels are behavioural patterns; the remaining three are the
# danger labels counted separately in the report.
PATTERN_LABELS = LABELS[:15]
DANGER_LABELS = LABELS[15:]
# One-sentence, user-facing description for each label. Consumers look entries
# up with .get(), so a label without an entry (currently
# "contradictory_statements") falls back to a generic message.
EXPLANATIONS = {
    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
    "insults": "Insults are derogatory remarks aimed at degrading someone.",
    "apology_baiting": "Apology-baiting manipulates victims into apologizing for abuser's behavior.",
    "deflection": "Deflection avoids accountability by redirecting blame.",
    "control": "Control restricts autonomy through manipulation or coercion.",
    "extreme_control": "Extreme control dominates decisions and behaviors entirely.",
    "physical_threat": "Physical threats signal risk of bodily harm.",
    "suicidal_threat": "Suicidal threats manipulate others using self-harm threats.",
    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someone’s actions.",
    "manipulation": "Manipulation deceives to influence or control outcomes.",
    "non_abusive": "Non-abusive language is respectful and free of coercion.",
    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority.",
}
def custom_sentiment(text):
    """Classify *text* with the fine-tuned sentiment model.

    Returns a dict with two keys: ``"label"`` (``"supportive"`` for class
    index 0, ``"undermining"`` for class index 1) and ``"score"`` (the
    softmax probability of that class, as a Python float).
    """
    encoded = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = sentiment_model(**encoded).logits

    probabilities = torch.nn.functional.softmax(logits, dim=1)
    # argmax over the flattened tensor; callers pass a single text, so this is
    # the index of the winning class in row 0.
    winner = torch.argmax(probabilities).item()

    names = {0: "supportive", 1: "undermining"}
    return {"label": names[winner], "score": probabilities[0][winner].item()}
# Per-label multipliers applied by calculate_abuse_level() to scores that
# clear their threshold. Labels not listed here use a weight of 1.0.
PATTERN_WEIGHTS = {
    # Weighted up.
    "physical_threat": 1.5,
    "suicidal_threat": 1.4,
    "extreme_control": 1.5,
    "gaslighting": 1.3,
    "control": 1.2,
    # Weighted down.
    "dismissiveness": 0.8,
    # Zero weight: shouldn't contribute to abuse score.
    "non_abusive": 0.0,
}
def calculate_abuse_level(scores, thresholds, motif_hits=None):
    """Return an abuse-risk percentage in the range 0-100.

    Args:
        scores: Per-label scores, paired positionally with ``LABELS``.
        thresholds: Mapping of label name to the score it must exceed to be
            counted.
        motif_hits: Optional collection of label names found by phrase
            matching. If it contains ``"physical_threat"``,
            ``"suicidal_threat"`` or ``"extreme_control"``, the result is
            raised to at least 75.0.

    Returns:
        The mean of the weighted scores that exceed their threshold, as a
        percentage rounded to two decimals and capped at 100.0; 0.0 when no
        label exceeds its threshold (before the motif floor is applied).
    """
    weighted_scores = [
        score * PATTERN_WEIGHTS.get(label, 1.0)
        for label, score in zip(LABELS, scores)
        if score > thresholds[label]
    ]
    # NOTE(review): a label with weight 0.0 that clears its threshold is still
    # averaged in as a zero and therefore lowers the mean; confirm whether
    # such labels should be skipped instead.

    if weighted_scores:
        # Weights above 1.0 can push the weighted mean past 1.0, which would
        # be reported as more than 100%; cap it so the value stays a valid
        # percentage. float() also turns the NumPy scalar into a plain float.
        base_score = min(round(float(np.mean(weighted_scores)) * 100, 2), 100.0)
    else:
        base_score = 0.0

    motif_hits = motif_hits or []
    critical_labels = ("physical_threat", "suicidal_threat", "extreme_control")
    if any(label in motif_hits for label in critical_labels):
        # An explicit high-risk phrase sets a floor regardless of model scores.
        base_score = max(base_score, 75.0)
    return base_score
def interpret_abuse_level(score):
    """Translate a numeric abuse score into its severity description.

    Each band is exclusive at its lower bound: a score must be strictly
    greater than the bound to fall into that band. Anything at or below 20
    is reported as "Very Low / Likely Safe".
    """
    bands = (
        (80, "Extreme / High Risk"),
        (60, "Severe / Harmful Pattern Present"),
        (40, "Likely Abuse"),
        (20, "Mild Concern"),
    )
    # Bands are listed from highest to lowest, so the first match wins.
    for lower_bound, description in bands:
        if score > lower_bound:
            return description
    return "Very Low / Likely Safe"
def _unique_in_order(items):
    """Return *items* as a list with duplicates removed, keeping first-seen order."""
    return list(dict.fromkeys(items))


def _display_flag(flag):
    """Return a human-readable form of a context or motif flag."""
    # Flags that already contain spaces are full sentences (the checkbox
    # choices); show them verbatim because str.title() treats apostrophes as
    # word boundaries and would render "They've" as "They'Ve".
    if " " in flag:
        return flag
    return flag.replace("_", " ").title()


def analyze_messages(input_text, risk_flags):
    """Analyze a message for abuse patterns and build the text report.

    Args:
        input_text: The message to analyze. ``None`` or whitespace-only input
            yields a prompt asking for a message.
        risk_flags: Context statements the user ticked in the UI (may be
            empty or ``None``). They are merged with the flags returned by
            ``detect_motifs``.

    Returns:
        A multi-line report string, or a short message when the input is
        empty or the text is classified as non-abusive.
    """
    input_text = (input_text or "").strip()
    if not input_text:
        return "Please enter a message for analysis."

    # detect_motifs returns (flags, [(label, phrase), ...]).
    motif_flags, matched_phrases = detect_motifs(input_text)
    # De-duplicate while keeping order so the report is deterministic
    # (a set would reorder the flags from run to run).
    contextual_flags = _unique_in_order([*(risk_flags or []), *(motif_flags or [])])

    sentiment = custom_sentiment(input_text)
    sentiment_label = sentiment["label"]
    sentiment_score = sentiment["score"]

    # An "undermining" tone lowers every threshold by 20%.
    if sentiment_label == "undermining":
        adjusted_thresholds = {label: value * 0.8 for label, value in THRESHOLDS.items()}
    else:
        adjusted_thresholds = THRESHOLDS.copy()

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Independent per-label probabilities (multi-label head).
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

    phrase_labels = [label for label, _ in matched_phrases]
    threshold_labels = [
        label
        for label, score in zip(PATTERN_LABELS, scores[:15])
        if score > adjusted_thresholds[label]
    ]
    pattern_labels_used = _unique_in_order(threshold_labels + phrase_labels)

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits=phrase_labels)
    # Any context flag raises the score to at least 15, which is the cut-off
    # for showing the "Most Likely Patterns" section below.
    # NOTE(review): 15 still maps to "Very Low / Likely Safe"; "Mild Concern"
    # only starts above 20. Raise the floor if that label was the intent.
    if contextual_flags and abuse_level < 15:
        abuse_level = 15
    # Derive the description after the floor so the two never disagree.
    abuse_description = interpret_abuse_level(abuse_level)

    abuse_type, abuser_profile, advice = determine_abuse_type(pattern_labels_used)

    danger_flag_count = sum(
        1
        for label, score in zip(DANGER_LABELS, scores[15:18])
        if score > adjusted_thresholds[label]
    )
    # Must match the corresponding checkbox choices in the UI exactly.
    critical_flags = ["They've threatened harm", "They monitor/follow me", "I feel unsafe when alone with them"]
    high_risk_context = any(flag in contextual_flags for flag in critical_flags)

    non_abusive_score = scores[LABELS.index("non_abusive")]
    non_abusive_confident = non_abusive_score > adjusted_thresholds["non_abusive"]
    # NOTE(review): sigmoid scores never exceed 1.0, so this branch can only
    # fire if the adjusted non_abusive threshold is below 1.0; check THRESHOLDS.
    if non_abusive_confident and danger_flag_count == 0 and not matched_phrases:
        # The more specific "supportive" message has to be tested first;
        # otherwise the generic return below makes it unreachable.
        if sentiment_label == "supportive" and sentiment_score > 0.95:
            return "This message is classified as non-abusive. It appears emotionally supportive and safe."
        return "This message is classified as non-abusive."

    # Phrase matches for the most dangerous labels take precedence over the
    # model's ranking; otherwise show the two highest-scoring patterns.
    override_labels = {"physical_threat", "suicidal_threat", "extreme_control"}
    override_matches = _unique_in_order(
        label for label, _ in matched_phrases if label in override_labels
    )
    if override_matches:
        top_patterns = [(label, 1.0) for label in override_matches]
    else:
        scored_patterns = [
            (label, score)
            for label, score in zip(PATTERN_LABELS, scores[:15])
            if label != "non_abusive"
        ]
        top_patterns = sorted(scored_patterns, key=lambda pair: pair[1], reverse=True)[:2]

    top_pattern_explanations = "\n".join(
        f"• {label.replace('_', ' ').title()}: {EXPLANATIONS.get(label, 'No explanation available.')}"
        for label, _ in top_patterns
    )

    if danger_flag_count >= 2:
        resources = "Immediate assistance recommended. Please seek professional help or contact emergency services."
    else:
        resources = "For more information on abuse patterns, consider reaching out to support groups or professional counselors."

    parts = [f"Abuse Risk Score: {abuse_level}% – {abuse_description}\n\n"]
    if abuse_level >= 15:
        parts.append(f"Most Likely Patterns:\n{top_pattern_explanations}\n\n")
    parts.append(f"⚠️ Critical Danger Flags Detected: {danger_flag_count} of 3\n")
    parts.append(f"Resources: {resources}\n")
    parts.append(f"🧠 Sentiment: {sentiment_label.title()} (Confidence: {sentiment_score*100:.2f}%)\n")

    if contextual_flags:
        parts.append(
            "\n\n⚠️ You indicated the following:\n"
            + "\n".join(f"• {_display_flag(flag)}" for flag in contextual_flags)
        )
    if high_risk_context:
        parts.append("\n\n🚨 These responses suggest a high-risk situation. Consider seeking immediate help or safety planning resources.")

    if matched_phrases:
        parts.append("\n\n🚨 Detected High-Risk Phrases:\n")
        for label, phrase in matched_phrases:
            # Swap double quotes for single ones so the phrase sits cleanly
            # inside the surrounding curly quotes.
            phrase_clean = phrase.replace('"', "'").strip()
            parts.append(f"• {label.replace('_', ' ').title()}: “{phrase_clean}”\n")

    if abuse_type:
        parts.append(f"\n\n🧠 Likely Abuse Type: {abuse_type}")
        parts.append(f"\n🧠 Abuser Profile: {abuser_profile}")
        parts.append(f"\n📘 Safety Tip: {advice}")

    return "".join(parts)
# --- Gradio UI --------------------------------------------------------------
# Two inputs (the message and optional context checkboxes) feed
# analyze_messages; its report is shown in a single text box.
_message_input = gr.Textbox(lines=10, placeholder="Enter message here...")
_context_input = gr.CheckboxGroup(
    label="Do any of these apply to your situation?",
    choices=[
        "They've threatened harm",
        "They isolate me",
        "I’ve changed my behavior out of fear",
        "They monitor/follow me",
        "I feel unsafe when alone with them",
    ],
)

iface = gr.Interface(
    fn=analyze_messages,
    inputs=[_message_input, _context_input],
    outputs=[gr.Textbox(label="Analysis Result")],
    title="Abuse Pattern Detector",
    live=True,
    # Manual flagging gives users a button to report mispredictions.
    allow_flagging="manual",
)

if __name__ == "__main__":
    iface.queue().launch()