# Tether / app.py
# SamanthaStorm's picture
# Update app.py
# 909b775 verified
# raw
# history blame
# 8.7 kB
import gradio as gr
import torch
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs
import re
# --- SST Sentiment Model ---
# Sentiment classifier, instantiated once at import time.
# analyze_single_message() maps its "POSITIVE" label to "supportive" and any
# other label to "undermining".
sst_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
# --- Abuse Model ---
# Sequence classifier plus its tokenizer, both loaded from the same checkpoint.
# analyze_single_message() applies a sigmoid to the logits and pairs the
# resulting scores positionally with LABELS.
model_name = "SamanthaStorm/autotrain-jlpi4-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name)
tokenizer = RobertaTokenizer.from_pretrained(model_name)
# Label names, paired positionally (via zip) with the abuse model's scores.
LABELS = [
    "blame shifting",
    "contradictory statements",
    "control",
    "dismissiveness",
    "gaslighting",
    "guilt tripping",
    "insults",
    "obscure language",
    "projection",
    "recovery phase",
    "threat",
]

# Per-label score cut-offs: a label is reported only when its score is
# strictly greater than the (possibly sentiment-adjusted) value here.
THRESHOLDS = {
    "blame shifting": 0.3,
    "contradictory statements": 0.32,
    "control": 0.48,
    "dismissiveness": 0.45,
    "gaslighting": 0.30,
    "guilt tripping": 0.20,
    "insults": 0.34,
    "obscure language": 0.25,
    "projection": 0.35,
    "recovery phase": 0.25,
    "threat": 0.25,
}

# Relative weights per pattern.
# NOTE(review): not referenced by any other code visible in this file.
PATTERN_WEIGHTS = {
    "gaslighting": 1.3,
    "control": 1.2,
    "dismissiveness": 0.8,
    "blame shifting": 0.8,
    "contradictory statements": 0.75,
}
# One-sentence, human-readable description for every label in LABELS.
# NOTE(review): not referenced by any other code visible in this file.
EXPLANATIONS = {
    "blame shifting":
        "Blame-shifting redirects responsibility to avoid accountability.",
    "contradictory statements":
        "Flipping positions or denying previous claims.",
    "control":
        "Attempts to restrict another person’s autonomy.",
    "dismissiveness":
        "Disregarding or belittling someone’s feelings or needs.",
    "gaslighting":
        "Manipulating someone into questioning their reality.",
    "guilt tripping":
        "Using guilt to control or pressure.",
    "insults":
        "Derogatory or demeaning language.",
    "obscure language":
        "Vague, superior, or confusing language used manipulatively.",
    "projection":
        "Accusing someone else of your own behaviors.",
    "recovery phase":
        "Resetting tension without real change.",
    "threat":
        "Using fear or harm to control or intimidate.",
}
# Report text per risk band, stored as (title, summary, advice) triples and
# unpacked in that order by generate_risk_snippet().
RISK_SNIPPETS = {
    "low": (
        "🟢 Risk Level: Low",
        "The language patterns here do not strongly indicate abuse.",
        "Check in with yourself and monitor for repeated patterns.",
    ),
    "moderate": (
        "⚠️ Risk Level: Moderate to High",
        "Language includes control, guilt, or reversal tactics.",
        "These patterns reduce self-trust. Document or talk with someone safe.",
    ),
    "high": (
        "🛑 Risk Level: High",
        "Strong indicators of coercive control or threat present.",
        "Consider building a safety plan or contacting support.",
    ),
}
# Pattern labels that count towards the pattern component of the DARVO score.
DARVO_PATTERNS = {
    "blame shifting",
    "projection",
    "dismissiveness",
    "guilt tripping",
    "contradictory statements",
}

# Lower-case phrases that count towards the motif component of the DARVO
# score; calculate_darvo_score() lower-cases candidates before the lookup.
DARVO_MOTIFS = [
    "i guess i’m the bad guy",
    "after everything i’ve done",
    "you always twist everything",
    "so now it’s all my fault",
    "i’m the villain",
    "i’m always wrong",
    "you never listen",
    "you’re attacking me",
    "i’m done trying",
    "i’m the only one who cares",
]
# Escalation quiz as (checkbox label, weight) pairs. The labels are shown in
# the UI as-is and the weights of the ticked items are summed by
# analyze_composite().
ESCALATION_QUESTIONS = [
    (
        "Partner has access to firearms or weapons",
        4,
    ),
    (
        "Partner threatened to kill you",
        3,
    ),
    (
        "Partner threatened you with a weapon",
        3,
    ),
    (
        "Partner has ever choked you, even if you considered it consensual at the time",
        4,
    ),
    (
        "Partner injured or threatened your pet(s)",
        3,
    ),
    (
        "Partner has broken your things, punched or kicked walls, or thrown things ",
        2,
    ),
    (
        "Partner forced or coerced you into unwanted sexual acts",
        3,
    ),
    (
        "Partner threatened to take away your children",
        2,
    ),
    (
        "Violence has increased in frequency or severity",
        3,
    ),
    (
        "Partner monitors your calls/GPS/social media",
        2,
    ),
]
# Contradiction templates: an opening phrase followed, within 15 characters,
# by a phrase that reverses it. Apostrophes are written as [’'] so that both
# the typographic form (phone keyboards, word processors) and the plain ASCII
# form (most desktop keyboards) match; previously only ’ was recognised.
# Compiled once at import time rather than on every call.
_CONTRADICTION_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\b(i love you).{0,15}(i hate you|you ruin everything)",
        r"\b(i[’']m sorry).{0,15}(but you|if you hadn[’']t)",
        r"\b(i[’']m trying).{0,15}(you never|why do you)",
        r"\b(do what you want).{0,15}(you[’']ll regret it|i always give everything)",
        r"\b(i don[’']t care).{0,15}(you never think of me)",
        r"\b(i guess i[’']m just).{0,15}(the bad guy|worthless|never enough)",
    )
)


def detect_contradiction(message):
    """Return True when *message* contains a known self-contradicting phrase pair.

    Matching is case-insensitive and accepts either apostrophe style.

    Args:
        message: The text to scan. ``None`` or an empty string yields False.

    Returns:
        bool: True if any contradiction template matches, otherwise False.
    """
    if not message:
        return False
    return any(pattern.search(message) for pattern in _CONTRADICTION_PATTERNS)
def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    """Blend four signals into a single DARVO score.

    Args:
        patterns: Detected pattern labels; those in DARVO_PATTERNS are counted.
        sentiment_before: Baseline sentiment score.
        sentiment_after: Later sentiment score; only an increase over the
            baseline contributes.
        motifs_found: Matched phrases; those whose lower-cased form is in
            DARVO_MOTIFS are counted.
        contradiction_flag: Whether a contradiction was detected.

    Returns:
        The weighted sum (0.3 pattern, 0.3 sentiment shift, 0.25 motif,
        0.15 contradiction), capped at 1.0 and rounded to three decimals.
    """
    # Fraction of the known DARVO patterns that were hit.
    darvo_pattern_count = sum(1 for label in patterns if label in DARVO_PATTERNS)
    pattern_score = darvo_pattern_count / len(DARVO_PATTERNS)

    # A drop in sentiment is clamped to zero rather than counted negatively.
    sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)

    # Fraction of the known DARVO motifs that were hit.
    darvo_motif_count = sum(1 for phrase in motifs_found if phrase.lower() in DARVO_MOTIFS)
    motif_score = darvo_motif_count / len(DARVO_MOTIFS)

    if contradiction_flag:
        contradiction_score = 1.0
    else:
        contradiction_score = 0.0

    weighted_total = (
        0.3 * pattern_score
        + 0.3 * sentiment_shift_score
        + 0.25 * motif_score
        + 0.15 * contradiction_score
    )
    return round(min(weighted_total, 1.0), 3)
def generate_risk_snippet(abuse_score, top_label):
    """Build the risk paragraph appended to the analysis report.

    Args:
        abuse_score: Abuse intensity; 85 and above is "high", 60 and above is
            "moderate", anything else is "low".
        top_label: The pattern to name in the summary line (rendered with
            ``str()``).

    Returns:
        A string starting with two newlines, followed by the band's title,
        its summary with the pattern in bold markers, and its advice line.
    """
    # Bands are tried from the highest floor down; the first floor reached wins.
    risk_level = "low"
    for floor, band in ((85, "high"), (60, "moderate")):
        if abuse_score >= floor:
            risk_level = band
            break

    title, summary, advice = RISK_SNIPPETS[risk_level]
    return f"\n\n{title}\n{summary} (Pattern: **{str(top_label)}**)\n💡 {advice}"
def analyze_single_message(text, thresholds, motif_flags):
    """Score one message for abuse patterns, sentiment and DARVO.

    Args:
        text: The message to analyse.
        thresholds: Mapping of label -> score cut-off. It is not mutated; an
            adjusted copy is built internally.
        motif_flags: Accepted for interface compatibility; not read here.

    Returns:
        A 5-tuple of:
            * mean of the two highest label scores, times 100,
            * labels whose score is strictly above the adjusted cut-off,
            * the two highest ``(label, score)`` pairs, best first,
            * the DARVO score from calculate_darvo_score(),
            * a dict with the mapped sentiment ``label``, the classifier's
              ``raw_label`` and its ``score``.
    """
    # detect_motifs() is unpacked as a pair; only the second element, an
    # iterable of (label, phrase) pairs, is used below.
    _, phrase_matches = detect_motifs(text)

    # SST sentiment: "POSITIVE" is read as supportive, anything else as undermining.
    sst_result = sst_pipeline(text)[0]
    is_supportive = sst_result['label'] == "POSITIVE"
    if is_supportive:
        sentiment = "supportive"
        sentiment_score = 0.0
        # Supportive-sounding text must clear a slightly higher bar per label.
        adjusted_thresholds = {
            label: cutoff + 0.05 for label, cutoff in thresholds.items()
        }
    else:
        sentiment = "undermining"
        sentiment_score = sst_result['score']
        adjusted_thresholds = dict(thresholds)

    contradiction_flag = detect_contradiction(text)
    motifs = [phrase for _, phrase in phrase_matches]

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        model_output = model(**encoded)
        # Independent per-label scores: sigmoid over the logits of the single input.
        scores = torch.sigmoid(model_output.logits.squeeze(0)).numpy()

    labelled_scores = list(zip(LABELS, scores))

    threshold_labels = []
    for label, score in labelled_scores:
        if score > adjusted_thresholds[label]:
            threshold_labels.append(label)

    # The two best-scoring labels, whether or not they cleared their cut-off.
    ranked = sorted(labelled_scores, key=lambda pair: pair[1], reverse=True)
    top_patterns = ranked[:2]

    pattern_labels = threshold_labels + [label for label, _ in phrase_matches]
    darvo_score = calculate_darvo_score(
        pattern_labels, 0.0, sentiment_score, motifs, contradiction_flag
    )

    abuse_score = np.mean([score for _, score in top_patterns]) * 100
    sentiment_info = {
        "label": sentiment,
        "raw_label": sst_result['label'],
        "score": sst_result['score'],
    }
    return (
        abuse_score,
        threshold_labels,
        top_patterns,
        darvo_score,
        sentiment_info,
    )
def analyze_composite(msg1, msg2, msg3, *answers_and_none):
    """Analyse up to three messages plus the escalation quiz and build the report.

    Args:
        msg1, msg2, msg3: Message texts. Blank (or ``None``) entries are
            ignored.
        *answers_and_none: One truthy/falsy answer per entry of
            ESCALATION_QUESTIONS, in order, followed by the
            "None of the above" flag as the last value.

    Returns:
        The formatted multi-line report, or a short prompt asking for input
        when every message is blank.
    """
    responses = answers_and_none[:len(ESCALATION_QUESTIONS)]
    # "None of the above" is always the final input and overrides every other
    # answer. An empty answer tuple is treated as "not selected".
    none_selected = bool(answers_and_none) and answers_and_none[-1]
    if none_selected:
        escalation_score = 0
    else:
        escalation_score = sum(
            weight
            for (_, weight), answered in zip(ESCALATION_QUESTIONS, responses)
            if answered
        )
    if escalation_score >= 16:
        escalation_level = "High"
    elif escalation_score >= 8:
        escalation_level = "Moderate"
    else:
        escalation_level = "Low"

    # Guard against None as well as whitespace-only input.
    active = [m for m in (msg1, msg2, msg3) if m and m.strip()]
    if not active:
        return "Please enter at least one message."

    results = [analyze_single_message(m, THRESHOLDS.copy(), []) for m in active]
    abuse_scores = [r[0] for r in results]
    darvo_scores = [r[3] for r in results]

    # r[2] holds (label, score) pairs. The previous code took max() over a set
    # of those pairs with a constant key, which returned an arbitrary pair and
    # printed the whole tuple as the "Pattern". Pick the label with the highest
    # score across all messages instead.
    scored_labels = [pair for r in results for pair in r[2]]
    top_label = max(scored_labels, key=lambda pair: pair[1])[0]

    composite_abuse = int(round(sum(abuse_scores) / len(abuse_scores)))
    avg_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    max_escalation = sum(weight for _, weight in ESCALATION_QUESTIONS)

    out = f"Abuse Intensity: {composite_abuse}%\n"
    out += f"Escalation Potential: {escalation_level} ({escalation_score}/{max_escalation})"
    out += generate_risk_snippet(composite_abuse, top_label)
    # The DARVO paragraph is only shown once the average passes 0.25.
    if avg_darvo > 0.25:
        level = "moderate" if avg_darvo < 0.65 else "high"
        out += f"\n\nDARVO Score: {avg_darvo} → This indicates a **{level} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
    return out
# --- Gradio UI ---
# Input order must match analyze_composite(msg1, msg2, msg3, *answers_and_none):
# three message boxes, one checkbox per escalation question, then the
# "None of the above" checkbox last.
textbox_inputs = [
    gr.Textbox(label=f"Message {number}") for number in range(1, 4)
]
quiz_boxes = [
    gr.Checkbox(label=question) for question, _ in ESCALATION_QUESTIONS
]
none_box = gr.Checkbox(label="None of the above")

iface = gr.Interface(
    fn=analyze_composite,
    inputs=[*textbox_inputs, *quiz_boxes, none_box],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector + Escalation Quiz",
    allow_flagging="manual",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()