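"""Tether – Gradio app that screens short messages for abusive-language patterns.

Combines a fine-tuned multi-label RoBERTa abuse classifier with a DistilBERT SST-2
sentiment model, a DARVO (Deny, Attack, Reverse Victim and Offender) heuristic, and
a weighted escalation quiz to produce an overall risk summary.
"""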
import gradio as gr
import torch
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs
import re
# --- SST Sentiment Model ---
sst_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
# --- Abuse Model ---
model_name = "SamanthaStorm/autotrain-jlpi4-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name)
tokenizer = RobertaTokenizer.from_pretrained(model_name)
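
# Labels produced by the multi-label abuse classifier; order must match the model's output logits.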
LABELS = [
    "blame shifting", "contradictory statements", "control", "dismissiveness",
    "gaslighting", "guilt tripping", "insults", "obscure language",
    "projection", "recovery phase", "threat"
]
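
# Per-label probability thresholds a sigmoid score must exceed for a pattern to count as detected.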
THRESHOLDS = {
    "blame shifting": 0.3, "contradictory statements": 0.32, "control": 0.48, "dismissiveness": 0.45,
    "gaslighting": 0.30, "guilt tripping": 0.20, "insults": 0.34, "obscure language": 0.25,
    "projection": 0.35, "recovery phase": 0.25, "threat": 0.25
}
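
# Relative pattern weights (defined for tuning but not referenced elsewhere in this file).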
PATTERN_WEIGHTS = {
    "gaslighting": 1.3, "control": 1.2, "dismissiveness": 0.8,
    "blame shifting": 0.8, "contradictory statements": 0.75
}
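
# Plain-language explanations of each pattern (not referenced elsewhere in this file).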
EXPLANATIONS = {
    "blame shifting": "Blame-shifting redirects responsibility to avoid accountability.",
    "contradictory statements": "Flipping positions or denying previous claims.",
    "control": "Attempts to restrict another person’s autonomy.",
    "dismissiveness": "Disregarding or belittling someone’s feelings or needs.",
    "gaslighting": "Manipulating someone into questioning their reality.",
    "guilt tripping": "Using guilt to control or pressure.",
    "insults": "Derogatory or demeaning language.",
    "obscure language": "Vague, superior, or confusing language used manipulatively.",
    "projection": "Accusing someone else of your own behaviors.",
    "recovery phase": "Resetting tension without real change.",
    "threat": "Using fear or harm to control or intimidate."
}
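
# (title, summary, advice) text shown for each overall risk level.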
RISK_SNIPPETS = {
    "low": (
        "🟢 Risk Level: Low",
        "The language patterns here do not strongly indicate abuse.",
        "Check in with yourself and monitor for repeated patterns."
    ),
    "moderate": (
        "⚠️ Risk Level: Moderate to High",
        "Language includes control, guilt, or reversal tactics.",
        "These patterns reduce self-trust. Document or talk with someone safe."
    ),
    "high": (
        "🛑 Risk Level: High",
        "Strong indicators of coercive control or threat present.",
        "Consider building a safety plan or contacting support."
    )
}
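
# Patterns that feed the DARVO (Deny, Attack, Reverse Victim and Offender) score.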
DARVO_PATTERNS = {
    "blame shifting", "projection", "dismissiveness", "guilt tripping", "contradictory statements"
}
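
# Stock phrases that often signal DARVO-style narrative reversal.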
DARVO_MOTIFS = [
    "i guess i’m the bad guy", "after everything i’ve done", "you always twist everything",
    "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
    "you’re attacking me", "i’m done trying", "i’m the only one who cares"
]
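
# Escalation-quiz questions and their point weights; the summed weights set the escalation level.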
ESCALATION_QUESTIONS = [
    ("Partner has access to firearms or weapons", 4),
    ("Partner threatened to kill you", 3),
    ("Partner threatened you with a weapon", 3),
    ("Partner has ever choked you, even if you considered it consensual at the time", 4),
    ("Partner injured or threatened your pet(s)", 3),
    ("Partner has broken your things, punched or kicked walls, or thrown things", 2),
    ("Partner forced or coerced you into unwanted sexual acts", 3),
    ("Partner threatened to take away your children", 2),
    ("Violence has increased in frequency or severity", 3),
    ("Partner monitors your calls/GPS/social media", 2)
]
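
# Flag messages that pair an appeasing opener with an undermining follow-up within a few words.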
def detect_contradiction(message):
    patterns = [
        (r"\b(i love you).{0,15}(i hate you|you ruin everything)", re.IGNORECASE),
        (r"\b(i’m sorry).{0,15}(but you|if you hadn’t)", re.IGNORECASE),
        (r"\b(i’m trying).{0,15}(you never|why do you)", re.IGNORECASE),
        (r"\b(do what you want).{0,15}(you’ll regret it|i always give everything)", re.IGNORECASE),
        (r"\b(i don’t care).{0,15}(you never think of me)", re.IGNORECASE),
        (r"\b(i guess i’m just).{0,15}(the bad guy|worthless|never enough)", re.IGNORECASE)
    ]
    return any(re.search(p, message, flags) for p, flags in patterns)
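
# Blend pattern hits, sentiment shift, motif matches, and contradictions into a 0–1 DARVO score.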
def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    pattern_hits = len([p for p in patterns if p in DARVO_PATTERNS])
    pattern_score = pattern_hits / len(DARVO_PATTERNS)
    sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)
    motif_hits = len([m for m in motifs_found if m.lower() in DARVO_MOTIFS])
    motif_score = motif_hits / len(DARVO_MOTIFS)
    contradiction_score = 1.0 if contradiction_flag else 0.0
    return round(min(0.3 * pattern_score + 0.3 * sentiment_shift_score + 0.25 * motif_score + 0.15 * contradiction_score, 1.0), 3)
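
# Map the abuse score and escalation-quiz score to a low/moderate/high risk snippet.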
def generate_risk_snippet(abuse_score, top_label, escalation_score):
    if abuse_score >= 85 or escalation_score >= 16:
        risk_level = "high"
    elif abuse_score >= 60 or escalation_score >= 8:
        risk_level = "moderate"
    else:
        risk_level = "low"
    title, summary, advice = RISK_SNIPPETS[risk_level]
    return f"\n\n{title}\n{summary} (Pattern: **{str(top_label)}**)\n💡 {advice}"
def analyze_single_message(text, thresholds, motif_flags):
    motif_hits, matched_phrases = detect_motifs(text)
    result = sst_pipeline(text)[0]
    sentiment = "supportive" if result['label'] == "POSITIVE" else "undermining"
    sentiment_score = result['score'] if sentiment == "undermining" else 0.0
    adjusted_thresholds = {
        k: v + 0.05 if sentiment == "supportive" else v
        for k, v in thresholds.items()
    }
    contradiction_flag = detect_contradiction(text)
    motifs = [phrase for _, phrase in matched_phrases]
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
    threshold_labels = []
    for label, score in zip(LABELS, scores):
        if label in {"control", "dismissiveness", "blame shifting"}:
            if sentiment == "undermining" and result["score"] > 0.85 and score > adjusted_thresholds[label]:
                threshold_labels.append(label)
        elif score > adjusted_thresholds[label]:
            threshold_labels.append(label)
    top_patterns = sorted(
        [(label, score) for label, score in zip(LABELS, scores)],
        key=lambda x: x[1],
        reverse=True
    )[:2]
    pattern_labels = threshold_labels + [label for label, _ in matched_phrases]
    darvo_score = calculate_darvo_score(pattern_labels, 0.0, sentiment_score, motifs, contradiction_flag)
    print("\n--- Debug Info ---")
    print(f"Text: {text}")
    print(f"Sentiment: {sentiment} (raw: {result['label']}, score: {result['score']:.3f})")
    print("Abuse Pattern Scores:")
    for label, score in zip(LABELS, scores):
        passed = "✅" if label in threshold_labels else "❌"
        print(f" {label:25}{score:.3f} {passed}")
    print(f"Motifs: {motifs}")
    print(f"Contradiction: {contradiction_flag}")
    print("------------------\n")
    return (
        np.mean([score for _, score in top_patterns]) * 100,
        threshold_labels,
        top_patterns,
        darvo_score,
        {"label": sentiment, "raw_label": result['label'], "score": result['score']}
    )
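
# Aggregate up to three messages plus the escalation-quiz answers into the final report text.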
def analyze_composite(msg1, msg2, msg3, *answers_and_none):
    responses = answers_and_none[:len(ESCALATION_QUESTIONS)]
    none_selected = answers_and_none[-1]
    escalation_score = 0 if none_selected else sum(w for (_, w), a in zip(ESCALATION_QUESTIONS, responses) if a)
    escalation_level = "High" if escalation_score >= 16 else "Moderate" if escalation_score >= 8 else "Low"
    messages = [msg1, msg2, msg3]
    active = [m for m in messages if m.strip()]
    if not active:
        return "Please enter at least one message."
    results = [analyze_single_message(m, THRESHOLDS.copy(), []) for m in active]
    abuse_scores = [r[0] for r in results]
    darvo_scores = [r[3] for r in results]
    top_pattern = max(
        [(label, score) for r in results for label, score in r[2]],
        key=lambda x: x[1]
    )
    top_label = f"{top_pattern[0]} – {int(round(top_pattern[1] * 100))}%"
    composite_abuse = int(round(sum(abuse_scores) / len(abuse_scores)))
    avg_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    out = f"Abuse Intensity: {composite_abuse}%\n"
    out += f"Escalation Potential: {escalation_level} ({escalation_score}/{sum(w for _, w in ESCALATION_QUESTIONS)})"
    out += generate_risk_snippet(composite_abuse, top_label, escalation_score)
    if avg_darvo > 0.25:
        level = "moderate" if avg_darvo < 0.65 else "high"
        out += f"\n\nDARVO Score: {avg_darvo} → This indicates a **{level} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."
    return out
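
# --- Gradio UI ---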
textbox_inputs = [gr.Textbox(label=f"Message {i+1}") for i in range(3)]
quiz_boxes = [gr.Checkbox(label=q) for q, _ in ESCALATION_QUESTIONS]
none_box = gr.Checkbox(label="None of the above")
iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + quiz_boxes + [none_box],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector + Escalation Quiz",
    allow_flagging="manual"
)
if __name__ == "__main__":
    iface.launch()