import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs
from abuse_type_mapping import determine_abuse_type
# Load custom fine-tuned sentiment model
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")
# Load abuse pattern model
model_name = "SamanthaStorm/autotrain-c1un8-p8vzo"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)
LABELS = [
    "gaslighting", "mockery", "dismissiveness", "control", "guilt_tripping", "apology_baiting", "blame_shifting", "projection",
    "contradictory_statements", "manipulation", "deflection", "insults", "obscure_formal", "recovery_phase", "non_abusive",
    "suicidal_threat", "physical_threat", "extreme_control"
]
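# Per-label decision thresholds applied to the sigmoid scores; "non_abusive" is set above 1.0 so it can never fire on its own.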
THRESHOLDS = {
    "gaslighting": 0.25, "mockery": 0.15, "dismissiveness": 0.45, "control": 0.43, "guilt_tripping": 0.15,
    "apology_baiting": 0.2, "blame_shifting": 0.23, "projection": 0.50, "contradictory_statements": 0.25,
    "manipulation": 0.25, "deflection": 0.30, "insults": 0.34, "obscure_formal": 0.25, "recovery_phase": 0.25,
    "non_abusive": 2.0, "suicidal_threat": 0.45, "physical_threat": 0.02, "extreme_control": 0.30
}
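# The first 15 labels are abuse patterns; the last 3 are high-danger indicators.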
PATTERN_LABELS = LABELS[:15]
DANGER_LABELS = LABELS[15:18]
EXPLANATIONS = {
    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
    "insults": "Insults are derogatory remarks aimed at degrading someone.",
    "apology_baiting": "Apology-baiting manipulates victims into apologizing for the abuser's behavior.",
    "deflection": "Deflection avoids accountability by redirecting blame.",
    "control": "Control restricts autonomy through manipulation or coercion.",
    "extreme_control": "Extreme control dominates decisions and behaviors entirely.",
    "physical_threat": "Physical threats signal risk of bodily harm.",
    "suicidal_threat": "Suicidal threats manipulate others using self-harm threats.",
    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someone’s actions.",
    "manipulation": "Manipulation deceives to influence or control outcomes.",
    "non_abusive": "Non-abusive language is respectful and free of coercion.",
    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority."
}
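# Multipliers applied to label scores when computing the overall abuse level; labels not listed default to 1.0.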
PATTERN_WEIGHTS = {
    "physical_threat": 1.5,
    "suicidal_threat": 1.4,
    "extreme_control": 1.5,
    "gaslighting": 1.3,
    "control": 1.2,
    "dismissiveness": 0.8,
    "non_abusive": 0.0
}
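# Classify a message as "supportive" or "undermining" using the fine-tuned sentiment model.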
def custom_sentiment(text):
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    label_idx = torch.argmax(probs).item()
    label_map = {0: "supportive", 1: "undermining"}
    label = label_map[label_idx]
    score = probs[0][label_idx].item()
    return {"label": label, "score": score}
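# Average the weighted scores of labels that clear their thresholds (as a percentage); danger motif hits floor the score at 75.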
def calculate_abuse_level(scores, thresholds, motif_hits=None):
    weighted_scores = []
    for label, score in zip(LABELS, scores):
        if score > thresholds[label]:
            weight = PATTERN_WEIGHTS.get(label, 1.0)
            weighted_scores.append(score * weight)
    base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
    motif_hits = motif_hits or []
    if any(label in motif_hits for label in DANGER_LABELS):
        base_score = max(base_score, 75.0)
    return base_score
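# Map a numeric abuse level to a human-readable risk description.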
def interpret_abuse_level(score):
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    return "Very Low / Likely Safe"
def analyze_single_message(text, contextual_flags):
    motif_flags, matched_phrases = detect_motifs(text)
    risk_flags = list(set(contextual_flags + motif_flags)) if contextual_flags else motif_flags
    sentiment_result = custom_sentiment(text)
    sentiment_label = sentiment_result["label"]
    sentiment_score = sentiment_result["score"]
    # Undermining sentiment lowers every threshold by 20% so patterns are easier to flag
    thresholds = {k: v * 0.8 for k, v in THRESHOLDS.items()} if sentiment_label == "undermining" else THRESHOLDS.copy()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
    threshold_labels = [label for label, score in zip(PATTERN_LABELS, scores[:15]) if score > thresholds[label]]
    phrase_labels = [label for label, _ in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))
    abuse_level = calculate_abuse_level(scores, thresholds, motif_hits=[label for label, _ in matched_phrases])
    abuse_description = interpret_abuse_level(abuse_level)
    return {
        "text": text,
        "score": abuse_level,
        "summary": abuse_description,
        "sentiment": f"{sentiment_label} ({sentiment_score*100:.2f}%)",
        "top_labels": pattern_labels_used[:2],
        "matched_phrases": matched_phrases,
        "flags": risk_flags  # user-selected contextual flags combined with detected motif flags
    }
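# Analyze up to three messages and build per-message summaries plus a composite (mean) abuse score.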
def analyze_composite(msg1, msg2, msg3, flags):
    results = [analyze_single_message(t, flags) if t.strip() else None for t in [msg1, msg2, msg3]]
    scored = [r['score'] for r in results if r is not None]
    composite_score = round(np.mean(scored), 2) if scored else 0.0
    # Always return one string per message box so Gradio receives exactly four output values
    outputs = [
        f"Score: {r['score']}% – {r['summary']}\nSentiment: {r['sentiment']}\nFlags: {', '.join(r['flags']) if r['flags'] else 'None'}\nLabels: {', '.join(r['top_labels'])}" if r else "No message provided."
        for r in results
    ]
    return outputs + [f"Composite Abuse Score: {composite_score}%"]
iface = gr.Interface(
    fn=analyze_composite,
    inputs=[
        gr.Textbox(label="Message 1"),
        gr.Textbox(label="Message 2"),
        gr.Textbox(label="Message 3"),
        gr.CheckboxGroup(label="Contextual Flags", choices=[
            "They've threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
            "They monitor/follow me", "I feel unsafe when alone with them"
        ])
    ],
    outputs=[
        gr.Textbox(label="Message 1 Result"),
        gr.Textbox(label="Message 2 Result"),
        gr.Textbox(label="Message 3 Result"),
        gr.Textbox(label="Composite Score")
    ],
    title="Abuse Pattern Detector (Multi-Message)",
    flagging_mode="manual"
)
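# Example of calling the analysis directly (bypassing the UI), e.g. for a quick smoke test; the message text is illustrative:
#   analyze_composite("You never listen to me.", "", "", ["They isolate me"])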
if __name__ == "__main__":
    iface.queue().launch()