"""Gradio app that scores up to three messages for abuse patterns and DARVO.

Two Hugging Face models are loaded at import time: a sentiment classifier
("supportive" / "undermining") and a multi-label abuse-pattern classifier.
Their outputs are combined with phrase motifs, a contradiction heuristic and
user-selected contextual flags into a single composite score.
"""

import re
from collections import Counter

import gradio as gr
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer

from motif_tagging import detect_motifs

# Custom fine-tuned sentiment model
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")

# Load abuse pattern model
# NOTE(review): trust_remote_code=True executes code shipped with the model
# repository; confirm this is required and that the repository is trusted.
model_name = "SamanthaStorm/autotrain-jlpi4-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Label names, zipped position-by-position with the abuse model's scores.
LABELS = [
    "blame shifting", "contradictory statements", "control", "dismissiveness",
    "gaslighting", "guilt tripping", "insults", "obscure language",
    "projection", "recovery phase", "threat"
]

# Per-label score a prediction must exceed to count as present.
THRESHOLDS = {
    "blame shifting": 0.3,
    "contradictory statements": 0.32,
    "control": 0.40,
    "dismissiveness": 0.45,
    "gaslighting": 0.30,
    "guilt tripping": 0.20,
    "insults": 0.34,
    "obscure language": 0.25,
    "projection": 0.35,
    "recovery phase": 0.25,
    "threat": 0.25
}

PATTERN_LABELS = LABELS

EXPLANATIONS = {
    "blame shifting": "Blame-shifting is when one person redirects responsibility onto someone else to avoid accountability.",
    "contradictory statements": "Contradictory statements confuse the listener by flipping positions or denying previous claims.",
    "control": "Control restricts another person’s autonomy through coercion, manipulation, or threats.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings, needs, or opinions.",
    "gaslighting": "Gaslighting involves making someone question their own reality, memory, or perceptions.",
    "guilt tripping": "Guilt-tripping uses guilt to manipulate someone’s actions or decisions.",
    "insults": "Insults are derogatory or demeaning remarks meant to shame, belittle, or hurt someone.",
    "obscure language": "Obscure language manipulates through complexity, vagueness, or superiority to confuse the other person.",
    "projection": "Projection accuses someone else of the very behaviors or intentions the speaker is exhibiting.",
    "recovery phase": "Recovery phase statements attempt to soothe or reset tension without acknowledging harm or change.",
    "threat": "Threats use fear of harm (physical, emotional, or relational) to control or intimidate someone."
}

# Multipliers applied to a label's score in calculate_abuse_level. Keys must
# be spelled exactly like the entries of LABELS (spaces, not underscores) or
# the lookup silently falls back to 1.0.
PATTERN_WEIGHTS = {
    "gaslighting": 1.3,
    "mockery": 1.2,
    "control": 1.2,
    "dismissiveness": 0.8,
    "blame shifting": 0.8,
    "contradictory statements": 0.75,
}

# Points added to the composite score for each contextual flag the user ticks.
# Also the source of the checkbox choices, so the two cannot drift apart.
FLAG_WEIGHTS = {
    "They've threatened harm": 6,
    "They isolate me": 5,
    "I’ve changed my behavior out of fear": 4,
    "They monitor/follow me": 4,
    "I feel unsafe when alone with them": 6
}

# Weight used for a flag that is not listed in FLAG_WEIGHTS.
DEFAULT_FLAG_WEIGHT = 3

# --- DARVO Detection Tools ---
DARVO_PATTERNS = {
    "blame shifting", "projection", "mockery", "dismissiveness", "deflection", "guilt tripping"
}

DARVO_MOTIFS = [
    "i guess i’m the bad guy", "after everything i’ve done", "you always twist everything",
    "so now it’s all my fault", "i’m the villain", "i’m always wrong", "you never listen",
    "you’re attacking me", "i’m done trying", "i’m the only one who cares"
]

# Compiled once at import; each pattern pairs an opening phrase with a
# reversal that must follow within 15 characters.
_CONTRADICTION_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # Emotional flip-flops
        r"\b(i love you).{0,15}(i hate you|you ruin everything)",
        r"\b(i’m sorry).{0,15}(but you|if you hadn’t)",
        r"\b(i’m trying).{0,15}(you never|why do you)",
        # Control + helplessness
        r"\b(do what you want).{0,15}(you’ll regret it|i always give everything)",
        r"\b(i don’t care).{0,15}(you never think of me)",
        # Passive aggression or self-victimization switch
        r"\b(i guess i’m just).{0,15}(the bad guy|worthless|never enough)",
    )
]


def _normalize_apostrophes(text):
    """Replace straight apostrophes with the typographic one used in the patterns."""
    return text.replace("'", "’")


def detect_contradiction(message):
    """Return True if *message* matches any of the contradiction patterns.

    Matching is case-insensitive. Straight apostrophes are normalised first
    because the patterns are written with the typographic apostrophe only.
    """
    normalized = _normalize_apostrophes(message)
    return any(pattern.search(normalized) for pattern in _CONTRADICTION_PATTERNS)


def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
    """Combine four signals into a DARVO score between 0.0 and 1.0.

    Args:
        patterns: Pattern label strings; those found in DARVO_PATTERNS count.
        sentiment_before: Baseline sentiment score.
        sentiment_after: Later sentiment score; only an increase contributes.
        motifs_found: Matched phrase strings; those in DARVO_MOTIFS count.
        contradiction_flag: Whether a contradiction was detected.

    Returns:
        Weighted sum (0.3 patterns, 0.3 sentiment shift, 0.2 motifs,
        0.2 contradiction), capped at 1.0 and rounded to 3 decimals.
    """
    pattern_hits = sum(1 for p in patterns if p.lower() in DARVO_PATTERNS)
    pattern_score = pattern_hits / len(DARVO_PATTERNS)

    sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)

    motif_hits = sum(
        1 for m in motifs_found
        if _normalize_apostrophes(m.lower()) in DARVO_MOTIFS
    )
    motif_score = motif_hits / len(DARVO_MOTIFS)

    contradiction_score = 1.0 if contradiction_flag else 0.0

    darvo_score = (
        0.3 * pattern_score
        + 0.3 * sentiment_shift_score
        + 0.2 * motif_score
        + 0.2 * contradiction_score
    )
    return round(min(darvo_score, 1.0), 3)


def custom_sentiment(text):
    """Classify *text* with the sentiment model.

    Returns:
        A dict with "label" ("supportive" or "undermining") and "score",
        the softmax probability of that label.
    """
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    label_idx = torch.argmax(probs, dim=1).item()

    label_map = {0: "supportive", 1: "undermining"}
    return {"label": label_map[label_idx], "score": probs[0][label_idx].item()}


def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1.0):
    """Return a 0-100 abuse level from per-label scores.

    Only scores above their label's threshold contribute. Each is multiplied
    by its PATTERN_WEIGHTS entry (default 1.0), and the mean is scaled to a
    percentage, multiplied by *flag_multiplier* and capped at 100.

    Args:
        scores: Per-label scores, in the same order as LABELS.
        thresholds: Mapping of label to the score it must exceed.
        motif_hits: Accepted for interface compatibility; not used here.
        flag_multiplier: Factor applied to the rounded base score.
    """
    weighted_scores = [
        score * PATTERN_WEIGHTS.get(label, 1.0)
        for label, score in zip(LABELS, scores)
        if score > thresholds[label]
    ]
    base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
    base_score *= flag_multiplier
    return min(base_score, 100.0)


def interpret_abuse_level(score):
    """Map a 0-100 abuse score to a human-readable risk band."""
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    return "Very Low / Likely Safe"


def analyze_single_message(text, thresholds, motif_flags):
    """Analyse one message.

    Args:
        text: The message to analyse.
        thresholds: Mapping of label to base threshold; lowered by 20% when
            the sentiment model labels the message "undermining".
        motif_flags: Accepted for interface compatibility; not used here.

    Returns:
        A tuple ``(abuse_level, pattern_labels_used, top_patterns, darvo_score)``
        where ``top_patterns`` holds the two highest-scoring (label, score) pairs.
    """
    motif_hits, matched_phrases = detect_motifs(text)

    sentiment = custom_sentiment(text)
    is_undermining = sentiment["label"] == "undermining"
    sentiment_score = sentiment["score"] if is_undermining else 0.0

    if is_undermining:
        adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()}
    else:
        adjusted_thresholds = thresholds.copy()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()

    threshold_labels = [
        label for label, score in zip(LABELS, scores)
        if score > adjusted_thresholds[label]
    ]
    phrase_labels = [label for label, _ in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))

    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
    top_patterns = sorted(zip(LABELS, scores), key=lambda pair: pair[1], reverse=True)[:2]

    motif_phrases = [phrase for _, phrase in matched_phrases]
    # Feed the real contradiction result into the DARVO score instead of a
    # hard-coded False, so the detector actually has an effect.
    darvo_score = calculate_darvo_score(
        pattern_labels_used,
        0.0,
        sentiment_score,
        motif_phrases,
        contradiction_flag=detect_contradiction(text),
    )
    return abuse_level, pattern_labels_used, top_patterns, darvo_score


def analyze_composite(msg1, msg2, msg3, flags):
    """Analyse up to three messages and return a textual summary.

    Args:
        msg1, msg2, msg3: Message texts; empty or blank ones are ignored.
        flags: Contextual flag strings selected by the user (may be empty).

    Returns:
        A summary string naming the most frequent top pattern, the composite
        score and, when the average DARVO score exceeds 0.25, a DARVO note.
        If no message was entered, a prompt asking for one.
    """
    thresholds = THRESHOLDS
    flags = flags or []
    active_messages = [m for m in (msg1, msg2, msg3) if m and m.strip()]
    if not active_messages:
        return "Please enter at least one message."

    results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
    abuse_scores = [r[0] for r in results]
    darvo_scores = [r[3] for r in results]
    average_darvo = round(sum(darvo_scores) / len(darvo_scores), 3)
    print(f"Average DARVO Score: {average_darvo}")
    base_score = sum(abuse_scores) / len(abuse_scores)

    # Count how often each label appears among the per-message top patterns.
    # Counter breaks ties by first occurrence, so the result is deterministic.
    label_counts = Counter(label for r in results for label, _ in r[2])
    top_label = label_counts.most_common(1)[0]
    top_explanation = EXPLANATIONS.get(top_label[0], "")

    # Adjust flag-based weight relative to number of messages
    flag_boost = sum(FLAG_WEIGHTS.get(f, DEFAULT_FLAG_WEIGHT) for f in flags) / len(active_messages)
    composite_score = min(base_score + flag_boost, 100)

    # Apply message count dampening AFTER base and flag boost
    if len(active_messages) == 1:
        composite_score *= 0.85  # 15% reduction for 1 message
    elif len(active_messages) == 2:
        composite_score *= 0.93  # 7% reduction for 2 messages
    composite_score = round(min(composite_score, 100), 2)  # re-cap just in case

    result = f"These messages show a pattern of **{top_label[0]}** and are estimated to be {composite_score}% likely abusive."

    # Include pattern explanations
    if top_explanation:
        result += f"\n• {top_explanation}"

    # Show DARVO score
    if average_darvo > 0.25:
        darvo_descriptor = "moderate" if average_darvo < 0.65 else "high"
        result += f"\n\nDARVO Score: {average_darvo} → This indicates a **{darvo_descriptor} likelihood** of narrative reversal (DARVO), where the speaker may be denying, attacking, or reversing blame."

    return result


textbox_inputs = [
    gr.Textbox(label="Message 1"),
    gr.Textbox(label="Message 2"),
    gr.Textbox(label="Message 3")
]

checkboxes = gr.CheckboxGroup(label="Contextual Flags", choices=list(FLAG_WEIGHTS))

iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + [checkboxes],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector (Multi-Message)",
    allow_flagging="manual"
)

if __name__ == "__main__":
    iface.launch()