import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs

# Custom fine-tuned sentiment model
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")

# Load abuse pattern model
model_name = "SamanthaStorm/autotrain-jlpi4-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)

LABELS = [
    "blame shifting", "contradictory statements", "control", "dismissiveness",
    "gaslighting", "guilt tripping", "insults", "obscure language",
    "projection", "recovery phase", "threat"
]

# Per-label decision thresholds applied to the sigmoid scores
THRESHOLDS = {
    "blame shifting": 0.3,
    "contradictory statements": 0.32,
    "control": 0.40,
    "dismissiveness": 0.45,
    "gaslighting": 0.30,
    "guilt tripping": 0.20,
    "insults": 0.34,
    "obscure language": 0.25,
    "projection": 0.35,
    "recovery phase": 0.25,
    "threat": 0.25
}

PATTERN_LABELS = LABELS

EXPLANATIONS = {
    "blame shifting": "Blame-shifting is when one person redirects responsibility onto someone else to avoid accountability.",
    "contradictory statements": "Contradictory statements confuse the listener by flipping positions or denying previous claims.",
    "control": "Control restricts another person’s autonomy through coercion, manipulation, or threats.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings, needs, or opinions.",
    "gaslighting": "Gaslighting involves making someone question their own reality, memory, or perceptions.",
    "guilt tripping": "Guilt-tripping uses guilt to manipulate someone’s actions or decisions.",
    "insults": "Insults are derogatory or demeaning remarks meant to shame, belittle, or hurt someone.",
    "obscure language": "Obscure language manipulates through complexity, vagueness, or superiority to confuse the other person.",
    "projection": "Projection accuses someone else of the very behaviors or intentions the speaker is exhibiting.",
    "recovery phase": "Recovery phase statements attempt to soothe or reset tension without acknowledging harm or change.",
    "threat": "Threats use fear of harm (physical, emotional, or relational) to control or intimidate someone."
}
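
# Worked example of the scoring arithmetic (illustrative scores, not real model
# output): with the weights defined below, if only "control" (0.55) and
# "gaslighting" (0.40) clear their thresholds, calculate_abuse_level() averages
# the weighted values 0.55 * 1.2 = 0.66 and 0.40 * 1.3 = 0.52 and reports
# round(0.59 * 100, 2) = 59.0 as the base abuse level.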
# Weights that scale each label's raw score before averaging (default 1.0).
# Keys must match the entries in LABELS (spaces, not underscores).
PATTERN_WEIGHTS = {
    "gaslighting": 1.3,
    "mockery": 1.2,  # not currently in LABELS, so it has no effect
    "control": 1.2,
    "dismissiveness": 0.8,
    "blame shifting": 0.8,
    "contradictory statements": 0.75,
}


def custom_sentiment(text):
    """Classify a message as supportive or undermining with the fine-tuned sentiment model."""
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    label_idx = torch.argmax(probs).item()
    label_map = {0: "supportive", 1: "undermining"}
    return {"label": label_map[label_idx], "score": probs[0][label_idx].item()}


def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1.0):
    """Average the weighted scores of labels that clear their thresholds, scaled to 0-100."""
    weighted_scores = [
        score * PATTERN_WEIGHTS.get(label, 1.0)
        for label, score in zip(LABELS, scores)
        if score > thresholds[label]
    ]
    base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
    base_score *= flag_multiplier
    return min(base_score, 100.0)


def interpret_abuse_level(score):
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    return "Very Low / Likely Safe"


def analyze_single_message(text, thresholds, motif_flags):
    """Score one message and return its abuse level, matched pattern labels, and top two patterns."""
    motif_hits, matched_phrases = detect_motifs(text)
    sentiment = custom_sentiment(text)
    # Undermining sentiment lowers every threshold by 20% so patterns trigger more easily
    adjusted_thresholds = (
        {k: v * 0.8 for k, v in thresholds.items()}
        if sentiment["label"] == "undermining"
        else thresholds.copy()
    )
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
    threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
    phrase_labels = [label for label, _ in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))
    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
    top_patterns = sorted(zip(LABELS, scores), key=lambda x: x[1], reverse=True)[:2]
    return abuse_level, pattern_labels_used, top_patterns


def analyze_composite(msg1, msg2, msg3, flags):
    """Combine up to three messages and contextual flags into a composite result string."""
    thresholds = THRESHOLDS
    messages = [msg1, msg2, msg3]
    active_messages = [m for m in messages if m.strip()]
    if not active_messages:
        return "Please enter at least one message."
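
    # Each message is scored independently; the composite is the mean of those
    # scores plus a flag-based boost, then dampened when fewer than three
    # messages were provided.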
    results = [analyze_single_message(m, thresholds, flags) for m in active_messages]
    abuse_scores = [r[0] for r in results]
    base_score = sum(abuse_scores) / len(abuse_scores)

    # Count how often each label appears among the top patterns across messages
    label_sets = [[label for label, _ in r[2]] for r in results]
    label_counts = {label: sum(label in s for s in label_sets) for label in set().union(*label_sets)}
    top_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)[:2]
    top_explanations = [EXPLANATIONS.get(label, "") for label, _ in top_labels]

    # Adjust flag-based weight relative to the number of messages
    danger_weight = 5  # currently unused
    flag_weights = {
        "They've threatened harm": 6,
        "They isolate me": 5,
        "I’ve changed my behavior out of fear": 4,
        "They monitor/follow me": 4,
        "I feel unsafe when alone with them": 6
    }
    flag_boost = sum(flag_weights.get(f, 3) for f in flags) / len(active_messages)
    composite_score = min(base_score + flag_boost, 100)

    # Apply message-count dampening AFTER the base score and flag boost
    if len(active_messages) == 1:
        composite_score *= 0.85  # 15% reduction for 1 message
    elif len(active_messages) == 2:
        composite_score *= 0.93  # 7% reduction for 2 messages
    composite_score = round(min(composite_score, 100), 2)  # re-cap just in case

    result = (
        f"These messages show patterns of {', '.join(label for label, _ in top_labels)} "
        f"and are estimated to be {composite_score}% likely abusive."
    )
    for expl in top_explanations:
        if expl:
            result += f"\n• {expl}"
    return result


textbox_inputs = [
    gr.Textbox(label="Message 1"),
    gr.Textbox(label="Message 2"),
    gr.Textbox(label="Message 3")
]

checkboxes = gr.CheckboxGroup(label="Contextual Flags", choices=[
    "They've threatened harm",
    "They isolate me",
    "I’ve changed my behavior out of fear",
    "They monitor/follow me",
    "I feel unsafe when alone with them"
])

iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + [checkboxes],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector (Multi-Message)",
    allow_flagging="manual"
)

if __name__ == "__main__":
    iface.launch()
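
# Example of calling analyze_composite() directly, bypassing the Gradio UI.
# The inputs here are hypothetical, and the reported patterns and percentage
# depend on the loaded models:
#
#   summary = analyze_composite(
#       "You never listen, and this is all your fault.",
#       "",
#       "",
#       ["They isolate me"],
#   )
#   # -> "These messages show patterns of ... and are estimated to be NN% likely abusive."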