import gradio as gr
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from motif_tagging import detect_motifs

# Custom fine-tuned sentiment model
sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")

# Load abuse pattern model
model_name = "SamanthaStorm/autotrain-jlp14-mllvp"
model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)

LABELS = [
    "gaslighting", "mockery", "dismissiveness", "control", "guilt_tripping",
    "apology_baiting", "blame_shifting", "projection", "contradictory_statements",
    "manipulation", "deflection", "insults", "obscure_formal", "recovery_phase",
]

# Per-label probability thresholds; a label counts as detected when its score exceeds its threshold.
THRESHOLDS = {
    "gaslighting": 0.25,
    "mockery": 0.15,
    "dismissiveness": 0.45,
    "control": 0.43,
    "guilt_tripping": 0.15,
    "apology_baiting": 0.2,
    "blame_shifting": 0.23,
    "projection": 0.50,
    "contradictory_statements": 0.25,
    "manipulation": 0.25,
    "deflection": 0.30,
    "insults": 0.34,
    "obscure_formal": 0.25,
    "recovery_phase": 0.25,
}

EXPLANATIONS = {
    "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
    "insults": "Insults are derogatory remarks aimed at degrading someone.",
    "apology_baiting": "Apology-baiting manipulates victims into apologizing for abuser's behavior.",
    "deflection": "Deflection avoids accountability by redirecting blame.",
    "control": "Control restricts autonomy through manipulation or coercion.",
    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someone’s actions.",
    "manipulation": "Manipulation deceives to influence or control outcomes.",
    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority.",
}

PATTERN_WEIGHTS = {
    "gaslighting": 1.3,
    "mockery": 1.2,
    "control": 1.2,
    "dismissiveness": 0.8,
}


def custom_sentiment(text):
    """Classify a message as supportive or undermining with the fine-tuned sentiment model."""
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    label_idx = torch.argmax(probs).item()
    label_map = {0: "supportive", 1: "undermining"}
    return {"label": label_map[label_idx], "score": probs[0][label_idx].item()}


def calculate_abuse_level(scores, thresholds, motif_hits=None, flag_multiplier=1.0):
    """Mean of weighted scores for labels above threshold, scaled to 0-100 and boosted by the flag multiplier."""
    weighted_scores = [
        score * PATTERN_WEIGHTS.get(label, 1.0)
        for label, score in zip(LABELS, scores)
        if score > thresholds[label]
    ]
    base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
    base_score *= flag_multiplier
    return min(base_score, 100.0)


def interpret_abuse_level(score):
    if score > 80:
        return "Extreme / High Risk"
    elif score > 60:
        return "Severe / Harmful Pattern Present"
    elif score > 40:
        return "Likely Abuse"
    elif score > 20:
        return "Mild Concern"
    return "Very Low / Likely Safe"


def analyze_single_message(text, thresholds, motif_flags, flag_multiplier=1.0):
    motif_hits, matched_phrases = detect_motifs(text)
    sentiment = custom_sentiment(text)
    # Lower every threshold by 20% when the sentiment model reads the message as undermining.
    adjusted_thresholds = (
        {k: v * 0.8 for k, v in thresholds.items()}
        if sentiment["label"] == "undermining"
        else thresholds.copy()
    )
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
    threshold_labels = [label for label, score in zip(LABELS, scores) if score > adjusted_thresholds[label]]
    phrase_labels = [label for label, _ in matched_phrases]
    pattern_labels_used = list(set(threshold_labels + phrase_labels))
    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits, flag_multiplier)
    top_patterns = sorted([(label, score) for label, score in zip(LABELS, scores)], key=lambda x: x[1], reverse=True)[:2]
    return abuse_level, pattern_labels_used, top_patterns


def analyze_composite(msg1, msg2, msg3, flags):
    thresholds = THRESHOLDS
    messages = [msg1, msg2, msg3]
    active_messages = [m for m in messages if m.strip()]
    if not active_messages:
        return "Please enter at least one message."
    flag_multiplier = 1 + (0.1 * len(flags))  # each checked flag increases weight by 10%
    results = [analyze_single_message(m, thresholds, flags, flag_multiplier) for m in active_messages]
    abuse_scores = [r[0] for r in results]
    composite_score = round(sum(abuse_scores) / len(abuse_scores), 2)
    label_sets = [label for result in results for label in result[1]]
    label_counts = {label: label_sets.count(label) for label in set(label_sets)}
    top_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)[:2]
    top_explanations = [EXPLANATIONS.get(label, "") for label, _ in top_labels]
    result = (
        f"These messages show patterns of {', '.join(label for label, _ in top_labels)} "
        f"and are estimated to be {composite_score}% likely abusive."
    )
    for expl in top_explanations:
        if expl:
            result += f"\n• {expl}"
    return result


textbox_inputs = [
    gr.Textbox(label="Message 1"),
    gr.Textbox(label="Message 2"),
    gr.Textbox(label="Message 3"),
]

checkboxes = gr.CheckboxGroup(
    label="Contextual Flags",
    choices=[
        "They've threatened harm",
        "They isolate me",
        "I’ve changed my behavior out of fear",
        "They monitor/follow me",
        "I feel unsafe when alone with them",
    ],
)

iface = gr.Interface(
    fn=analyze_composite,
    inputs=textbox_inputs + [checkboxes],
    outputs=gr.Textbox(label="Results"),
    title="Abuse Pattern Detector (Multi-Message)",
    allow_flagging="manual",
)

if __name__ == "__main__":
    iface.launch()
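# Example (hypothetical messages): analyze_composite() can also be called directly,
# e.g. for a quick smoke test without launching the Gradio UI. The sample text and
# flag below are illustrative placeholders only.
#
# print(analyze_composite(
#     "You always twist my words.",  # hypothetical message
#     "",                            # empty slots are ignored
#     "",
#     ["They isolate me"],           # one contextual flag -> 1.1x multiplier
# ))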