Commit e032990 (verified) by SamanthaStorm · Parent(s): 38e8859

Update app.py

Files changed (1): app.py +82 -62

app.py CHANGED
@@ -2,15 +2,18 @@ import gradio as gr
 import torch
 import numpy as np
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from transformers import RobertaForSequenceClassification, RobertaTokenizer
 from motif_tagging import detect_motifs
+from abuse_type_mapping import determine_abuse_type

-# Load models
+# custom fine-tuned sentiment model
 sentiment_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-sentiment")
 sentiment_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-sentiment")

+# Load abuse pattern model
 model_name = "SamanthaStorm/autotrain-c1un8-p8vzo"
-model = AutoModelForSequenceClassification.from_pretrained(model_name)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = RobertaForSequenceClassification.from_pretrained(model_name, trust_remote_code=True)
+tokenizer = RobertaTokenizer.from_pretrained(model_name, trust_remote_code=True)

 LABELS = [
     "gaslighting", "mockery", "dismissiveness", "control", "guilt_tripping", "apology_baiting", "blame_shifting", "projection",
@@ -25,32 +28,36 @@ THRESHOLDS = {
     "non_abusive": 2.0, "suicidal_threat": 0.45, "physical_threat": 0.02, "extreme_control": 0.30
 }

+PATTERN_LABELS = LABELS[:15]
+DANGER_LABELS = LABELS[15:18]
+
 EXPLANATIONS = {
     "gaslighting": "Gaslighting involves making someone question their own reality or perceptions...",
-    "blame_shifting": "Redirecting responsibility to the victim...",
-    "projection": "Accusing the victim of behaviors the abuser exhibits...",
-    "dismissiveness": "Belittling or disregarding someone's feelings...",
-    "mockery": "Ridiculing someone in a hurtful, humiliating way...",
-    "recovery_phase": "Dismissing someone's emotional healing...",
-    "insults": "Derogatory remarks aimed at degrading someone...",
-    "apology_baiting": "Manipulating victims into apologizing for abuse...",
-    "deflection": "Redirecting blame to avoid accountability...",
-    "control": "Restricting autonomy through manipulation...",
-    "extreme_control": "Dominating decisions and behaviors entirely...",
-    "physical_threat": "Signals risk of bodily harm...",
-    "suicidal_threat": "Manipulates others using self-harm threats...",
-    "guilt_tripping": "Uses guilt to manipulate someone's actions...",
-    "manipulation": "Deceives to influence or control outcomes...",
-    "non_abusive": "Respectful and free of coercion...",
-    "obscure_formal": "Uses confusion/superiority to manipulate..."
+    "blame_shifting": "Blame-shifting is when one person redirects the responsibility...",
+    "projection": "Projection involves accusing the victim of behaviors the abuser exhibits.",
+    "dismissiveness": "Dismissiveness is belittling or disregarding another person’s feelings.",
+    "mockery": "Mockery ridicules someone in a hurtful, humiliating way.",
+    "recovery_phase": "Recovery phase dismisses someone's emotional healing process.",
+    "insults": "Insults are derogatory remarks aimed at degrading someone.",
+    "apology_baiting": "Apology-baiting manipulates victims into apologizing for abuser's behavior.",
+    "deflection": "Deflection avoids accountability by redirecting blame.",
+    "control": "Control restricts autonomy through manipulation or coercion.",
+    "extreme_control": "Extreme control dominates decisions and behaviors entirely.",
+    "physical_threat": "Physical threats signal risk of bodily harm.",
+    "suicidal_threat": "Suicidal threats manipulate others using self-harm threats.",
+    "guilt_tripping": "Guilt-tripping uses guilt to manipulate someones actions.",
+    "manipulation": "Manipulation deceives to influence or control outcomes.",
+    "non_abusive": "Non-abusive language is respectful and free of coercion.",
+    "obscure_formal": "Obscure/formal language manipulates through confusion or superiority."
 }

-DANGER_LABELS = LABELS[15:18]
-PATTERN_LABELS = LABELS[:15]
-
 PATTERN_WEIGHTS = {
-    "physical_threat": 1.5, "suicidal_threat": 1.4, "extreme_control": 1.5,
-    "gaslighting": 1.3, "control": 1.2, "dismissiveness": 0.8,
+    "physical_threat": 1.5,
+    "suicidal_threat": 1.4,
+    "extreme_control": 1.5,
+    "gaslighting": 1.3,
+    "control": 1.2,
+    "dismissiveness": 0.8,
     "non_abusive": 0.0
 }

@@ -60,64 +67,77 @@ def custom_sentiment(text):
     outputs = sentiment_model(**inputs)
     probs = torch.nn.functional.softmax(outputs.logits, dim=1)
     label_idx = torch.argmax(probs).item()
-    return {"label": "supportive" if label_idx == 0 else "undermining", "score": probs[0][label_idx].item()}
+    label_map = {0: "supportive", 1: "undermining"}
+    return {"label": label_map[label_idx], "score": probs[0][label_idx].item()}

 def calculate_abuse_level(scores, thresholds, motif_hits=None):
     weighted_scores = [score * PATTERN_WEIGHTS.get(label, 1.0) for label, score in zip(LABELS, scores) if score > thresholds[label]]
     base_score = round(np.mean(weighted_scores) * 100, 2) if weighted_scores else 0.0
-    if any(label in (motif_hits or []) for label in DANGER_LABELS):
+    motif_hits = motif_hits or []
+    if any(label in motif_hits for label in {"physical_threat", "suicidal_threat", "extreme_control"}):
         base_score = max(base_score, 75.0)
     return base_score

 def interpret_abuse_level(score):
-    if score > 80: return "Extreme / High Risk"
-    if score > 60: return "Severe / Harmful Pattern Present"
-    if score > 40: return "Likely Abuse"
-    if score > 20: return "Mild Concern"
+    if score > 80:
+        return "Extreme / High Risk"
+    elif score > 60:
+        return "Severe / Harmful Pattern Present"
+    elif score > 40:
+        return "Likely Abuse"
+    elif score > 20:
+        return "Mild Concern"
     return "Very Low / Likely Safe"

-def analyze_single_message(text, thresholds, context_flags):
-    motif_flags, matched_phrases = detect_motifs(text)
+def analyze_single_message(text, thresholds, motif_flags):
+    motif_hits, matched_phrases = detect_motifs(text)
     sentiment = custom_sentiment(text)
-    thresholds = {k: v * 0.8 for k, v in thresholds.items()} if sentiment['label'] == "undermining" else thresholds.copy()
+    adjusted_thresholds = {k: v * 0.8 for k, v in thresholds.items()} if sentiment['label'] == "undermining" else thresholds.copy()
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
     with torch.no_grad():
-        scores = torch.sigmoid(model(**inputs).logits.squeeze(0)).numpy()
-    labels_used = list(set([l for l, s in zip(PATTERN_LABELS, scores[:15]) if s > thresholds[l]] + [l for l, _ in matched_phrases]))
-    abuse_level = calculate_abuse_level(scores, thresholds, motif_hits=[l for l, _ in matched_phrases])
+        outputs = model(**inputs)
+        scores = torch.sigmoid(outputs.logits.squeeze(0)).numpy()
+    threshold_labels = [label for label, score in zip(PATTERN_LABELS, scores[:15]) if score > adjusted_thresholds[label]]
+    phrase_labels = [label for label, _ in matched_phrases]
+    pattern_labels_used = list(set(threshold_labels + phrase_labels))
+    abuse_level = calculate_abuse_level(scores, adjusted_thresholds, motif_hits)
     abuse_description = interpret_abuse_level(abuse_level)
-    danger_count = sum(scores[LABELS.index(lbl)] > thresholds[lbl] for lbl in DANGER_LABELS)
-    output = f"Score: {abuse_level}% {abuse_description}\nLabels: {', '.join(labels_used)}"
-    return output, abuse_level
+    top_patterns = sorted([(label, score) for label, score in zip(PATTERN_LABELS, scores[:15]) if label != "non_abusive"], key=lambda x: x[1], reverse=True)[:2]
+    pattern_expl = "\n".join([f" {label.replace('_', ' ').title()}: {EXPLANATIONS.get(label)}" for label, _ in top_patterns])
+    return abuse_level, abuse_description, pattern_expl

 def analyze_composite(msg1, msg2, msg3, flags):
-    thresholds = THRESHOLDS.copy()
-    results = [analyze_single_message(t, thresholds, flags) for t in [msg1, msg2, msg3] if t.strip()]
-    result_texts = [r[0] for r in results]
-    composite_score = round(np.mean([r[1] for r in results]), 2) if results else 0.0
-    result_texts.append(f"\nComposite Abuse Score: {composite_score}%")
-    return tuple(result_texts)
+    thresholds = THRESHOLDS
+    results = [analyze_single_message(m, thresholds, flags) for m in [msg1, msg2, msg3] if m.strip()]
+    if not results:
+        return "Please enter at least one message."
+    result_lines = []
+    total_score = 0
+    for i, (score, desc, patterns) in enumerate(results, 1):
+        total_score += score
+        result_lines.append(f"Message {i}: {score:.2f}% – {desc}\n{patterns}\n")
+    composite = round(total_score / len(results), 2)
+    result_lines.append(f"\nComposite Abuse Score: {composite}%")
+    return "\n\n".join(result_lines)
+
+txt_inputs = [
+    gr.Textbox(label="Message 1"),
+    gr.Textbox(label="Message 2"),
+    gr.Textbox(label="Message 3")
+]
+
+checkboxes = gr.CheckboxGroup(label="Contextual Flags", choices=[
+    "They've threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
+    "They monitor/follow me", "I feel unsafe when alone with them"
+])

 iface = gr.Interface(
     fn=analyze_composite,
-    inputs=[
-        gr.Textbox(lines=3, label="Message 1"),
-        gr.Textbox(lines=3, label="Message 2"),
-        gr.Textbox(lines=3, label="Message 3"),
-        gr.CheckboxGroup(label="Contextual Flags", choices=[
-            "They've threatened harm", "They isolate me", "I’ve changed my behavior out of fear",
-            "They monitor/follow me", "I feel unsafe when alone with them"
-        ])
-    ],
-    outputs=[
-        gr.Textbox(label="Message 1 Result"),
-        gr.Textbox(label="Message 2 Result"),
-        gr.Textbox(label="Message 3 Result"),
-        gr.Textbox(label="Composite Score")
-    ],
+    inputs=txt_inputs + [checkboxes],
+    outputs=gr.Textbox(label="Results"),
     title="Abuse Pattern Detector (Multi-Message)",
-    flagging_mode="manual"
+    allow_flagging="manual"
 )

 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
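
Note: the updated analyze_single_message unpacks detect_motifs(text) into (motif_hits, matched_phrases) and passes motif_hits straight into calculate_abuse_level, so motif_tagging is expected to return motif label names alongside the matched phrases. A minimal sketch of that assumed contract (the patterns below are illustrative placeholders, not the real motif_tagging module that ships with the Space):

# Sketch only: the real motif_tagging module's labels and patterns will differ.
import re

MOTIF_PATTERNS = {  # hypothetical phrase patterns keyed by the danger labels
    "physical_threat": [r"\bi('| wi)ll hurt you\b"],
    "suicidal_threat": [r"\bi('| wi)ll (kill|hurt) myself\b"],
    "extreme_control": [r"\byou('re| are) not allowed to\b"],
}

def detect_motifs(text):
    """Return (motif_hits, matched_phrases) for one message.

    motif_hits: label names whose patterns matched, e.g. ["physical_threat"].
    matched_phrases: (label, phrase) tuples, matching how app.py unpacks
    [label for label, _ in matched_phrases].
    """
    motif_hits, matched_phrases = [], []
    for label, patterns in MOTIF_PATTERNS.items():
        for pattern in patterns:
            match = re.search(pattern, text, flags=re.IGNORECASE)
            if match:
                motif_hits.append(label)
                matched_phrases.append((label, match.group(0)))
                break  # one match per label is enough for the 75% escalation floor
    return motif_hits, matched_phrases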
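
Since the Interface now returns one combined Results textbox instead of four separate outputs, the formatting can be previewed without launching the Gradio app. An illustrative check from a Python shell in the Space repo (the messages are made up, and importing app downloads both models):

from app import analyze_composite

preview = analyze_composite(
    "You never listen and it's always my fault.",   # made-up example messages
    "If you keep this up I don't know what I'll do.",
    "",   # blank messages are skipped by the strip() filter
    [],   # no contextual flags selected
)
print(preview)  # per-message scores with top pattern explanations, then the "Composite Abuse Score: ...%" line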