SamanthaStorm committed on
Commit 3164573 · verified · 1 Parent(s): 0407a58

Update app.py

Files changed (1)
  1. app.py +69 -14
app.py CHANGED
@@ -794,7 +794,34 @@ def compute_abuse_score(matched_scores, sentiment):
     except Exception as e:
         logger.error(f"Error computing abuse score: {e}")
         return 0.0
-
+
+def detect_explicit_abuse(text):
+    """Improved explicit abuse detection with word boundary checking"""
+    import re
+
+    explicit_abuse_words = ['fuck', 'bitch', 'shit', 'dick']  # Removed 'ass'
+    # Add more specific patterns for actual abusive uses of 'ass'
+    abusive_ass_patterns = [
+        r'\bass\b(?!\s*glass)',  # 'ass' not followed by 'glass'
+        r'\bdumb\s*ass\b',
+        r'\bkiss\s*my\s*ass\b',
+        r'\bget\s*your\s*ass\b'
+    ]
+
+    text_lower = text.lower()
+
+    # Check basic explicit words
+    for word in explicit_abuse_words:
+        if re.search(r'\b' + word + r'\b', text_lower):
+            return True
+
+    # Check specific abusive 'ass' patterns
+    for pattern in abusive_ass_patterns:
+        if re.search(pattern, text_lower):
+            return True
+
+    return False
+
 @spaces.GPU
 def analyze_single_message(text, thresholds):
     """Analyze a single message for abuse patterns"""
@@ -805,10 +832,33 @@ def analyze_single_message(text, thresholds):
     if not text.strip():
         logger.debug("Empty text, returning zeros")
         return 0.0, [], [], {"label": "none"}, 1, 0.0, None
-
-    # Check for explicit abuse
-    explicit_abuse_words = ['fuck', 'bitch', 'shit', 'ass', 'dick']
-    explicit_abuse = any(word in text.lower() for word in explicit_abuse_words)
+
+    # EARLY SUPPORTIVE MESSAGE CHECK
+    innocent_indicators = [
+        'broken', 'not working', 'cracked', 'glass', 'screen', 'phone',
+        'device', 'battery', 'charger', 'wifi', 'internet', 'computer',
+        'sorry', 'apologize', 'my fault', 'mistake'
+    ]
+
+    # If message contains innocent indicators and is short/simple
+    if (any(indicator in text.lower() for indicator in innocent_indicators) and
+        len(text.split()) < 20 and
+        not any(threat in text.lower() for threat in ['kill', 'hurt', 'destroy', 'hate'])):
+
+        # Run quick sentiment check
+        sent_inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+        sent_inputs = {k: v.to(device) for k, v in sent_inputs.items()}
+        with torch.no_grad():
+            sent_logits = sentiment_model(**sent_inputs).logits[0]
+            sent_probs = torch.softmax(sent_logits, dim=-1).cpu().numpy()
+
+        # If sentiment is strongly supportive, return early
+        if sent_probs[0] > 0.8:  # 80% supportive
+            logger.debug("Early return: Message appears to be innocent/supportive")
+            return 0.0, [], [], {"label": "supportive"}, 1, 0.0, "neutral"
+
+    # Check for explicit abuse (moved AFTER early return check)
+    explicit_abuse = detect_explicit_abuse(text)
     logger.debug(f"Explicit abuse detected: {explicit_abuse}")

     # Abuse model inference
@@ -845,7 +895,7 @@ def analyze_single_message(text, thresholds):
             if label not in threshold_labels:
                 threshold_labels.append(label)

-    logger.debug("\nLabels that passed thresholds:", threshold_labels)
+    logger.debug(f"\nLabels that passed thresholds: {threshold_labels}")

     # Calculate matched scores
     matched_scores = []
@@ -884,15 +934,20 @@ def analyze_single_message(text, thresholds):
     abuse_score = compute_abuse_score(matched_scores, sentiment)
     if explicit_abuse:
         abuse_score = max(abuse_score, 70.0)
+
+    # Apply sentiment-based score capping BEFORE compound threat check
+    if sentiment == "supportive" and not explicit_abuse:
+        # For supportive messages, cap the abuse score much lower
+        abuse_score = min(abuse_score, 30.0)
+        logger.debug(f"Capped abuse score to {abuse_score} due to supportive sentiment")

     # Check for compound threats
-    compound_threat_flag, threat_type = detect_compound_threat(
-        text, threshold_labels
-    )
+    compound_threat_flag, threat_type = detect_compound_threat(text, threshold_labels)

-    if compound_threat_flag:
+    # Apply compound threat override only for non-supportive messages
+    if compound_threat_flag and sentiment != "supportive":
         logger.debug(f"⚠️ Compound threat detected in message: {threat_type}")
-        abuse_score = max(abuse_score, 85.0)  # Force high score for compound threats
+        abuse_score = max(abuse_score, 85.0)

     # Get DARVO score
     darvo_score = predict_darvo_score(text)
@@ -903,11 +958,11 @@ def analyze_single_message(text, thresholds):
     # Log tone usage
     log_emotional_tone_usage(tone_tag, threshold_labels)

-    # Check for the specific combination
-    highest_pattern = max(matched_scores, key=lambda x: x[1])[0] if matched_scores else None  # Get highest pattern
+    # Check for the specific combination (final safety check)
+    highest_pattern = max(matched_scores, key=lambda x: x[1])[0] if matched_scores else None
     if sentiment == "supportive" and tone_tag == "neutral" and highest_pattern == "obscure language":
         logger.debug("Message classified as likely non-abusive (supportive, neutral, and obscure language). Returning low risk.")
-        return 0.0, [], [], {"label": "supportive"}, 1, 0.0, "neutral"  # Return non-abusive values
+        return 0.0, [], [], {"label": "supportive"}, 1, 0.0, "neutral"

     # Set stage
     stage = 2 if explicit_abuse or abuse_score > 70 else 1
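
For reference, a minimal standalone sketch of the word-boundary check that the new detect_explicit_abuse introduces above. Only the two pattern lists are taken from the diff; the helper name, the sample messages, and the __main__ harness are illustrative assumptions, not part of app.py.

```python
# Illustrative sketch only: mirrors the word-boundary logic added in this commit.
# The sample messages below are hypothetical and not taken from app.py.
import re

EXPLICIT_WORDS = ['fuck', 'bitch', 'shit', 'dick']   # base list from the diff ('ass' removed)
ABUSIVE_ASS_PATTERNS = [                             # targeted 'ass' patterns from the diff
    r'\bass\b(?!\s*glass)',
    r'\bdumb\s*ass\b',
    r'\bkiss\s*my\s*ass\b',
    r'\bget\s*your\s*ass\b',
]

def detect_explicit_abuse_sketch(text: str) -> bool:
    """Word-boundary matching: 'glass' no longer triggers the 'ass' check."""
    text_lower = text.lower()
    if any(re.search(r'\b' + word + r'\b', text_lower) for word in EXPLICIT_WORDS):
        return True
    return any(re.search(pattern, text_lower) for pattern in ABUSIVE_ASS_PATTERNS)

if __name__ == "__main__":
    # The old substring check would have flagged the first message because "glass" contains "ass".
    print(detect_explicit_abuse_sketch("Sorry, my phone screen glass is cracked"))  # False
    print(detect_explicit_abuse_sketch("Get your ass over here now"))               # True
```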