SamanthaStorm committed on
Commit 3164573 · verified · 1 Parent(s): 0407a58

Update app.py

Files changed (1)
  1. app.py +69 -14
app.py CHANGED
@@ -794,7 +794,34 @@ def compute_abuse_score(matched_scores, sentiment):
     except Exception as e:
         logger.error(f"Error computing abuse score: {e}")
         return 0.0
-
+
+def detect_explicit_abuse(text):
+    """Improved explicit abuse detection with word boundary checking"""
+    import re
+
+    explicit_abuse_words = ['fuck', 'bitch', 'shit', 'dick']  # Removed 'ass'
+    # Add more specific patterns for actual abusive uses of 'ass'
+    abusive_ass_patterns = [
+        r'\bass\b(?!\s*glass)',  # 'ass' not followed by 'glass'
+        r'\bdumb\s*ass\b',
+        r'\bkiss\s*my\s*ass\b',
+        r'\bget\s*your\s*ass\b'
+    ]
+
+    text_lower = text.lower()
+
+    # Check basic explicit words
+    for word in explicit_abuse_words:
+        if re.search(r'\b' + word + r'\b', text_lower):
+            return True
+
+    # Check specific abusive 'ass' patterns
+    for pattern in abusive_ass_patterns:
+        if re.search(pattern, text_lower):
+            return True
+
+    return False
+
 @spaces.GPU
 def analyze_single_message(text, thresholds):
     """Analyze a single message for abuse patterns"""
@@ -805,10 +832,33 @@ def analyze_single_message(text, thresholds):
     if not text.strip():
         logger.debug("Empty text, returning zeros")
         return 0.0, [], [], {"label": "none"}, 1, 0.0, None
-
-    # Check for explicit abuse
-    explicit_abuse_words = ['fuck', 'bitch', 'shit', 'ass', 'dick']
-    explicit_abuse = any(word in text.lower() for word in explicit_abuse_words)
+
+    # EARLY SUPPORTIVE MESSAGE CHECK
+    innocent_indicators = [
+        'broken', 'not working', 'cracked', 'glass', 'screen', 'phone',
+        'device', 'battery', 'charger', 'wifi', 'internet', 'computer',
+        'sorry', 'apologize', 'my fault', 'mistake'
+    ]
+
+    # If message contains innocent indicators and is short/simple
+    if (any(indicator in text.lower() for indicator in innocent_indicators) and
+        len(text.split()) < 20 and
+        not any(threat in text.lower() for threat in ['kill', 'hurt', 'destroy', 'hate'])):
+
+        # Run quick sentiment check
+        sent_inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+        sent_inputs = {k: v.to(device) for k, v in sent_inputs.items()}
+        with torch.no_grad():
+            sent_logits = sentiment_model(**sent_inputs).logits[0]
+            sent_probs = torch.softmax(sent_logits, dim=-1).cpu().numpy()
+
+        # If sentiment is strongly supportive, return early
+        if sent_probs[0] > 0.8:  # 80% supportive
+            logger.debug("Early return: Message appears to be innocent/supportive")
+            return 0.0, [], [], {"label": "supportive"}, 1, 0.0, "neutral"
+
+    # Check for explicit abuse (moved AFTER early return check)
+    explicit_abuse = detect_explicit_abuse(text)
     logger.debug(f"Explicit abuse detected: {explicit_abuse}")

     # Abuse model inference
@@ -845,7 +895,7 @@ def analyze_single_message(text, thresholds):
             if label not in threshold_labels:
                 threshold_labels.append(label)

-    logger.debug("\nLabels that passed thresholds:", threshold_labels)
+    logger.debug(f"\nLabels that passed thresholds: {threshold_labels}")

     # Calculate matched scores
     matched_scores = []
@@ -884,15 +934,20 @@ def analyze_single_message(text, thresholds):
     abuse_score = compute_abuse_score(matched_scores, sentiment)
     if explicit_abuse:
         abuse_score = max(abuse_score, 70.0)
+
+    # Apply sentiment-based score capping BEFORE compound threat check
+    if sentiment == "supportive" and not explicit_abuse:
+        # For supportive messages, cap the abuse score much lower
+        abuse_score = min(abuse_score, 30.0)
+        logger.debug(f"Capped abuse score to {abuse_score} due to supportive sentiment")

     # Check for compound threats
-    compound_threat_flag, threat_type = detect_compound_threat(
-        text, threshold_labels
-    )
+    compound_threat_flag, threat_type = detect_compound_threat(text, threshold_labels)

-    if compound_threat_flag:
+    # Apply compound threat override only for non-supportive messages
+    if compound_threat_flag and sentiment != "supportive":
         logger.debug(f"⚠️ Compound threat detected in message: {threat_type}")
-        abuse_score = max(abuse_score, 85.0)  # Force high score for compound threats
+        abuse_score = max(abuse_score, 85.0)

     # Get DARVO score
     darvo_score = predict_darvo_score(text)
@@ -903,11 +958,11 @@ def analyze_single_message(text, thresholds):
     # Log tone usage
     log_emotional_tone_usage(tone_tag, threshold_labels)

-    # Check for the specific combination
-    highest_pattern = max(matched_scores, key=lambda x: x[1])[0] if matched_scores else None  # Get highest pattern
+    # Check for the specific combination (final safety check)
+    highest_pattern = max(matched_scores, key=lambda x: x[1])[0] if matched_scores else None
     if sentiment == "supportive" and tone_tag == "neutral" and highest_pattern == "obscure language":
         logger.debug("Message classified as likely non-abusive (supportive, neutral, and obscure language). Returning low risk.")
-        return 0.0, [], [], {"label": "supportive"}, 1, 0.0, "neutral"  # Return non-abusive values
+        return 0.0, [], [], {"label": "supportive"}, 1, 0.0, "neutral"

     # Set stage
     stage = 2 if explicit_abuse or abuse_score > 70 else 1
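
For reference, a minimal standalone sketch of the word-boundary check that the new detect_explicit_abuse introduces above. Only the two pattern lists are taken from the diff; the helper name, the sample messages, and the __main__ harness are illustrative assumptions, not part of app.py.

```python
# Illustrative sketch only: mirrors the word-boundary logic added in this commit.
# The sample messages below are hypothetical and not taken from app.py.
import re

EXPLICIT_WORDS = ['fuck', 'bitch', 'shit', 'dick']   # base list from the diff ('ass' removed)
ABUSIVE_ASS_PATTERNS = [                             # targeted 'ass' patterns from the diff
    r'\bass\b(?!\s*glass)',
    r'\bdumb\s*ass\b',
    r'\bkiss\s*my\s*ass\b',
    r'\bget\s*your\s*ass\b',
]

def detect_explicit_abuse_sketch(text: str) -> bool:
    """Word-boundary matching: 'glass' no longer triggers the 'ass' check."""
    text_lower = text.lower()
    if any(re.search(r'\b' + word + r'\b', text_lower) for word in EXPLICIT_WORDS):
        return True
    return any(re.search(pattern, text_lower) for pattern in ABUSIVE_ASS_PATTERNS)

if __name__ == "__main__":
    # The old substring check would have flagged the first message because "glass" contains "ass".
    print(detect_explicit_abuse_sketch("Sorry, my phone screen glass is cracked"))  # False
    print(detect_explicit_abuse_sketch("Get your ass over here now"))               # True
```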