Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -794,7 +794,34 @@ def compute_abuse_score(matched_scores, sentiment):
     except Exception as e:
         logger.error(f"Error computing abuse score: {e}")
         return 0.0
-
+
+def detect_explicit_abuse(text):
+    """Improved explicit abuse detection with word boundary checking"""
+    import re
+
+    explicit_abuse_words = ['fuck', 'bitch', 'shit', 'dick']  # Removed 'ass'
+    # Add more specific patterns for actual abusive uses of 'ass'
+    abusive_ass_patterns = [
+        r'\bass\b(?!\s*glass)',  # 'ass' not followed by 'glass'
+        r'\bdumb\s*ass\b',
+        r'\bkiss\s*my\s*ass\b',
+        r'\bget\s*your\s*ass\b'
+    ]
+
+    text_lower = text.lower()
+
+    # Check basic explicit words
+    for word in explicit_abuse_words:
+        if re.search(r'\b' + word + r'\b', text_lower):
+            return True
+
+    # Check specific abusive 'ass' patterns
+    for pattern in abusive_ass_patterns:
+        if re.search(pattern, text_lower):
+            return True
+
+    return False
+
 @spaces.GPU
 def analyze_single_message(text, thresholds):
     """Analyze a single message for abuse patterns"""
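Reviewer note: the heart of this hunk is the switch to \b-bounded regexes, so profanity embedded in harmless words no longer fires. A minimal, self-contained illustration (sample strings invented for this note):

import re

PATTERN = re.compile(r'\bass\b')

# 'ass' inside 'glass' or 'associate' has no word boundary around it,
# so \bass\b cannot match there; the standalone word still does.
for sample in ["the glass screen is broken", "my associate called", "kiss my ass"]:
    hit = bool(PATTERN.search(sample.lower()))
    print(sample, "->", hit)
# the glass screen is broken -> False
# my associate called -> False
# kiss my ass -> True

Note that \bass\b already cannot match inside "glass"; the (?!\s*glass) lookahead in the new helper only excludes the standalone word "ass" when "glass" happens to follow it.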
@@ -805,10 +832,33 @@ def analyze_single_message(text, thresholds):
     if not text.strip():
         logger.debug("Empty text, returning zeros")
         return 0.0, [], [], {"label": "none"}, 1, 0.0, None
-
-    #
-
-
+
+    # EARLY SUPPORTIVE MESSAGE CHECK
+    innocent_indicators = [
+        'broken', 'not working', 'cracked', 'glass', 'screen', 'phone',
+        'device', 'battery', 'charger', 'wifi', 'internet', 'computer',
+        'sorry', 'apologize', 'my fault', 'mistake'
+    ]
+
+    # If message contains innocent indicators and is short/simple
+    if (any(indicator in text.lower() for indicator in innocent_indicators) and
+        len(text.split()) < 20 and
+        not any(threat in text.lower() for threat in ['kill', 'hurt', 'destroy', 'hate'])):
+
+        # Run quick sentiment check
+        sent_inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+        sent_inputs = {k: v.to(device) for k, v in sent_inputs.items()}
+        with torch.no_grad():
+            sent_logits = sentiment_model(**sent_inputs).logits[0]
+        sent_probs = torch.softmax(sent_logits, dim=-1).cpu().numpy()
+
+        # If sentiment is strongly supportive, return early
+        if sent_probs[0] > 0.8:  # 80% supportive
+            logger.debug("Early return: Message appears to be innocent/supportive")
+            return 0.0, [], [], {"label": "supportive"}, 1, 0.0, "neutral"
+
+    # Check for explicit abuse (moved AFTER early return check)
+    explicit_abuse = detect_explicit_abuse(text)
     logger.debug(f"Explicit abuse detected: {explicit_abuse}")
 
     # Abuse model inference
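Reviewer note: the early-return branch is a cheap guard: short messages with device or apology vocabulary and no threat words get one sentiment pass, and strongly supportive ones skip the full pipeline. A sketch of just the gating arithmetic, with dummy logits standing in for the sentiment model (the 0.8 cutoff assumes class index 0 is "supportive", as the diff implies):

import torch

# Dummy logits in place of sentiment_model(**sent_inputs).logits[0].
sent_logits = torch.tensor([2.5, 0.3])
sent_probs = torch.softmax(sent_logits, dim=-1).numpy()  # ~[0.90, 0.10]
if sent_probs[0] > 0.8:  # same 80% threshold as the commit
    print("early return: supportive")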
@@ -845,7 +895,7 @@ def analyze_single_message(text, thresholds):
     if label not in threshold_labels:
         threshold_labels.append(label)
 
-    logger.debug("\nLabels that passed thresholds: {threshold_labels}")
+    logger.debug(f"\nLabels that passed thresholds: {threshold_labels}")
 
     # Calculate matched scores
     matched_scores = []
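Reviewer note: the one-line logging change swaps in an f-string (the removed line is truncated in the rendered page, so its exact form above is an educated guess); without the f prefix the braces would be logged literally. For illustration:

threshold_labels = ["insults", "control"]  # example values
print("Labels: {threshold_labels}")        # no f prefix: braces logged literally
print(f"Labels: {threshold_labels}")       # f-string: Labels: ['insults', 'control']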
@@ -884,15 +934,20 @@ def analyze_single_message(text, thresholds):
     abuse_score = compute_abuse_score(matched_scores, sentiment)
     if explicit_abuse:
         abuse_score = max(abuse_score, 70.0)
+
+    # Apply sentiment-based score capping BEFORE compound threat check
+    if sentiment == "supportive" and not explicit_abuse:
+        # For supportive messages, cap the abuse score much lower
+        abuse_score = min(abuse_score, 30.0)
+        logger.debug(f"Capped abuse score to {abuse_score} due to supportive sentiment")
 
     # Check for compound threats
-    compound_threat_flag, threat_type = detect_compound_threat(
-        text, threshold_labels
-    )
+    compound_threat_flag, threat_type = detect_compound_threat(text, threshold_labels)
 
-    if compound_threat_flag:
+    # Apply compound threat override only for non-supportive messages
+    if compound_threat_flag and sentiment != "supportive":
         logger.debug(f"⚠️ Compound threat detected in message: {threat_type}")
-        abuse_score = max(abuse_score, 85.0)
+        abuse_score = max(abuse_score, 85.0)
 
     # Get DARVO score
     darvo_score = predict_darvo_score(text)
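Reviewer note: ordering matters here: the supportive-sentiment cap runs before the compound-threat override, and the override now skips supportive messages, so a benign message can no longer be escalated to 85. A walk-through with invented values:

# Invented example values; mirrors the gating order in the diff.
abuse_score, sentiment, explicit_abuse = 55.0, "supportive", False
compound_threat_flag = True

if sentiment == "supportive" and not explicit_abuse:
    abuse_score = min(abuse_score, 30.0)  # capped: 55.0 -> 30.0

if compound_threat_flag and sentiment != "supportive":
    abuse_score = max(abuse_score, 85.0)  # skipped: sentiment is supportive

print(abuse_score)  # 30.0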
@@ -903,11 +958,11 @@ def analyze_single_message(text, thresholds):
     # Log tone usage
     log_emotional_tone_usage(tone_tag, threshold_labels)
 
-    # Check for the specific combination
-    highest_pattern = max(matched_scores, key=lambda x: x[1])[0] if matched_scores else None
+    # Check for the specific combination (final safety check)
+    highest_pattern = max(matched_scores, key=lambda x: x[1])[0] if matched_scores else None
     if sentiment == "supportive" and tone_tag == "neutral" and highest_pattern == "obscure language":
         logger.debug("Message classified as likely non-abusive (supportive, neutral, and obscure language). Returning low risk.")
-        return 0.0, [], [], {"label": "supportive"}, 1, 0.0, "neutral"
+        return 0.0, [], [], {"label": "supportive"}, 1, 0.0, "neutral"
 
     # Set stage
     stage = 2 if explicit_abuse or abuse_score > 70 else 1
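Reviewer note: judging only from the return statements visible in this diff, the 7-tuple from analyze_single_message reads as (abuse_score, threshold_labels, matched_scores, sentiment, stage, darvo_score, tone_tag); that mapping is inferred, not confirmed elsewhere on the page. The closing context lines reduce the stage decision to a one-liner, shown here as a hypothetical helper:

def assign_stage(explicit_abuse: bool, abuse_score: float) -> int:
    # Stage 2 for explicit abuse or a high composite score, else stage 1.
    return 2 if explicit_abuse or abuse_score > 70 else 1

print(assign_stage(False, 30.0))  # 1
print(assign_stage(True, 30.0))   # 2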
|