Spaces:
Runtime error
Runtime error
File size: 5,944 Bytes
73a6a7e 71192d1 73a6a7e ce2ce69 71192d1 73a6a7e 71192d1 73a6a7e 71192d1 73a6a7e 71192d1 73a6a7e 71192d1 73a6a7e 71192d1 73a6a7e ce2ce69 73a6a7e 71192d1 73a6a7e ce2ce69 73a6a7e ce2ce69 73a6a7e 71192d1 ce2ce69 71192d1 73a6a7e ce2ce69 73a6a7e ce2ce69 73a6a7e ce2ce69 73a6a7e ce2ce69 71192d1 73a6a7e ce2ce69 73a6a7e ce2ce69 73a6a7e ce2ce69 71192d1 73a6a7e 71192d1 ce2ce69 73a6a7e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import logging
import yaml
from pathlib import Path
from typing import List, Dict
from app.services.base import load_spacy_model
from app.core.config import settings, APP_NAME, SPACY_MODEL_ID
from app.core.exceptions import ServiceError
# Module-level logger, namespaced under the application name for filtering.
logger = logging.getLogger(f"{APP_NAME}.services.inclusive_language")
class InclusiveLanguageChecker:
    """
    Checks text for non-inclusive language using YAML rule files and spaCy.

    Rules are loaded eagerly at construction; the spaCy pipeline and the
    PhraseMatcher are created lazily on the first call to :meth:`check`.
    """

    def __init__(self, rules_directory: str = settings.INCLUSIVE_RULES_DIR):
        # NLP pipeline and matcher are expensive; defer until first use.
        self._nlp = None
        self.matcher = None
        self.rules = self._load_inclusive_rules(Path(rules_directory))

    def _load_inclusive_rules(self, rules_path: Path) -> Dict[str, Dict]:
        """
        Load YAML-based inclusive language rules from the given directory.

        Each rule file must contain a list of mappings with the keys
        ``inconsiderate`` (str or list), ``considerate`` (str or list),
        ``note``, ``source`` and ``type``. Malformed entries are skipped
        with a warning rather than failing the whole load.

        Returns:
            Dict mapping each lower-cased inconsiderate phrase to its rule.

        Raises:
            ServiceError: if the directory is missing or a file cannot be
                read or parsed.
        """
        if not rules_path.is_dir():
            logger.error("Inclusive language rules directory not found: %s", rules_path)
            raise ServiceError(
                status_code=500,
                detail=f"Inclusive language rules directory not found: {rules_path}"
            )
        rules: Dict[str, Dict] = {}
        # Sort for deterministic precedence when the same phrase appears in
        # multiple files (later files overwrite earlier entries). Accept both
        # common YAML extensions.
        yaml_files = sorted(
            list(rules_path.glob("*.yml")) + list(rules_path.glob("*.yaml"))
        )
        for yaml_file in yaml_files:
            try:
                with yaml_file.open(encoding="utf-8") as f:
                    rule_list = yaml.safe_load(f)
                if not isinstance(rule_list, list):
                    logger.warning("Skipping non-list rule file: %s", yaml_file)
                    continue
                for rule in rule_list:
                    # A malformed entry should not take down every rule.
                    if not isinstance(rule, dict):
                        logger.warning("Skipping non-mapping rule entry in %s", yaml_file)
                        continue
                    inconsiderate = rule.get("inconsiderate", [])
                    considerate = rule.get("considerate", [])
                    # Normalize scalars to single-element lists.
                    if isinstance(inconsiderate, str):
                        inconsiderate = [inconsiderate]
                    if isinstance(considerate, str):
                        considerate = [considerate]
                    for phrase in inconsiderate:
                        if not isinstance(phrase, str):
                            logger.warning("Skipping non-string phrase in %s", yaml_file)
                            continue
                        rules[phrase.lower()] = {
                            "considerate": considerate,
                            "note": rule.get("note", ""),
                            "source": rule.get("source", ""),
                            "type": rule.get("type", "basic"),
                        }
            except (OSError, yaml.YAMLError) as e:
                logger.error("Error loading rule file %s: %s", yaml_file, e, exc_info=True)
                raise ServiceError(
                    status_code=500,
                    detail=f"Failed to load inclusive language rules: {e}"
                )
        logger.info("Loaded %d inclusive language rules from %s", len(rules), rules_path)
        return rules

    def _get_nlp(self):
        """
        Lazy-loads the spaCy model for NLP processing.
        """
        if self._nlp is None:
            self._nlp = load_spacy_model(SPACY_MODEL_ID)
        return self._nlp

    def _init_matcher(self, nlp):
        """
        Initializes a spaCy PhraseMatcher (case-insensitive, via the LOWER
        attribute) over all loaded rule phrases.
        """
        from spacy.matcher import PhraseMatcher
        matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        for phrase in self.rules:
            matcher.add(phrase, [nlp.make_doc(phrase)])
        logger.info("PhraseMatcher initialized with %d phrases.", len(self.rules))
        return matcher

    @staticmethod
    def _issue(span, rule: Dict) -> Dict:
        """Build one issue payload for a matched span (phrase or single token)."""
        # Span.start_char / end_char are equivalent to the manual
        # token.idx + len(token) arithmetic, but clearer.
        return {
            "term": span.text,
            "type": rule["type"],
            "note": rule["note"],
            "suggestions": rule["considerate"],
            "context": span.sent.text,
            "start_char": span.start_char,
            "end_char": span.end_char,
            "source": rule["source"],
        }

    async def check(self, text: str) -> dict:
        """
        Checks a string for non-inclusive language based on rule definitions.

        Args:
            text: Input text; leading/trailing whitespace is stripped.

        Returns:
            ``{"issues": [...]}`` where each issue carries the matched term,
            rule metadata, suggestions, sentence context and char offsets.

        Raises:
            ServiceError: 400 for empty input, 500 on internal failure.
        """
        text = text.strip()
        if not text:
            raise ServiceError(status_code=400, detail="Input text is empty for inclusive language check.")
        try:
            nlp = self._get_nlp()
            if self.matcher is None:
                self.matcher = self._init_matcher(nlp)
            doc = nlp(text)
            results = []
            matched_spans = set()
            # Pass 1: exact (multi-)word phrase matches.
            for match_id, start, end in self.matcher(doc):
                if any(s <= start < e or s < end <= e for s, e in matched_spans):
                    continue  # Avoid overlapping matches
                matched_spans.add((start, end))
                rule = self.rules.get(nlp.vocab.strings[match_id].lower())
                if rule:
                    results.append(self._issue(doc[start:end], rule))
            # Pass 2: single-token lemma fallback (catches inflected forms).
            for token in doc:
                # Skip any token already covered by a phrase match — the old
                # exact-span check double-reported tokens inside longer phrases.
                if any(s <= token.i < e for s, e in matched_spans):
                    continue
                rule = self.rules.get(token.lemma_.lower())
                if rule:
                    results.append(self._issue(doc[token.i:token.i + 1], rule))
            return {"issues": results}
        except ServiceError:
            raise  # Preserve intentional status codes; don't wrap as 500.
        except Exception as e:
            logger.error("Inclusive language check error for text: '%s...'", text[:50], exc_info=True)
            raise ServiceError(
                status_code=500,
                detail="An internal error occurred during inclusive language checking."
            ) from e
|