# app/services/inclusive_language.py
import logging
from pathlib import Path
from typing import List, Dict

import yaml

from app.services.base import load_spacy_model
from app.core.config import settings, APP_NAME, SPACY_MODEL_ID
from app.core.exceptions import ServiceError

logger = logging.getLogger(f"{APP_NAME}.services.inclusive_language")


class InclusiveLanguageChecker:
    """Checks text for non-inclusive language using YAML-defined rules."""

    def __init__(self, rules_directory: str = settings.INCLUSIVE_RULES_DIR):
        self._nlp = None
        self.matcher = None
        self.rules = self._load_inclusive_rules(Path(rules_directory))

    def _load_inclusive_rules(self, rules_path: Path) -> Dict[str, Dict]:
        """
        Load YAML-based inclusive language rules from the given directory.
        """
        if not rules_path.is_dir():
            logger.error(f"Inclusive language rules directory not found: {rules_path}")
            raise ServiceError(
                status_code=500,
                detail=f"Inclusive language rules directory not found: {rules_path}"
            )

        rules = {}
        for yaml_file in rules_path.glob("*.yml"):
            try:
                with yaml_file.open(encoding="utf-8") as f:
                    rule_list = yaml.safe_load(f)

                if not isinstance(rule_list, list):
                    logger.warning(f"Skipping non-list rule file: {yaml_file}")
                    continue

                for rule in rule_list:
                    inconsiderate = rule.get("inconsiderate", [])
                    considerate = rule.get("considerate", [])
                    note = rule.get("note", "")
                    source = rule.get("source", "")
                    rule_type = rule.get("type", "basic")

                    # Ensure consistent formatting
                    if isinstance(considerate, str):
                        considerate = [considerate]
                    if isinstance(inconsiderate, str):
                        inconsiderate = [inconsiderate]

                    for phrase in inconsiderate:
                        rules[phrase.lower()] = {
                            "considerate": considerate,
                            "note": note,
                            "source": source,
                            "type": rule_type
                        }
            except Exception as e:
                logger.error(f"Error loading rule file {yaml_file}: {e}", exc_info=True)
                raise ServiceError(
                    status_code=500,
                    detail=f"Failed to load inclusive language rules: {e}"
                )

        logger.info(f"Loaded {len(rules)} inclusive language rules from {rules_path}")
        return rules

    def _get_nlp(self):
        """
        Lazy-loads the spaCy model for NLP processing.
        """
        if self._nlp is None:
            self._nlp = load_spacy_model(SPACY_MODEL_ID)
        return self._nlp

    def _init_matcher(self, nlp):
        """
        Initializes a spaCy PhraseMatcher from the loaded rules.
        """
        from spacy.matcher import PhraseMatcher

        # attr="LOWER" makes phrase matching case-insensitive.
        matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        for phrase in self.rules:
            matcher.add(phrase, [nlp.make_doc(phrase)])

        logger.info(f"PhraseMatcher initialized with {len(self.rules)} phrases.")
        return matcher

    async def check(self, text: str) -> dict:
        """
        Checks a string for non-inclusive language based on rule definitions.
        """
        text = text.strip()
        if not text:
            raise ServiceError(status_code=400, detail="Input text is empty for inclusive language check.")

        try:
            nlp = self._get_nlp()
            if self.matcher is None:
                self.matcher = self._init_matcher(nlp)

            doc = nlp(text)
            matches = self.matcher(doc)

            results = []
            matched_spans = set()

            # Match exact phrases
            for match_id, start, end in matches:
                phrase = nlp.vocab.strings[match_id].lower()
                if any(s <= start < e or s < end <= e for s, e in matched_spans):
                    continue  # Avoid overlapping matches
                matched_spans.add((start, end))

                rule = self.rules.get(phrase)
                if rule:
                    results.append({
                        "term": doc[start:end].text,
                        "type": rule["type"],
                        "note": rule["note"],
                        "suggestions": rule["considerate"],
                        "context": doc[start:end].sent.text,
                        "start_char": doc[start].idx,
                        "end_char": doc[end - 1].idx + len(doc[end - 1]),
                        "source": rule["source"]
                    })
            # Match individual token lemmas (fallback)
            for token in doc:
                lemma = token.lemma_.lower()
                if any(s <= token.i < e for s, e in matched_spans):
                    continue  # Token already covered by a phrase match
                if lemma in self.rules:
                    rule = self.rules[lemma]
                    results.append({
                        "term": token.text,
                        "type": rule["type"],
                        "note": rule["note"],
                        "suggestions": rule["considerate"],
                        "context": token.sent.text,
                        "start_char": token.idx,
                        "end_char": token.idx + len(token),
                        "source": rule["source"]
                    })

            return {"issues": results}

        except Exception as e:
            logger.error(f"Inclusive language check error for text: '{text[:50]}...'", exc_info=True)
            raise ServiceError(
                status_code=500,
                detail="An internal error occurred during inclusive language checking."
            ) from e
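

# --- Illustrative usage sketch (not part of the service API) ---
# Assumes the app's settings (INCLUSIVE_RULES_DIR) and the configured spaCy
# model are available in the running environment; the sample text is made up.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        checker = InclusiveLanguageChecker()
        report = await checker.check("Sample text to screen for flagged terms.")
        for issue in report["issues"]:
            print(f"{issue['term']} -> {issue['suggestions']} ({issue['note']})")

    asyncio.run(_demo())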