import logging
from pathlib import Path
from typing import Dict

import yaml

from app.core.config import settings, APP_NAME, SPACY_MODEL_ID
from app.core.exceptions import ServiceError
from app.services.base import load_spacy_model

logger = logging.getLogger(f"{APP_NAME}.services.inclusive_language")


class InclusiveLanguageChecker:
    def __init__(self, rules_directory: str = settings.INCLUSIVE_RULES_DIR):
        self._nlp = None
        self.matcher = None
        self.rules = self._load_inclusive_rules(Path(rules_directory))

    def _load_inclusive_rules(self, rules_path: Path) -> Dict[str, Dict]:
        """
        Load YAML-based inclusive language rules from the given directory.
        """
        if not rules_path.is_dir():
            logger.error(f"Inclusive language rules directory not found: {rules_path}")
            raise ServiceError(
                status_code=500,
                detail=f"Inclusive language rules directory not found: {rules_path}",
            )

        rules = {}
        for yaml_file in rules_path.glob("*.yml"):
            try:
                with yaml_file.open(encoding="utf-8") as f:
                    rule_list = yaml.safe_load(f)
                if not isinstance(rule_list, list):
                    logger.warning(f"Skipping non-list rule file: {yaml_file}")
                    continue
                for rule in rule_list:
                    inconsiderate = rule.get("inconsiderate", [])
                    considerate = rule.get("considerate", [])
                    note = rule.get("note", "")
                    source = rule.get("source", "")
                    rule_type = rule.get("type", "basic")

                    # Ensure consistent formatting
                    if isinstance(considerate, str):
                        considerate = [considerate]
                    if isinstance(inconsiderate, str):
                        inconsiderate = [inconsiderate]

                    for phrase in inconsiderate:
                        rules[phrase.lower()] = {
                            "considerate": considerate,
                            "note": note,
                            "source": source,
                            "type": rule_type,
                        }
            except Exception as e:
                logger.error(f"Error loading rule file {yaml_file}: {e}", exc_info=True)
                raise ServiceError(
                    status_code=500,
                    detail=f"Failed to load inclusive language rules: {e}",
                )

        logger.info(f"Loaded {len(rules)} inclusive language rules from {rules_path}")
        return rules

    def _get_nlp(self):
        """
        Lazy-loads the spaCy model for NLP processing.
        """
        if self._nlp is None:
            self._nlp = load_spacy_model(SPACY_MODEL_ID)
        return self._nlp

    def _init_matcher(self, nlp):
        """
        Initializes spaCy PhraseMatcher using loaded rules.
        """
        from spacy.matcher import PhraseMatcher

        matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        for phrase in self.rules:
            matcher.add(phrase, [nlp.make_doc(phrase)])
        logger.info(f"PhraseMatcher initialized with {len(self.rules)} phrases.")
        return matcher

    async def check(self, text: str) -> dict:
        """
        Checks a string for non-inclusive language based on rule definitions.
        """
""" text = text.strip() if not text: raise ServiceError(status_code=400, detail="Input text is empty for inclusive language check.") try: nlp = self._get_nlp() if self.matcher is None: self.matcher = self._init_matcher(nlp) doc = nlp(text) matches = self.matcher(doc) results = [] matched_spans = set() # Match exact phrases for match_id, start, end in matches: phrase = nlp.vocab.strings[match_id].lower() if any(s <= start < e or s < end <= e for s, e in matched_spans): continue # Avoid overlapping matches matched_spans.add((start, end)) rule = self.rules.get(phrase) if rule: results.append({ "term": doc[start:end].text, "type": rule["type"], "note": rule["note"], "suggestions": rule["considerate"], "context": doc[start:end].sent.text, "start_char": doc[start].idx, "end_char": doc[end - 1].idx + len(doc[end - 1]), "source": rule["source"] }) # Match individual token lemmas (fallback) for token in doc: lemma = token.lemma_.lower() if (token.i, token.i + 1) in matched_spans: continue # Already matched in phrase if lemma in self.rules: rule = self.rules[lemma] results.append({ "term": token.text, "type": rule["type"], "note": rule["note"], "suggestions": rule["considerate"], "context": token.sent.text, "start_char": token.idx, "end_char": token.idx + len(token), "source": rule["source"] }) return {"issues": results} except Exception as e: logger.error(f"Inclusive language check error for text: '{text[:50]}...'", exc_info=True) raise ServiceError( status_code=500, detail="An internal error occurred during inclusive language checking." ) from e