File size: 5,944 Bytes
73a6a7e
71192d1
 
 
 
73a6a7e
 
 
 
 
 
ce2ce69
71192d1
73a6a7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71192d1
 
73a6a7e
71192d1
73a6a7e
71192d1
73a6a7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71192d1
73a6a7e
 
 
 
 
 
 
71192d1
 
73a6a7e
 
 
 
 
 
 
 
 
 
 
 
ce2ce69
73a6a7e
 
71192d1
73a6a7e
 
 
ce2ce69
 
73a6a7e
 
 
 
 
 
 
 
ce2ce69
73a6a7e
 
 
71192d1
ce2ce69
 
 
 
71192d1
73a6a7e
ce2ce69
73a6a7e
 
 
 
ce2ce69
73a6a7e
ce2ce69
 
 
 
 
 
 
73a6a7e
 
 
ce2ce69
71192d1
73a6a7e
ce2ce69
 
73a6a7e
 
 
 
ce2ce69
 
 
 
 
 
 
73a6a7e
 
 
ce2ce69
71192d1
73a6a7e
71192d1
ce2ce69
73a6a7e
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import logging
from pathlib import Path
from typing import Dict, List, Optional

import yaml

from app.core.config import APP_NAME, SPACY_MODEL_ID, settings
from app.core.exceptions import ServiceError
from app.services.base import load_spacy_model

logger = logging.getLogger(f"{APP_NAME}.services.inclusive_language")


class InclusiveLanguageChecker:
    """
    Detects non-inclusive language in text using YAML rule files and spaCy.

    Each rule maps one or more "inconsiderate" phrases to suggested
    "considerate" alternatives, plus an explanatory note and a source.
    Matching is two-pass: exact (case-insensitive) phrase matches via
    spaCy's PhraseMatcher, then a per-token lemma fallback for inflected
    single-word terms the phrase matcher missed.
    """

    def __init__(self, rules_directory: Optional[str] = None):
        """
        :param rules_directory: Directory containing ``*.yml`` rule files.
            Defaults to ``settings.INCLUSIVE_RULES_DIR``. Resolved at call
            time rather than import time so configuration changes made
            after module import are honored.
        :raises ServiceError: If the rules directory or a rule file is invalid.
        """
        if rules_directory is None:
            rules_directory = settings.INCLUSIVE_RULES_DIR
        self._nlp = None       # spaCy pipeline, lazily loaded on first check()
        self.matcher = None    # PhraseMatcher, built on first check()
        self.rules = self._load_inclusive_rules(Path(rules_directory))

    def _load_inclusive_rules(self, rules_path: Path) -> Dict[str, Dict]:
        """
        Load YAML-based inclusive language rules from the given directory.

        :param rules_path: Directory scanned (non-recursively) for ``*.yml`` files.
        :returns: Mapping of lower-cased inconsiderate phrase -> rule metadata
            (``considerate``, ``note``, ``source``, ``type``).
        :raises ServiceError: 500 if the directory is missing or a file fails to parse.
        """
        if not rules_path.is_dir():
            logger.error(f"Inclusive language rules directory not found: {rules_path}")
            raise ServiceError(
                status_code=500,
                detail=f"Inclusive language rules directory not found: {rules_path}"
            )

        rules = {}
        for yaml_file in rules_path.glob("*.yml"):
            try:
                with yaml_file.open(encoding="utf-8") as f:
                    rule_list = yaml.safe_load(f)

                # Each rule file must be a top-level YAML list of rule mappings.
                if not isinstance(rule_list, list):
                    logger.warning(f"Skipping non-list rule file: {yaml_file}")
                    continue

                for rule in rule_list:
                    inconsiderate = rule.get("inconsiderate", [])
                    considerate = rule.get("considerate", [])
                    note = rule.get("note", "")
                    source = rule.get("source", "")
                    rule_type = rule.get("type", "basic")

                    # Normalize scalar entries to lists so downstream code
                    # can always iterate.
                    if isinstance(considerate, str):
                        considerate = [considerate]
                    if isinstance(inconsiderate, str):
                        inconsiderate = [inconsiderate]

                    # Key by lower-cased phrase; lookups in check() are
                    # lower-cased to match.
                    for phrase in inconsiderate:
                        rules[phrase.lower()] = {
                            "considerate": considerate,
                            "note": note,
                            "source": source,
                            "type": rule_type
                        }

            except Exception as e:
                logger.error(f"Error loading rule file {yaml_file}: {e}", exc_info=True)
                raise ServiceError(
                    status_code=500,
                    detail=f"Failed to load inclusive language rules: {e}"
                )

        logger.info(f"Loaded {len(rules)} inclusive language rules from {rules_path}")
        return rules

    def _get_nlp(self):
        """
        Lazy-load and cache the spaCy model so import stays cheap.
        """
        if self._nlp is None:
            self._nlp = load_spacy_model(SPACY_MODEL_ID)
        return self._nlp

    def _init_matcher(self, nlp):
        """
        Build a case-insensitive spaCy PhraseMatcher from the loaded rules.

        :param nlp: The spaCy pipeline used to tokenize each rule phrase.
        """
        from spacy.matcher import PhraseMatcher

        # attr="LOWER" makes matching case-insensitive against the doc.
        matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        for phrase in self.rules:
            matcher.add(phrase, [nlp.make_doc(phrase)])

        logger.info(f"PhraseMatcher initialized with {len(self.rules)} phrases.")
        return matcher

    async def check(self, text: str) -> dict:
        """
        Check a string for non-inclusive language based on rule definitions.

        :param text: Input text; leading/trailing whitespace is stripped.
        :returns: ``{"issues": [...]}`` where each issue carries the matched
            term, rule type/note/suggestions/source, sentence context, and
            character offsets into the stripped text.
        :raises ServiceError: 400 for empty input, 500 on internal failure.
        """
        text = text.strip()
        if not text:
            raise ServiceError(status_code=400, detail="Input text is empty for inclusive language check.")

        try:
            nlp = self._get_nlp()
            if self.matcher is None:
                self.matcher = self._init_matcher(nlp)

            doc = nlp(text)
            matches = self.matcher(doc)
            results = []
            matched_spans = set()

            # Pass 1: exact phrase matches.
            for match_id, start, end in matches:
                # The match_id string is the rule key used in matcher.add().
                phrase = nlp.vocab.strings[match_id].lower()
                if any(s <= start < e or s < end <= e for s, e in matched_spans):
                    continue  # Avoid overlapping matches

                matched_spans.add((start, end))
                rule = self.rules.get(phrase)
                if rule:
                    results.append({
                        "term": doc[start:end].text,
                        "type": rule["type"],
                        "note": rule["note"],
                        "suggestions": rule["considerate"],
                        "context": doc[start:end].sent.text,
                        "start_char": doc[start].idx,
                        "end_char": doc[end - 1].idx + len(doc[end - 1]),
                        "source": rule["source"]
                    })

            # Pass 2: per-token lemma fallback for inflected forms.
            for token in doc:
                lemma = token.lemma_.lower()
                # Skip any token already covered by a phrase match — including
                # tokens *inside* a multi-token match, which the previous
                # exact-span check `(token.i, token.i + 1) in matched_spans`
                # would have double-reported.
                if any(s <= token.i < e for s, e in matched_spans):
                    continue  # Already matched in phrase

                if lemma in self.rules:
                    rule = self.rules[lemma]
                    results.append({
                        "term": token.text,
                        "type": rule["type"],
                        "note": rule["note"],
                        "suggestions": rule["considerate"],
                        "context": token.sent.text,
                        "start_char": token.idx,
                        "end_char": token.idx + len(token),
                        "source": rule["source"]
                    })

            return {"issues": results}

        except ServiceError:
            # Re-raise deliberate service errors (e.g. from model loading)
            # unchanged instead of masking their status/detail as a generic 500.
            raise
        except Exception as e:
            logger.error(f"Inclusive language check error for text: '{text[:50]}...'", exc_info=True)
            raise ServiceError(
                status_code=500,
                detail="An internal error occurred during inclusive language checking."
            ) from e