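"""Synonym suggestion service.

For each content word in the input, gather WordNet candidates, substitute
each candidate into the sentence, re-embed the result with a
sentence-transformer model, and keep the candidates whose sentence stays
most similar to the original.
"""
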
import logging
import asyncio
from typing import List, Dict
from functools import lru_cache

from app.services.base import (
    load_spacy_model,
    load_sentence_transformer_model,
    ensure_nltk_resource
)
from app.core.config import (
    settings,
    APP_NAME,
    SPACY_MODEL_ID,
    WORDNET_NLTK_ID,
    SENTENCE_TRANSFORMER_MODEL_ID
)
from app.core.exceptions import ServiceError, ModelNotDownloadedError

from nltk.corpus import wordnet
from sentence_transformers.util import cos_sim

logger = logging.getLogger(f"{APP_NAME}.services.synonyms")

# Map spaCy coarse POS tags to the corresponding WordNet POS constants.
SPACY_TO_WORDNET_POS = {
    "NOUN": wordnet.NOUN,
    "VERB": wordnet.VERB,
    "ADJ": wordnet.ADJ,
    "ADV": wordnet.ADV,
}

# Only content words are considered for substitution.
CONTENT_POS_TAGS = {"NOUN", "VERB", "ADJ", "ADV"}


class SynonymSuggester:
    """Suggests in-context synonyms for the content words of a text."""

    def __init__(self):
        # Models are loaded lazily on first use so importing this module stays cheap.
        self._sentence_model = None
        self._nlp = None

    def _get_sentence_model(self):
        if self._sentence_model is None:
            self._sentence_model = load_sentence_transformer_model(
                SENTENCE_TRANSFORMER_MODEL_ID
            )
        return self._sentence_model

    def _get_nlp(self):
        if self._nlp is None:
            self._nlp = load_spacy_model(
                SPACY_MODEL_ID
            )
        return self._nlp

    async def suggest(self, text: str) -> dict:
        """Return ``{"suggestions": {word: [synonyms, ...]}}`` for the given text.

        Blocking work (model inference, WordNet lookups) is pushed to worker
        threads via asyncio.to_thread so the event loop stays responsive.
        """
        try:
            text = text.strip()
            if not text:
                raise ServiceError(status_code=400, detail="Input text is empty for synonym suggestion.")

            sentence_model = self._get_sentence_model()
            nlp = self._get_nlp()
            await asyncio.to_thread(ensure_nltk_resource, WORDNET_NLTK_ID)

            doc = await asyncio.to_thread(nlp, text)
            all_suggestions: Dict[str, List[str]] = {}

            original_text_embedding = await asyncio.to_thread(
                sentence_model.encode, text,
                convert_to_tensor=True,
                normalize_embeddings=True
            )

            candidate_data = []

            for token in doc:
                # Consider only content words longer than two characters,
                # skipping punctuation and whitespace tokens.
                if (
                    token.pos_ in CONTENT_POS_TAGS
                    and len(token.text.strip()) > 2
                    and not token.is_punct
                    and not token.is_space
                ):
                    original_word = token.text
                    word_start = token.idx
                    word_end = token.idx + len(original_word)
                    wordnet_pos = SPACY_TO_WORDNET_POS.get(token.pos_)
                    if not wordnet_pos:
                        continue

                    wordnet_candidates = await asyncio.to_thread(
                        _get_wordnet_synonyms_cached, original_word, wordnet_pos
                    )
                    if not wordnet_candidates:
                        continue

                    if original_word not in all_suggestions:
                        all_suggestions[original_word] = []

                    # Substitute each candidate into the full sentence so its
                    # embedding reflects the candidate in context.
                    for candidate in wordnet_candidates:
                        temp_sentence = text[:word_start] + candidate + text[word_end:]
                        candidate_data.append({
                            "original_word": original_word,
                            "wordnet_candidate": candidate,
                            "temp_sentence": temp_sentence,
                        })

            if not candidate_data:
                return {"suggestions": {}}

            # Encode all candidate sentences in a single batched call rather
            # than one at a time.
            all_candidate_sentences = [c["temp_sentence"] for c in candidate_data]
            all_candidate_embeddings = await asyncio.to_thread(
                sentence_model.encode,
                all_candidate_sentences,
                batch_size=settings.SENTENCE_TRANSFORMER_BATCH_SIZE,
                convert_to_tensor=True,
                normalize_embeddings=True
            )

            # cos_sim expects 2-D inputs: (1, dim) x (n, dim) -> (1, n).
            if original_text_embedding.dim() == 1:
                original_text_embedding = original_text_embedding.unsqueeze(0)

            cosine_scores = cos_sim(original_text_embedding, all_candidate_embeddings)[0]

            # Keep candidates whose whole-sentence similarity clears the
            # threshold, then report at most top_n per word.
            similarity_threshold = 0.65
            top_n = 5
            temp_scored: Dict[str, List[tuple]] = {word: [] for word in all_suggestions}

            for i, data in enumerate(candidate_data):
                word = data["original_word"]
                candidate = data["wordnet_candidate"]
                score = cosine_scores[i].item()
                if score >= similarity_threshold and candidate.lower() != word.lower():
                    temp_scored[word].append((score, candidate))

            final_suggestions = {}
            for word, scored in temp_scored.items():
                if scored:
                    # Sort by similarity (descending) and deduplicate, keeping
                    # at most top_n candidates per word.
                    sorted_unique = []
                    seen = set()
                    for score, candidate in sorted(scored, key=lambda x: x[0], reverse=True):
                        if candidate not in seen:
                            sorted_unique.append(candidate)
                            seen.add(candidate)
                        if len(sorted_unique) >= top_n:
                            break
                    final_suggestions[word] = sorted_unique

            return {"suggestions": final_suggestions}

        except (ServiceError, ModelNotDownloadedError):
            # Propagate deliberate errors (e.g. the 400 for empty input or a
            # missing model asset) instead of re-wrapping them as a 500.
            raise
        except Exception as e:
            logger.error(f"Synonym suggestion error for text: '{text[:50]}...'", exc_info=True)
            raise ServiceError(status_code=500, detail="An internal error occurred during synonym suggestion.") from e


@lru_cache(maxsize=5000)
def _get_wordnet_synonyms_cached(word: str, pos: str) -> List[str]:
    """Collect single-word, alphabetic WordNet lemmas for (word, pos).

    Module-level rather than a method so that lru_cache does not key on
    (and pin in memory) SynonymSuggester instances via ``self``.
    """
    synonyms = set()
    for syn in wordnet.synsets(word, pos=pos):
        for lemma in syn.lemmas():
            name = lemma.name().replace("_", " ").lower()
            if name.isalpha() and len(name) > 1:
                synonyms.add(name)
    synonyms.discard(word.lower())
    return sorted(synonyms)
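

# Example usage (illustrative; assumes the ``app.*`` modules are importable
# and that the spaCy, sentence-transformer, and WordNet assets have already
# been downloaded via the loaders in app.services.base):
#
#     import asyncio
#
#     suggester = SynonymSuggester()
#     result = asyncio.run(
#         suggester.suggest("The quick brown fox jumps over the lazy dog.")
#     )
#     print(result["suggestions"])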