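"""Context-aware synonym suggestion service.

For each content word in the input text, WordNet provides candidate
synonyms; each candidate is spliced into the sentence, the modified
sentence is re-embedded, and only candidates whose sentence embedding
stays close (cosine similarity) to the original text's embedding are kept.
"""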

import logging
import asyncio
from typing import List, Dict
from functools import lru_cache

from app.services.base import (
    load_spacy_model,
    load_sentence_transformer_model,
    ensure_nltk_resource,
)
from app.core.config import (
    settings,
    APP_NAME,
    SPACY_MODEL_ID,
    WORDNET_NLTK_ID,
    SENTENCE_TRANSFORMER_MODEL_ID,
)
from app.core.exceptions import ServiceError, ModelNotDownloadedError
from nltk.corpus import wordnet
from sentence_transformers.util import cos_sim

logger = logging.getLogger(f"{APP_NAME}.services.synonyms")

# Map spaCy coarse-grained POS tags to their WordNet equivalents.
SPACY_TO_WORDNET_POS = {
    "NOUN": wordnet.NOUN,
    "VERB": wordnet.VERB,
    "ADJ": wordnet.ADJ,
    "ADV": wordnet.ADV,
}
# Only content words are considered for synonym replacement.
CONTENT_POS_TAGS = {"NOUN", "VERB", "ADJ", "ADV"}


class SynonymSuggester:
    """Suggests synonyms by re-embedding candidate sentences and comparing them to the original."""

    def __init__(self):
        # Models are loaded lazily so importing this module stays cheap.
        self._sentence_model = None
        self._nlp = None

    def _get_sentence_model(self):
        if self._sentence_model is None:
            self._sentence_model = load_sentence_transformer_model(
                SENTENCE_TRANSFORMER_MODEL_ID
            )
        return self._sentence_model

    def _get_nlp(self):
        if self._nlp is None:
            self._nlp = load_spacy_model(SPACY_MODEL_ID)
        return self._nlp

    async def suggest(self, text: str) -> dict:
        try:
            text = text.strip()
            if not text:
                raise ServiceError(status_code=400, detail="Input text is empty for synonym suggestion.")
            sentence_model = self._get_sentence_model()
            nlp = self._get_nlp()
            # Resource checks and heavy model calls run in worker threads so
            # the event loop is never blocked.
            await asyncio.to_thread(ensure_nltk_resource, WORDNET_NLTK_ID)
            doc = await asyncio.to_thread(nlp, text)
            all_suggestions: Dict[str, List[str]] = {}
            original_text_embedding = await asyncio.to_thread(
                sentence_model.encode, text,
                convert_to_tensor=True,
                normalize_embeddings=True
            )
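            # Stage 1: for every eligible token, collect WordNet candidates
            # and build one perturbed sentence per candidate.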
            candidate_data = []
            for token in doc:
                # Consider only content words longer than two characters.
                if (
                    token.pos_ in CONTENT_POS_TAGS
                    and len(token.text.strip()) > 2
                    and not token.is_punct
                    and not token.is_space
                ):
                    original_word = token.text
                    word_start = token.idx
                    word_end = token.idx + len(original_word)
                    wordnet_pos = SPACY_TO_WORDNET_POS.get(token.pos_)
                    if not wordnet_pos:
                        continue
                    wordnet_candidates = await asyncio.to_thread(
                        self._get_wordnet_synonyms_cached, original_word, wordnet_pos
                    )
                    if not wordnet_candidates:
                        continue
                    if original_word not in all_suggestions:
                        all_suggestions[original_word] = []
                    for candidate in wordnet_candidates:
                        # Splice the candidate into the original text so the
                        # whole sentence can be re-embedded for comparison.
                        temp_sentence = text[:word_start] + candidate + text[word_end:]
                        candidate_data.append({
                            "original_word": original_word,
                            "wordnet_candidate": candidate,
                            "temp_sentence": temp_sentence,
                        })
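            # Stage 2: embed all perturbed sentences in one batched pass and
            # score each against the original text.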
            if not candidate_data:
                return {"suggestions": {}}
            all_candidate_sentences = [c["temp_sentence"] for c in candidate_data]
            all_candidate_embeddings = await asyncio.to_thread(
                sentence_model.encode,
                all_candidate_sentences,
                batch_size=settings.SENTENCE_TRANSFORMER_BATCH_SIZE,
                convert_to_tensor=True,
                normalize_embeddings=True
            )
            if original_text_embedding.dim() == 1:
                original_text_embedding = original_text_embedding.unsqueeze(0)
            cosine_scores = cos_sim(original_text_embedding, all_candidate_embeddings)[0]
            similarity_threshold = 0.65
            top_n = 5
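            # Stage 3: filter candidates by the similarity threshold and keep
            # the top_n unique suggestions per word.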
            temp_scored: Dict[str, List[tuple]] = {word: [] for word in all_suggestions}
            for i, data in enumerate(candidate_data):
                word = data["original_word"]
                candidate = data["wordnet_candidate"]
                score = cosine_scores[i].item()
                # Keep candidates that preserve sentence meaning and are not
                # merely a different casing of the original word.
                if score >= similarity_threshold and candidate.lower() != word.lower():
                    temp_scored[word].append((score, candidate))
            final_suggestions = {}
            for word, scored in temp_scored.items():
                if scored:
                    sorted_unique = []
                    seen = set()
                    for score, candidate in sorted(scored, key=lambda x: x[0], reverse=True):
                        if candidate not in seen:
                            sorted_unique.append(candidate)
                            seen.add(candidate)
                        if len(sorted_unique) >= top_n:
                            break
                    final_suggestions[word] = sorted_unique
            return {"suggestions": final_suggestions}
        except ServiceError:
            # Re-raise intentional client errors (e.g. empty input) instead of
            # masking them as internal server errors below.
            raise
        except Exception as e:
            logger.error(f"Synonym suggestion error for text: '{text[:50]}...'", exc_info=True)
            raise ServiceError(status_code=500, detail="An internal error occurred during synonym suggestion.") from e

    @lru_cache(maxsize=5000)
    def _get_wordnet_synonyms_cached(self, word: str, pos: str) -> List[str]:
        # Note: lru_cache on an instance method includes `self` in the cache
        # key and keeps the instance alive for the lifetime of the cache.
        synonyms = set()
        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                name = lemma.name().replace("_", " ").lower()
                # Keep single-word, purely alphabetic lemmas only; multi-word
                # lemmas fail isalpha() after the underscore replacement.
                if name.isalpha() and len(name) > 1:
                    synonyms.add(name)
        synonyms.discard(word.lower())
        return sorted(synonyms)
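

# ---------------------------------------------------------------------------
# Illustrative usage: a minimal sketch, not part of the service itself. It
# assumes the `app` package is importable and that the models referenced in
# app.core.config have already been downloaded; otherwise the loaders may
# raise ModelNotDownloadedError.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo() -> None:
        suggester = SynonymSuggester()
        result = await suggester.suggest("The quick brown fox jumps over the lazy dog.")
        for word, synonyms in result["suggestions"].items():
            print(f"{word}: {', '.join(synonyms)}")

    asyncio.run(_demo())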