Spaces:
Sleeping
Sleeping
| import spacy | |
| import pickle | |
| from nltk.corpus import wordnet | |
def load_spacy_values(filepath_model_spacy='model_spacy_synonyms', filepath_docs_spacy='dict_spacy_object.pkl'):
    """
    Load the spaCy pipeline and rebuild the serialized spaCy documents.

    Parameters:
        filepath_model_spacy: location passed to ``spacy.load``.
        filepath_docs_spacy: path of a pickle file holding a mapping whose
            values are byte strings accepted by ``Doc.from_bytes``.

    Returns:
        A ``(nlp, dict_docs_spacy)`` tuple: the loaded pipeline and a dict
        with the same keys as the pickled mapping, each value rebuilt as a
        ``Doc`` that shares the pipeline's vocabulary.

    NOTE: ``pickle.load`` can execute arbitrary code, so only point this at
    a file you trust.
    """
    pipeline = spacy.load(filepath_model_spacy)

    with open(filepath_docs_spacy, 'rb') as handle:
        serialized_docs = pickle.load(handle)

    # Each Doc has to be created on the pipeline's vocab before its bytes
    # are loaded into it.
    restored_docs = {}
    for name, payload in serialized_docs.items():
        blank_doc = spacy.tokens.Doc(pipeline.vocab)
        restored_docs[name] = blank_doc.from_bytes(payload)

    return pipeline, restored_docs
def find_antonyms(word):
    """
    Collect the WordNet antonyms of a word.

    Every synset returned by ``wordnet.synsets(word)`` is visited, and for
    each of its lemmas the names of all antonym lemmas are gathered.

    Parameters:
        word: the word to look up in WordNet.

    Returns:
        A set with the names of the antonym lemmas; empty when WordNet lists
        no antonym (or no synset) for the word.
    """
    antonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            # A lemma can have more than one antonym: keep them all instead
            # of only the first, and query WordNet once per lemma.
            antonyms.update(antonym.name() for antonym in lemma.antonyms())
    return antonyms
def find_synonyms(word, model, dict_embedding, dict_2000_tokens):
    """
    Return the candidate token most similar to a word, excluding its antonyms.

    Parameters:
        word: the word to find a replacement for.
        model: callable applied to ``word``; its result is handed to the
            ``similarity`` method of the stored embeddings (presumably a
            spaCy pipeline producing a ``Doc`` - confirm against the caller).
        dict_embedding: mapping from token to an object exposing
            ``similarity(other)``.
        dict_2000_tokens: iterable of candidate tokens.

    Returns:
        The candidate with the highest similarity to ``word``. When several
        candidates tie, the first one in ``dict_2000_tokens`` wins.

    Raises:
        IndexError: if no candidate can be scored (no candidates at all, or
            every candidate is an antonym or lacks an embedding). The
            exception type is kept from the previous implementation.
    """
    antonyms = find_antonyms(word)
    word_embedding = model(word)

    best_token = None
    best_similarity = None
    for token in dict_2000_tokens:
        if token in antonyms:
            continue
        token_embedding = dict_embedding.get(token)
        if token_embedding is None:
            # Without a stored embedding the token cannot be scored; skip it
            # rather than failing on ``None.similarity``.
            continue
        similarity = token_embedding.similarity(word_embedding)
        # Strict comparison keeps the earliest candidate on ties.
        if best_similarity is None or similarity > best_similarity:
            best_token = token
            best_similarity = similarity

    if best_similarity is None:
        raise IndexError(f"no candidate token with an embedding is available for {word!r}")
    return best_token