Spaces:
Build error
Build error
| # -*- coding: utf-8 -*- | |
| ''' | |
| @Author : Jiangjie Chen | |
| @Time : 2020/5/11 19:08 | |
| @Contact : [email protected] | |
| @Description: Entity linking client backed by either TagMe or a spaCy pipeline. | |
| ''' | |
| import os | |
| import tagme | |
def read_title_id(entity_def_path):
    """Load the id -> title mapping from a pipe-delimited definitions file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is expected to look like ``<title>|<id>``; the line
    is split on its *last* ``|`` so a title that itself contains ``|`` is
    kept intact.

    Args:
        entity_def_path: Path to the UTF-8 encoded definitions file.

    Returns:
        dict mapping each id (str) to its entity title (str). If an id
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank data line contains no ``|`` separator.
    """
    id_to_title = {}
    with open(entity_def_path, 'r', encoding='UTF-8') as f:
        # Drop the header row; the default keeps an empty file from raising.
        next(f, None)
        for line_no, raw_line in enumerate(f, start=2):
            line = raw_line.strip()
            if not line:
                # Blank (often trailing) lines carry no entity; skip them
                # instead of failing on tuple unpacking.
                continue
            title, sep, entity_id = line.rpartition('|')
            if not sep:
                raise ValueError(
                    f"{entity_def_path}:{line_no}: expected '<title>|<id>', "
                    f"got {line!r}"
                )
            id_to_title[entity_id] = title
    return id_to_title
class ELClient:
    """Entity-linking client that delegates to TagMe or to a spaCy pipeline.

    The backend is chosen once, at construction time, through ``link_type``:

    * ``'tagme'`` - annotations come from ``tagme.annotate``; the API token
      is read from the ``TAGME_APIKEY`` environment variable.
    * ``'spacy'`` - a pipeline is loaded with ``spacy.load(prefix)`` and the
      id -> title table is read from ``{prefix}/entity_defs.csv``.
    """

    def __init__(self, link_type, min_rho=0.1, prefix=None, verbose=False):
        """Configure the selected linking backend.

        Args:
            link_type: ``'tagme'`` or ``'spacy'``.
            min_rho: Value forwarded as ``min_rho`` to TagMe's
                ``get_annotations`` (only used when ``link_type='tagme'``).
            prefix: Directory holding the spaCy pipeline and
                ``entity_defs.csv`` (required when ``link_type='spacy'``).
            verbose: When true, progress messages are printed.

        Raises:
            KeyError: ``link_type='tagme'`` and ``TAGME_APIKEY`` is not set.
            ValueError: ``link_type='spacy'`` and ``prefix`` is ``None``.
            NotImplementedError: ``link_type`` is not a supported backend.
        """
        self.verbose = verbose
        self.link_type = link_type
        if link_type == 'tagme':
            self.min_rho = min_rho
            tagme.GCUBE_TOKEN = os.environ['TAGME_APIKEY']
        elif link_type == 'spacy':
            # Explicit check rather than ``assert`` so the validation still
            # runs when Python is started with -O.
            if prefix is None:
                raise ValueError("prefix is required when link_type='spacy'")
            self.init_spacy_linker(prefix)
        else:
            raise NotImplementedError(link_type)

    def init_spacy_linker(self, prefix):
        """Load the spaCy pipeline and the id -> title table found under ``prefix``."""
        entity_def_path = f"{prefix}/entity_defs.csv"
        self._print('* Loading entity linker...')
        # Imported here because the module does not import spaCy at the top
        # level; this also keeps spaCy optional for the TagMe backend.
        import spacy
        self.nlp = spacy.load(prefix)
        self.id2title = read_title_id(entity_def_path)
        self._print('* Entity linker loaded.')

    def _tagme_link(self, text):
        """Annotate ``text`` with TagMe.

        Returns:
            list of ``(mention, score, entity_id, entity_title)`` tuples,
            ordered by descending score. Empty when TagMe gives no response.
        """
        response = tagme.annotate(text, long_text=1)
        if response is None:
            # NOTE(review): tagme.annotate is understood to return None when
            # the HTTP request fails - confirm against the installed tagme
            # version. Treated here as "no entities found" instead of
            # crashing with AttributeError.
            self._print('* TagMe returned no response.')
            return []
        mentions = [
            (text[ann.begin:ann.end], ann.score, ann.entity_id, ann.entity_title)
            for ann in response.get_annotations(min_rho=self.min_rho)
        ]
        mentions.sort(key=lambda mention: mention[1], reverse=True)
        return mentions

    def link(self, text):
        """Link entities in ``text`` with the backend chosen at construction."""
        if self.link_type == 'tagme':
            return self._tagme_link(text)
        else:
            return self._spacy_link(text)

    def _spacy_link(self, text):
        """Run the spaCy pipeline on ``text``.

        Returns:
            list of ``(mention, label, kb_id, title)`` tuples for every
            entity whose ``kb_id_`` is not ``'NIL'``; ``title`` is ``''``
            when the id is missing from the id -> title table.
        """
        text = self._preprocess_text(text)
        doc = self.nlp(text)
        ents = [(e.text, e.label_, e.kb_id_, self.id2title.get(e.kb_id_, ''))
                for e in doc.ents if e.kb_id_ != 'NIL']
        return ents

    def _preprocess_text(self, text):
        """Normalise input: join list input with spaces, strip surrounding
        whitespace and turn the ``-lrb-`` / ``-rrb-`` tokens back into
        parentheses."""
        if isinstance(text, list):
            text = ' '.join(text)
        text = text.strip().replace('-lrb-', '(').replace('-rrb-', ')')
        return text

    def _print(self, x):
        """Print ``x`` only when the client was created with ``verbose=True``."""
        if self.verbose:
            print(x)
if __name__ == '__main__':
    # Smoke test: link one sentence through the TagMe backend and print the
    # resulting annotations (requires TAGME_APIKEY in the environment).
    demo_sentence = 'Jeff Dean wants to meet Yoshua Bengio.'
    linker = ELClient(link_type='tagme', verbose=True)
    print(linker.link(demo_sentence))