Update mcp/nlp.py
Browse files- mcp/nlp.py +11 -19
mcp/nlp.py
CHANGED
@@ -1,38 +1,30 @@
|
|
1 |
# mcp/nlp.py
|
2 |
import spacy
|
3 |
-
import scispacy
|
4 |
from scispacy.linking import EntityLinker
|
5 |
|
6 |
-
# Load a powerful biomedical model + UMLS linker
|
7 |
def load_model():
    """
    Build the biomedical spaCy pipeline with a UMLS entity linker attached.

    Loads the ``en_core_sci_scibert`` model and appends scispacy's
    ``scispacy_linker`` component, configured for the UMLS knowledge base
    with abbreviation resolution enabled and a 0.75 score threshold.

    Returns:
        The loaded spaCy pipeline with the linker added as its last component.
    """
    # NOTE: the former ``@spacy.util.cache_dir(...)`` decorator was dropped;
    # ``spacy.util`` exposes no such attribute, so the module failed on import.
    # NOTE(review): scispacy reads its cache location from the SCISPACY_CACHE
    # environment variable -- set that if "~/.cache/scispacy" is required.
    nlp = spacy.load("en_core_sci_scibert")
    # Resolve abbreviations then link to UMLS.
    # spaCy v3 only accepts a registered factory name here, not a component
    # instance. The factory is registered as a side effect of the file-level
    # ``from scispacy.linking import EntityLinker`` import, so keep that import.
    # Using the default component name keeps ``get_pipe("scispacy_linker")``
    # lookups elsewhere in this module working.
    nlp.add_pipe(
        "scispacy_linker",
        config={
            "linker_name": "umls",
            "resolve_abbreviations": True,
            "threshold": 0.75,
        },
    )
    return nlp
|
14 |
|
15 |
# Shared pipeline instance: built once when this module is imported and reused
# by every call to extract_umls_concepts.
nlp = load_model()
|
16 |
|
17 |
-
def extract_umls_concepts(text: str):
|
18 |
"""
|
19 |
-
Returns
|
20 |
"""
|
21 |
doc = nlp(text)
|
22 |
-
|
23 |
for ent in doc.ents:
|
24 |
for cui, score in ent._.umls_ents:
|
25 |
meta = nlp.get_pipe("scispacy_linker").kb.cui_to_entity[cui]
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
for c in concepts:
|
35 |
-
prev = seen.get(c["cui"])
|
36 |
-
if not prev or c["score"] > prev["score"]:
|
37 |
-
seen[c["cui"]] = c
|
38 |
-
return list(seen.values())
|
|
|
1 |
# mcp/nlp.py
|
2 |
import spacy
|
|
|
3 |
from scispacy.linking import EntityLinker
|
4 |
|
|
|
5 |
def load_model():
    """
    Build the biomedical spaCy pipeline with a UMLS entity linker attached.

    Loads the ``en_core_sci_scibert`` model and appends scispacy's
    ``scispacy_linker`` component, configured for the UMLS knowledge base
    with abbreviation resolution enabled and a 0.75 score threshold.

    Returns:
        The loaded spaCy pipeline with the linker added as its last component.
    """
    # NOTE: the former ``@spacy.util.cache_dir(...)`` decorator was dropped;
    # ``spacy.util`` exposes no such attribute, so the module failed on import.
    # NOTE(review): scispacy reads its cache location from the SCISPACY_CACHE
    # environment variable -- set that if "~/.cache/scispacy" is required.
    nlp = spacy.load("en_core_sci_scibert")
    # spaCy v3 only accepts a registered factory name here, not a component
    # instance. The factory is registered as a side effect of the file-level
    # ``from scispacy.linking import EntityLinker`` import, so keep that import.
    # Using the default component name keeps ``get_pipe("scispacy_linker")``
    # lookups elsewhere in this module working.
    nlp.add_pipe(
        "scispacy_linker",
        config={
            "linker_name": "umls",
            "resolve_abbreviations": True,
            "threshold": 0.75,
        },
    )
    return nlp
|
11 |
|
12 |
# Shared pipeline instance: built once when this module is imported and reused
# by every call to extract_umls_concepts.
nlp = load_model()
|
13 |
|
14 |
+
def extract_umls_concepts(text: str) -> list[dict]:
    """
    Return the unique UMLS concepts linked to entities found in ``text``.

    Runs the module-level ``nlp`` pipeline over ``text`` and collects every
    (CUI, score) candidate attached to each detected entity. When the same
    CUI is proposed more than once, only the highest-scoring occurrence is
    kept (the first one wins on ties).

    Args:
        text: Raw text to analyse.

    Returns:
        One dict per distinct CUI, in order of first appearance, with keys:
        ``"cui"`` (the concept identifier), ``"name"`` (the knowledge-base
        canonical name), ``"score"`` (the linker score as a Python float) and
        ``"types"`` (a copy of the concept's semantic types as stored in the
        knowledge base).
    """
    doc = nlp(text)
    # Loop-invariant: resolve the linker's CUI -> entity table once per call
    # instead of once per candidate.
    cui_to_entity = nlp.get_pipe("scispacy_linker").kb.cui_to_entity
    best = {}
    for ent in doc.ents:
        for cui, score in ent._.umls_ents:
            previous = best.get(cui)
            if previous is None or score > previous["score"]:
                # Only touch the knowledge base for candidates we actually keep.
                meta = cui_to_entity[cui]
                best[cui] = {
                    "cui": cui,
                    "name": meta.canonical_name,
                    "score": float(score),
                    # Copy so callers cannot mutate the knowledge base's list.
                    "types": list(meta.types),
                }
    return list(best.values())
|
|
|
|
|
|
|
|
|
|