Update mcp/nlp.py
Browse files- mcp/nlp.py +35 -52
mcp/nlp.py
CHANGED
@@ -1,55 +1,38 @@
|
|
1 |
# mcp/nlp.py
|
2 |
-
|
3 |
-
#!/usr/bin/env python3
|
4 |
-
"""MedGenesis – spaCy helper for lightweight keyword extraction.
|
5 |
-
|
6 |
-
Features
|
7 |
-
~~~~~~~~
|
8 |
-
* Lazy‑loads **`en_core_web_sm`** at first call; cached thereafter.
|
9 |
-
* If model missing, raises actionable RuntimeError — Dockerfile must
|
10 |
-
install via `python -m spacy download en_core_web_sm` (already in Dockerfile).
|
11 |
-
* `extract_keywords` returns **unique named‑entity strings** (>2 chars)
|
12 |
-
stripped of whitespace, preserving original casing.
|
13 |
-
* Adds fallback to simple noun‑chunk extraction when no entities found –
|
14 |
-
helps very short abstracts.
|
15 |
-
"""
|
16 |
-
from __future__ import annotations
|
17 |
-
|
18 |
import spacy
|
19 |
-
|
20 |
-
from
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
# ---------------------------------------------------------------------
|
40 |
-
# Public API
|
41 |
-
# ---------------------------------------------------------------------
|
42 |
-
|
43 |
-
def extract_keywords(text: str, *, min_len: int = 3) -> List[str]:
|
44 |
-
"""Return de‑duplicated entity keywords (fallback noun chunks)."""
|
45 |
-
nlp = _load_model()
|
46 |
doc = nlp(text)
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# mcp/nlp.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from functools import lru_cache

import spacy

import scispacy
from scispacy.linking import EntityLinker
|
5 |
+
|
6 |
+
# ---------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------

@lru_cache(maxsize=1)
def load_model():
    """Load the scispaCy biomedical pipeline with a UMLS entity linker.

    Cached with ``lru_cache`` so the (expensive) model load happens once
    per process; every call returns the same pipeline instance.
    (The original ``@spacy.util.cache_dir(...)`` is not a spaCy API and
    is not a decorator — it crashed at import time.)

    Returns:
        The loaded ``en_core_sci_scibert`` pipeline with the
        ``scispacy_linker`` component appended.

    Raises:
        OSError: if the ``en_core_sci_scibert`` model package is not
            installed in the environment.
    """
    nlp = spacy.load("en_core_sci_scibert")
    # spaCy v3 registers pipeline components by name and configures them
    # via `config=` — passing an EntityLinker instance to add_pipe raises.
    # Registering as "scispacy_linker" also matches the get_pipe() lookup
    # used by extract_umls_concepts.
    nlp.add_pipe(
        "scispacy_linker",
        config={
            # Resolve abbreviations before linking, then link to UMLS.
            "resolve_abbreviations": True,
            "linker_name": "umls",
            "threshold": 0.75,
        },
    )
    return nlp
|
14 |
+
|
15 |
+
# Module-level singleton: the pipeline is loaded eagerly at import time.
# NOTE(review): this makes importing the module expensive (model download
# / load happens on first import) — confirm that is intended.
nlp = load_model()
|
16 |
+
|
17 |
+
def extract_umls_concepts(text: str) -> list[dict]:
    """Extract linked UMLS concepts from *text*.

    Runs the module-level pipeline over *text* and collects every UMLS
    candidate attached to each named entity by the scispaCy linker.

    Args:
        text: free text (e.g. an abstract) to analyse.

    Returns:
        A list of ``{"cui", "name", "score", "types"}`` dicts — one per
        unique CUI, keeping the highest-scoring mention of each.
    """
    doc = nlp(text)
    # Loop-invariant: fetch the linker component once, not per candidate.
    linker = nlp.get_pipe("scispacy_linker")
    # Deduplicate by CUI in a single pass, keeping the highest score.
    best: dict[str, dict] = {}
    for ent in doc.ents:
        # NOTE(review): scispaCy >= 0.4 exposes linker candidates as
        # ``ent._.kb_ents`` (older releases used ``umls_ents``, which no
        # longer exists alongside the "scispacy_linker" pipe name used
        # above) — confirm against the pinned scispacy version.
        for cui, score in ent._.kb_ents:
            prev = best.get(cui)
            if prev is None or score > prev["score"]:
                meta = linker.kb.cui_to_entity[cui]
                best[cui] = {
                    "cui": cui,
                    "name": meta.canonical_name,
                    "score": float(score),
                    "types": meta.types,  # list of semantic type strings
                }
    return list(best.values())
|