mgbam committed
Commit 2c1c247 · verified · 1 Parent(s): 1786f57

Update mcp/nlp.py

Files changed (1)
  1. mcp/nlp.py +35 -52
mcp/nlp.py CHANGED
@@ -1,55 +1,38 @@
  # mcp/nlp.py
-
- #!/usr/bin/env python3
- """MedGenesis – spaCy helper for lightweight keyword extraction.
-
- Features
- ~~~~~~~~
- * Lazy‑loads **`en_core_web_sm`** at first call; cached thereafter.
- * If model missing, raises actionable RuntimeError — Dockerfile must
-   install via `python -m spacy download en_core_web_sm` (already in Dockerfile).
- * `extract_keywords` returns **unique named‑entity strings** (>2 chars)
-   stripped of whitespace, preserving original casing.
- * Adds fallback to simple noun‑chunk extraction when no entities found –
-   helps very short abstracts.
- """
- from __future__ import annotations
-
  import spacy
- from functools import lru_cache
- from typing import List
-
-
- # ---------------------------------------------------------------------
- # Model loader (cached)
- # ---------------------------------------------------------------------
-
- @lru_cache(maxsize=1)
- def _load_model():
-     try:
-         return spacy.load("en_core_web_sm")
-     except OSError as e:
-         raise RuntimeError(
-             "spaCy model 'en_core_web_sm' is not installed. Add\n"
-             " RUN python -m spacy download en_core_web_sm\n"
-             "to your Dockerfile build stage."
-         ) from e
-
-
- # ---------------------------------------------------------------------
- # Public API
- # ---------------------------------------------------------------------
-
- def extract_keywords(text: str, *, min_len: int = 3) -> List[str]:
-     """Return de‑duplicated entity keywords (fallback noun chunks)."""
-     nlp = _load_model()
+ import scispacy
+ from functools import lru_cache
+ from scispacy.abbreviation import AbbreviationDetector  # registers the "abbreviation_detector" factory
+ from scispacy.linking import EntityLinker  # registers the "scispacy_linker" factory
+
+ # Load a powerful biomedical model + UMLS linker (cached after the first call)
+ @lru_cache(maxsize=1)
+ def load_model():
+     nlp = spacy.load("en_core_sci_scibert")
+     # Detect abbreviations first so the linker can resolve them, then link entities to UMLS
+     nlp.add_pipe("abbreviation_detector")
+     nlp.add_pipe(
+         "scispacy_linker",
+         config={"resolve_abbreviations": True, "linker_name": "umls", "threshold": 0.75},
+     )
+     return nlp
+
+ nlp = load_model()
+
+ def extract_umls_concepts(text: str):
+     """
+     Returns a list of {cui, name, score, types} dicts.
+     """
      doc = nlp(text)
-
-     ents = {ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_len}
-     if ents:
-         return list(ents)
-
-     # Fallback: noun chunks if spaCy found no entities (rare for tiny texts)
-     chunks = {chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) >= min_len}
-     return list(chunks)
+     linker = nlp.get_pipe("scispacy_linker")
+     concepts = []
+     for ent in doc.ents:
+         for cui, score in ent._.kb_ents:  # (CUI, score) candidates from the linker
+             meta = linker.kb.cui_to_entity[cui]
+             concepts.append({
+                 "cui": cui,
+                 "name": meta.canonical_name,
+                 "score": float(score),
+                 "types": meta.types,  # UMLS semantic type identifiers (TUIs)
+             })
+     # Deduplicate by CUI, keep the highest-scoring candidate
+     seen = {}
+     for c in concepts:
+         prev = seen.get(c["cui"])
+         if not prev or c["score"] > prev["score"]:
+             seen[c["cui"]] = c
+     return list(seen.values())
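
For reference, a minimal usage sketch of the new helper. The sample abstract and the `mcp.nlp` import path are assumptions based on this repo's layout, not part of the commit; actual CUIs and scores depend on the installed scispacy model and UMLS knowledge base.

```python
# Illustrative only: input text and import path are assumed, output shape matches
# the dicts built by extract_umls_concepts ({"cui", "name", "score", "types"}).
from mcp.nlp import extract_umls_concepts

abstract = "Patients with EGFR-mutant non-small cell lung cancer received osimertinib."
for concept in extract_umls_concepts(abstract):
    # Print one linked UMLS concept per line: CUI, canonical name, score, semantic types
    print(f'{concept["cui"]}  {concept["name"]}  ({concept["score"]:.2f})  {concept["types"]}')
```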