mgbam committed on
Commit
c3f5ed6
·
verified ·
1 Parent(s): c4bf66f

Update mcp/nlp.py

Browse files
Files changed (1) hide show
  1. mcp/nlp.py +27 -23
mcp/nlp.py CHANGED
@@ -1,30 +1,34 @@
1
  # mcp/nlp.py
 
2
  import spacy
3
- from scispacy.linking import EntityLinker
 
4
 
5
- @spacy.util.cache_dir("~/.cache/scispacy")
6
- def load_model():
7
- nlp = spacy.load("en_core_sci_scibert")
8
- linker = EntityLinker(name="umls", resolve_abbreviations=True, threshold=0.75)
9
- nlp.add_pipe(linker)
10
- return nlp
 
 
11
 
12
- nlp = load_model()
13
-
14
- def extract_umls_concepts(text: str) -> list[dict]:
15
  """
16
- Returns unique UMLS concepts with confidence scores and semantic types.
 
 
17
  """
18
  doc = nlp(text)
19
- best = {}
20
- for ent in doc.ents:
21
- for cui, score in ent._.umls_ents:
22
- meta = nlp.get_pipe("scispacy_linker").kb.cui_to_entity[cui]
23
- if cui not in best or score > best[cui]["score"]:
24
- best[cui] = {
25
- "cui": cui,
26
- "name": meta.canonical_name,
27
- "score": float(score),
28
- "types": meta.types
29
- }
30
- return list(best.values())
 
1
  # mcp/nlp.py
2
+ import asyncio
3
  import spacy
4
+ from typing import List, Dict
5
+ from mcp.umls import lookup_umls
6
 
7
# Use the small general-purpose English pipeline. spacy.load raises OSError
# when the package has not been downloaded yet, in which case it is fetched
# once and the load is retried.
_SPACY_MODEL = "en_core_web_sm"

try:
    nlp = spacy.load(_SPACY_MODEL)
except OSError:
    from spacy.cli import download

    download(_SPACY_MODEL)
    nlp = spacy.load(_SPACY_MODEL)
15
 
16
async def extract_umls_concepts(text: str) -> List[Dict]:
    """Look up UMLS concepts for the named entities found in *text*.

    1) Run spaCy NER on the text.
    2) For each unique entity, do an async UMLS lookup.
    3) Return the list of successful concept dicts.

    Args:
        text: Free text to scan for entities.

    Returns:
        The lookup results that are dicts with a truthy ``"cui"`` entry,
        ordered by each entity's first appearance in the text. Lookups
        that raised, or that returned anything else, are dropped.
    """
    doc = nlp(text)

    # Strip before de-duplicating so whitespace variants of one entity trigger
    # a single lookup. dict.fromkeys keeps first-seen order, which makes the
    # result order reproducible (a set would not).
    stripped = (ent.text.strip() for ent in doc.ents)
    terms = list(dict.fromkeys(term for term in stripped if len(term) > 2))

    # Kick off all lookups in parallel. With return_exceptions=True a failed
    # lookup comes back as an exception object instead of aborting the batch.
    results = await asyncio.gather(
        *(lookup_umls(term) for term in terms),
        return_exceptions=True,
    )

    # Filter out failures and concepts without a CUI.
    return [r for r in results if isinstance(r, dict) and r.get("cui")]