mgbam committed on
Commit
3f9a9ea
·
verified ·
1 Parent(s): 3eda6bb

Update mcp/nlp.py

Browse files
Files changed (1) hide show
  1. mcp/nlp.py +11 -19
mcp/nlp.py CHANGED
@@ -1,38 +1,30 @@
1
  # mcp/nlp.py
2
  import spacy
3
- import scispacy
4
  from scispacy.linking import EntityLinker
5
 
6
- # Load a powerful biomedical model + UMLS linker
7
  @spacy.util.cache_dir("~/.cache/scispacy")
8
  def load_model():
9
  nlp = spacy.load("en_core_sci_scibert")
10
- # Resolve abbreviations then link to UMLS
11
  linker = EntityLinker(name="umls", resolve_abbreviations=True, threshold=0.75)
12
  nlp.add_pipe(linker)
13
  return nlp
14
 
15
  nlp = load_model()
16
 
17
- def extract_umls_concepts(text: str):
18
  """
19
- Returns a list of {cui, concept_name, score, semantic_types}.
20
  """
21
  doc = nlp(text)
22
- concepts = []
23
  for ent in doc.ents:
24
  for cui, score in ent._.umls_ents:
25
  meta = nlp.get_pipe("scispacy_linker").kb.cui_to_entity[cui]
26
- concepts.append({
27
- "cui": cui,
28
- "name": meta.canonical_name,
29
- "score": float(score),
30
- "types": meta.types # list of semantic type strings
31
- })
32
- # Deduplicate by CUI, keep highest score
33
- seen = {}
34
- for c in concepts:
35
- prev = seen.get(c["cui"])
36
- if not prev or c["score"] > prev["score"]:
37
- seen[c["cui"]] = c
38
- return list(seen.values())
 
1
  # mcp/nlp.py
2
  import spacy
 
3
  from scispacy.linking import EntityLinker
4
 
 
def load_model():
    """Build the biomedical spaCy pipeline with a UMLS entity linker attached.

    Loads the ``en_core_sci_scibert`` model and appends scispaCy's
    ``scispacy_linker`` component, configured to link against the UMLS
    knowledge base with a candidate score threshold of 0.75.

    Returns:
        The loaded spaCy ``Language`` pipeline, with the linker registered
        under the pipe name ``"scispacy_linker"``.

    Raises:
        OSError: If the ``en_core_sci_scibert`` model package is not
            installed (raised by ``spacy.load``).
    """
    # NOTE: the former ``@spacy.util.cache_dir("~/.cache/scispacy")`` decorator
    # was dropped because ``spacy.util`` exposes no such attribute, so the
    # module failed at import time. scispaCy's download cache location is
    # presumably meant to be set through the ``SCISPACY_CACHE`` environment
    # variable before scispaCy is imported -- TODO confirm for this deployment.
    pipeline = spacy.load("en_core_sci_scibert")

    # spaCy v3 only accepts registered factory names in ``add_pipe``; the
    # "scispacy_linker" factory is registered as a side effect of importing
    # ``scispacy.linking`` at the top of this file. Adding it by name also
    # guarantees the pipe is retrievable via ``get_pipe("scispacy_linker")``.
    # ``resolve_abbreviations`` only has an effect when an abbreviation
    # detector is also present in the pipeline.
    pipeline.add_pipe(
        "scispacy_linker",
        config={
            "linker_name": "umls",
            "resolve_abbreviations": True,
            "threshold": 0.75,
        },
    )
    return pipeline


# Shared pipeline instance, built once when this module is imported.
nlp = load_model()
13
 
def extract_umls_concepts(text: str) -> list[dict]:
    """Return the unique UMLS concepts linked to entities found in *text*.

    Runs the module-level ``nlp`` pipeline over the text and collects every
    knowledge-base candidate attached to each detected entity. When the same
    CUI is proposed more than once, only the highest-scoring occurrence is
    kept.

    Args:
        text: Raw text to analyse.

    Returns:
        One dict per distinct CUI, in first-seen order, with the keys
        ``"cui"``, ``"name"`` (canonical concept name), ``"score"`` (linker
        score as a float) and ``"types"`` (the concept's semantic types as
        stored in the knowledge base).
    """
    doc = nlp(text)

    # Resolve the linker's knowledge base once instead of once per candidate.
    cui_to_entity = nlp.get_pipe("scispacy_linker").kb.cui_to_entity

    best: dict[str, dict] = {}
    for ent in doc.ents:
        # ``kb_ents`` is the documented span extension holding (cui, score)
        # pairs; ``umls_ents`` is its legacy alias.
        for cui, score in ent._.kb_ents:
            score = float(score)
            known = best.get(cui)
            if known is not None and score <= known["score"]:
                continue  # An equal or better match for this CUI is already kept.
            meta = cui_to_entity[cui]
            best[cui] = {
                "cui": cui,
                "name": meta.canonical_name,
                "score": score,
                "types": meta.types,
            }
    return list(best.values())