Spaces:
Building on CPU Upgrade

mgbam committed on
Commit
1fdaf04
·
verified ·
1 Parent(s): f6de761

Update mcp/umls.py

Browse files
Files changed (1) hide show
  1. mcp/umls.py +67 -39
mcp/umls.py CHANGED
@@ -1,66 +1,94 @@
1
  # mcp/umls.py
2
  """
3
- Async UMLS lookup via NLM REST API.
4
- Graceful fallback: always returns a dict with all keys even on failure.
 
 
 
5
  """
6
 
7
- import os
8
- import httpx
9
  from functools import lru_cache
10
 
11
- UMLS_KEY = os.getenv("UMLS_KEY") # Should be set as a secret (Hugging Face/colab/etc)
 
12
  _AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
13
  _SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
14
 
15
- # ── Internal helper: get a single-use service ticket ──────────────
16
- async def _get_ticket() -> str | None:
17
  """
18
- Return a single-use UMLS service ticket, or None if auth fails.
 
19
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  if not UMLS_KEY:
21
  return None
22
  try:
23
- async with httpx.AsyncClient(timeout=10) as client:
24
- # 1. Get ticket-granting ticket (TGT)
25
- tgt_resp = await client.post(_AUTH_URL, data={"apikey": UMLS_KEY})
26
- tgt_resp.raise_for_status()
27
- tgt_url = tgt_resp.text.split('action="')[1].split('"')[0]
28
- # 2. Exchange for single-use service ticket
29
- st_resp = await client.post(tgt_url, data={"service": "http://umlsks.nlm.nih.gov"})
30
- return st_resp.text.strip()
31
  except Exception:
32
  return None
33
 
34
- # ── Public API: async, returns safe dict ──────────────────────────
35
  @lru_cache(maxsize=512)
36
  async def lookup_umls(term: str) -> dict:
37
  """
38
- Lookup a UMLS concept by text term.
39
- Returns dict with {term, cui, name, definition}, always.
40
- If quota/auth fails: all values None except term.
41
  """
42
  ticket = await _get_ticket()
43
  if not ticket:
44
  return {"term": term, "cui": None, "name": None, "definition": None}
45
- params = {
46
- "string": term,
47
- "ticket": ticket,
48
- "pageSize": 1,
49
- }
50
  try:
51
- async with httpx.AsyncClient(timeout=8) as client:
52
- resp = await client.get(_SEARCH_URL, params=params)
53
- resp.raise_for_status()
54
- items = resp.json().get("result", {}).get("results", [])
55
- if items:
56
- hit = items[0]
57
- return {
58
- "term": term,
59
- "cui": hit.get("ui"),
60
- "name": hit.get("name"),
61
- "definition": hit.get("rootSource"),
62
- }
63
- else:
64
- return {"term": term, "cui": None, "name": None, "definition": None}
65
  except Exception:
66
  return {"term": term, "cui": None, "name": None, "definition": None}
 
1
  # mcp/umls.py
2
  """
3
+ Biomedical keyword/concept extractor for UMLS lookup.
4
+
5
+ - Uses SciSpaCy if available (best for biomedical text).
6
+ - Falls back to spaCy 'en_core_web_sm' (less accurate, general English).
7
+ - Final fallback: regex keyword extraction.
8
  """
9
 
10
+ import os, httpx, asyncio
 
11
  from functools import lru_cache
12
 
13
+ # UMLS API config
14
+ UMLS_KEY = os.getenv("UMLS_KEY")
15
  _AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
16
  _SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
17
 
18
+ # ------- Robust concept extractor ------------------------------------------
19
+ def extract_umls_concepts(text: str, min_length: int = 3) -> list[str]:
20
  """
21
+ Extract biomedical concepts (for UMLS) from text.
22
+ Priority: SciSpaCy -> spaCy -> regex.
23
  """
24
+ try:
25
+ # Try SciSpaCy first (best for biomedical NER)
26
+ import scispacy
27
+ import spacy
28
+ try:
29
+ nlp = spacy.load("en_ner_bionlp13cg_md")
30
+ except Exception:
31
+ nlp = spacy.load("en_core_sci_sm")
32
+ doc = nlp(text)
33
+ # All entities ≥ min_length, deduplicated
34
+ ents = {ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_length}
35
+ if ents:
36
+ return list(ents)
37
+ except Exception:
38
+ pass
39
+
40
+ # Fallback: spaCy general NER
41
+ try:
42
+ import spacy
43
+ nlp = spacy.load("en_core_web_sm")
44
+ doc = nlp(text)
45
+ ents = {ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_length}
46
+ if ents:
47
+ return list(ents)
48
+ except Exception:
49
+ pass
50
+
51
+ # Final fallback: Regex keywords
52
+ import re
53
+ words = re.findall(r"\b[a-zA-Z0-9\-]+\b", text)
54
+ return list({w for w in words if len(w) >= min_length})
55
+
56
+ # ------- UMLS API lookup (same as before, safe for missing/invalid key) ----
57
+ async def _get_ticket() -> str | None:
58
  if not UMLS_KEY:
59
  return None
60
  try:
61
+ async with httpx.AsyncClient(timeout=10) as c:
62
+ tgt = await c.post(_AUTH_URL, data={"apikey": UMLS_KEY})
63
+ tgt.raise_for_status()
64
+ action = tgt.text.split('action="')[1].split('"')[0]
65
+ st = await c.post(action, data={"service": "http://umlsks.nlm.nih.gov"})
66
+ return st.text
 
 
67
  except Exception:
68
  return None
69
 
 
70
  @lru_cache(maxsize=512)
71
  async def lookup_umls(term: str) -> dict:
72
  """
73
+ Return {term,cui,name,definition}.
74
+ If auth/quota fails → returns all keys as None (safe for UI).
 
75
  """
76
  ticket = await _get_ticket()
77
  if not ticket:
78
  return {"term": term, "cui": None, "name": None, "definition": None}
79
+
80
+ params = {"string": term, "ticket": ticket, "pageSize": 1}
 
 
 
81
  try:
82
+ async with httpx.AsyncClient(timeout=8) as c:
83
+ r = await c.get(_SEARCH_URL, params=params)
84
+ r.raise_for_status()
85
+ items = r.json().get("result", {}).get("results", [])
86
+ hit = items[0] if items else {}
87
+ return {
88
+ "term": term,
89
+ "cui": hit.get("ui"),
90
+ "name": hit.get("name"),
91
+ "definition": hit.get("rootSource"),
92
+ }
 
 
 
93
  except Exception:
94
  return {"term": term, "cui": None, "name": None, "definition": None}