MCP_Res / mcp /umls.py
mgbam's picture
Update mcp/umls.py
1fdaf04 verified
raw
history blame
3.21 kB
# mcp/umls.py
"""
Biomedical keyword/concept extractor for UMLS lookup.
- Uses SciSpaCy if available (best for biomedical text).
- Falls back to spaCy 'en_core_web_sm' (less accurate, general English).
- Final fallback: regex keyword extraction.
"""
import os, httpx, asyncio
from functools import lru_cache
# UMLS API config
# API key read from the UMLS_KEY environment variable; None when the variable
# is unset, in which case _get_ticket() returns None without any network call.
UMLS_KEY = os.getenv("UMLS_KEY")
# Endpoint that _get_ticket() POSTs the API key to as the first auth step.
_AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
# Endpoint that lookup_umls() queries with the search string and ticket.
_SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
# ------- Robust concept extractor ------------------------------------------
# Model names tried per tier, in priority order. The flag marks tiers that are
# only attempted when the SciSpaCy package itself can be imported.
_PIPELINE_TIERS = (
    (("en_ner_bionlp13cg_md", "en_core_sci_sm"), True),
    (("en_core_web_sm",), False),
)


@lru_cache(maxsize=None)
def _load_pipeline(name: str, needs_scispacy: bool = False):
    """Load a spaCy pipeline once per process; return None if it cannot be loaded."""
    try:
        if needs_scispacy:
            # The name is unused; the import gates this tier on SciSpaCy
            # being installed.
            import scispacy  # noqa: F401
        import spacy

        return spacy.load(name)
    except Exception:
        # Best-effort: a missing package or model simply disables this tier.
        return None


def _first_pipeline(names, needs_scispacy: bool):
    """Return the first loadable pipeline among *names*, or None."""
    for name in names:
        nlp = _load_pipeline(name, needs_scispacy)
        if nlp is not None:
            return nlp
    return None


def _unique_min_length(candidates, min_length: int) -> list[str]:
    """Keep candidates of at least *min_length* chars, de-duplicated in first-seen order."""
    return list(dict.fromkeys(c for c in candidates if len(c) >= min_length))


def extract_umls_concepts(text: str, min_length: int = 3) -> list[str]:
    """
    Extract candidate biomedical concepts (for UMLS lookup) from text.

    Priority: SciSpaCy -> spaCy -> regex. A tier is skipped when its model
    cannot be loaded, when it raises while processing, or when it yields no
    entity of at least ``min_length`` characters.

    Args:
        text: Free text to scan. Empty/None input yields an empty list.
        min_length: Minimum length (after stripping whitespace) of a kept term.

    Returns:
        Unique terms in order of first appearance.
    """
    if not text:
        return []

    for model_names, needs_scispacy in _PIPELINE_TIERS:
        # Pipelines are memoized by _load_pipeline, so the (slow) model load
        # happens at most once per model instead of on every call.
        nlp = _first_pipeline(model_names, needs_scispacy)
        if nlp is None:
            continue
        try:
            entities = _unique_min_length(
                (ent.text.strip() for ent in nlp(text).ents), min_length
            )
        except Exception:
            # Best-effort: a failing pipeline falls through to the next tier.
            continue
        if entities:
            return entities

    # Final fallback: regex keywords (letters, digits and inner hyphens).
    import re

    words = re.findall(r"\b[a-zA-Z0-9\-]+\b", text)
    return _unique_min_length(words, min_length)
# ------- UMLS API lookup (degrades safely when the key is missing/invalid) ----
async def _get_ticket() -> str | None:
    """
    Obtain a UMLS service ticket for the configured API key.

    Steps: POST the key to ``_AUTH_URL``, read the ``action="..."`` URL out of
    the response body, then POST to that URL to get the service ticket.

    Returns:
        The ticket text, or None when ``UMLS_KEY`` is unset or any step fails
        (network error, non-2xx status, unexpected or empty response body).
    """
    if not UMLS_KEY:
        return None
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            tgt_resp = await client.post(_AUTH_URL, data={"apikey": UMLS_KEY})
            tgt_resp.raise_for_status()
            # A body without the marker raises IndexError, handled below.
            action_url = tgt_resp.text.split('action="')[1].split('"')[0]
            st_resp = await client.post(
                action_url, data={"service": "http://umlsks.nlm.nih.gov"}
            )
            # Without this check an HTTP error page would be handed back as
            # if it were a valid ticket.
            st_resp.raise_for_status()
            ticket = st_resp.text.strip()
            return ticket or None
    except Exception:
        # Deliberate best-effort: callers treat None as "no auth available".
        return None
# Results of successful lookups, keyed by term. functools.lru_cache cannot be
# used on an ``async def``: it would cache the coroutine object, and awaiting
# that object a second time raises RuntimeError.
_LOOKUP_CACHE: dict[str, dict] = {}
_LOOKUP_CACHE_MAX = 512


async def lookup_umls(term: str) -> dict:
    """
    Return {term,cui,name,definition} for the best UMLS search hit.

    If auth/quota fails -> returns all keys except ``term`` as None (safe for
    UI). Only responses that were fetched and parsed successfully are cached,
    so a transient failure is retried on the next call.

    Args:
        term: Search string sent to the UMLS search endpoint.

    Returns:
        A fresh dict on every call; mutating it does not affect the cache.
    """
    cached = _LOOKUP_CACHE.get(term)
    if cached is not None:
        return dict(cached)

    empty = {"term": term, "cui": None, "name": None, "definition": None}
    ticket = await _get_ticket()
    if not ticket:
        return empty

    params = {"string": term, "ticket": ticket, "pageSize": 1}
    try:
        async with httpx.AsyncClient(timeout=8) as client:
            resp = await client.get(_SEARCH_URL, params=params)
            resp.raise_for_status()
            items = resp.json().get("result", {}).get("results", [])
            hit = items[0] if items else {}
            result = {
                "term": term,
                "cui": hit.get("ui"),
                "name": hit.get("name"),
                # NOTE(review): the hit's `rootSource` field is exposed as
                # "definition"; confirm that is what the UI expects.
                "definition": hit.get("rootSource"),
            }
    except Exception:
        # Best-effort: network, HTTP or payload-shape errors give the empty shape.
        return empty

    # Bounded cache; the oldest inserted entry is evicted first (FIFO).
    if len(_LOOKUP_CACHE) >= _LOOKUP_CACHE_MAX:
        _LOOKUP_CACHE.pop(next(iter(_LOOKUP_CACHE)), None)
    _LOOKUP_CACHE[term] = result
    return dict(result)


# Kept for callers that relied on the attribute the lru_cache wrapper provided.
lookup_umls.cache_clear = _LOOKUP_CACHE.clear