File size: 3,208 Bytes
f6de761 1fdaf04 f6de761 1fdaf04 55cf8ec 1fdaf04 f6de761 8d292e0 1fdaf04 f6de761 1fdaf04 f6de761 1fdaf04 8d292e0 1fdaf04 8d292e0 55cf8ec 8d292e0 1fdaf04 8d292e0 1fdaf04 8d292e0 1fdaf04 8d292e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# mcp/umls.py
"""
Biomedical keyword/concept extractor for UMLS lookup.
- Uses SciSpaCy if available (best for biomedical text).
- Falls back to spaCy 'en_core_web_sm' (less accurate, general English).
- Final fallback: regex keyword extraction.
"""
import os, httpx, asyncio
from functools import lru_cache
# UMLS API config
# API key read from the UMLS_KEY environment variable; os.getenv yields None
# when the variable is unset, which the lookup code treats as "no auth".
UMLS_KEY = os.getenv("UMLS_KEY")
# Endpoint the API key is POSTed to as the first step of obtaining a ticket.
_AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
# Search endpoint queried (GET) with the term and a ticket.
_SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
# ------- Robust concept extractor ------------------------------------------
@lru_cache(maxsize=None)
def _load_spacy_model(name: str):
    """Load the spaCy pipeline called *name*, once per process.

    Loading a pipeline is expensive, so successful loads are memoised.
    Failures (spaCy or the model not installed) raise and are not cached.
    """
    import spacy

    return spacy.load(name)


def _entity_texts(nlp, text: str, min_length: int) -> list[str]:
    """Run *nlp* on *text* and return unique entity strings in first-seen order.

    Entity text is stripped of surrounding whitespace; entities shorter than
    *min_length* characters after stripping are dropped.
    """
    stripped = (ent.text.strip() for ent in nlp(text).ents)
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # result is deterministic (a set would reorder between runs).
    return list(dict.fromkeys(s for s in stripped if len(s) >= min_length))


def extract_umls_concepts(text: str, min_length: int = 3) -> list[str]:
    """
    Extract biomedical concepts (for UMLS) from text.

    Priority: SciSpaCy -> spaCy -> regex. Each stage is tried only when the
    previous one is unavailable, raises, or finds nothing.

    Args:
        text: Free text to scan. Empty, whitespace-only or ``None`` input
            yields an empty list.
        min_length: Minimum number of characters a concept must have.

    Returns:
        Unique concept strings in order of first appearance.
    """
    if not text or not text.strip():
        return []

    # Stage 1: SciSpaCy biomedical NER.
    try:
        # Imported only so a missing scispacy install skips this stage;
        # the name itself is not used below.
        import scispacy  # noqa: F401

        try:
            nlp = _load_spacy_model("en_ner_bionlp13cg_md")
        except Exception:
            nlp = _load_spacy_model("en_core_sci_sm")
        concepts = _entity_texts(nlp, text, min_length)
        if concepts:
            return concepts
    except Exception:
        # Deliberate best-effort: any failure falls through to the next stage.
        pass

    # Stage 2: general-English spaCy NER.
    try:
        concepts = _entity_texts(_load_spacy_model("en_core_web_sm"), text, min_length)
        if concepts:
            return concepts
    except Exception:
        # Deliberate best-effort: fall through to the regex stage.
        pass

    # Stage 3: plain keyword regex (letters, digits and hyphens).
    import re

    words = re.findall(r"\b[a-zA-Z0-9\-]+\b", text)
    return list(dict.fromkeys(w for w in words if len(w) >= min_length))
# ------- UMLS API lookup (degrades to None values when the key is missing/invalid) ----
async def _get_ticket() -> str | None:
    """
    Obtain a UMLS service ticket using the configured API key.

    Two requests are made with one client:
      1. POST the API key to ``_AUTH_URL``. The response body is expected to
         contain an ``action="..."`` attribute holding the next URL.
      2. POST the service name to that URL; the response body is the ticket.

    Returns:
        The ticket string, or ``None`` when ``UMLS_KEY`` is unset or any step
        fails (network error, HTTP error status, unexpected response body,
        empty ticket).
    """
    if not UMLS_KEY:
        return None
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            tgt_resp = await client.post(_AUTH_URL, data={"apikey": UMLS_KEY})
            tgt_resp.raise_for_status()
            # A body without action="..." raises IndexError, handled below.
            action = tgt_resp.text.split('action="')[1].split('"')[0]
            st_resp = await client.post(
                action, data={"service": "http://umlsks.nlm.nih.gov"}
            )
            # Without this check an HTTP error page would be handed back as
            # if it were a valid ticket.
            st_resp.raise_for_status()
            return st_resp.text.strip() or None
    except Exception:
        # Deliberate best-effort: the caller treats None as "no auth
        # available" and returns a result with all values set to None.
        return None
# Bounded cache of successful lookups, keyed by term, oldest entry first.
# functools.lru_cache must not wrap an ``async def``: it would cache the
# coroutine object, and awaiting the same coroutine a second time raises
# RuntimeError. Results are cached here instead.
_LOOKUP_CACHE: dict[str, dict] = {}
_LOOKUP_CACHE_MAX = 512


async def lookup_umls(term: str) -> dict:
    """
    Return {term,cui,name,definition}.
    If auth/quota fails → returns all keys as None (safe for UI).

    Successful lookups are cached per term (at most ``_LOOKUP_CACHE_MAX``
    entries, least recently used evicted first). Failed lookups are not
    cached, so a transient auth or network failure is retried on the next
    call.

    Args:
        term: Search string sent to the UMLS search endpoint.

    Returns:
        A new dict with keys ``term``, ``cui``, ``name`` and ``definition``.
        ``definition`` carries the hit's ``rootSource`` field.
    """
    cached = _LOOKUP_CACHE.get(term)
    if cached is not None:
        # Re-insert to mark the entry as most recently used.
        _LOOKUP_CACHE[term] = _LOOKUP_CACHE.pop(term)
        # Hand out a copy so callers cannot mutate the cached value.
        return dict(cached)

    ticket = await _get_ticket()
    if not ticket:
        return {"term": term, "cui": None, "name": None, "definition": None}

    params = {"string": term, "ticket": ticket, "pageSize": 1}
    try:
        async with httpx.AsyncClient(timeout=8) as client:
            resp = await client.get(_SEARCH_URL, params=params)
            resp.raise_for_status()
            items = resp.json().get("result", {}).get("results", [])
            hit = items[0] if items else {}
            result = {
                "term": term,
                "cui": hit.get("ui"),
                "name": hit.get("name"),
                "definition": hit.get("rootSource"),
            }
    except Exception:
        # Deliberate best-effort: any transport, status or payload-shape
        # problem degrades to a None-filled result instead of raising.
        return {"term": term, "cui": None, "name": None, "definition": None}

    if len(_LOOKUP_CACHE) >= _LOOKUP_CACHE_MAX:
        # Dicts iterate in insertion order, so the first key is the oldest.
        _LOOKUP_CACHE.pop(next(iter(_LOOKUP_CACHE)), None)
    _LOOKUP_CACHE[term] = result
    return dict(result)


# Keep the cache-reset hook that the former lru_cache wrapper exposed.
lookup_umls.cache_clear = _LOOKUP_CACHE.clear
|