|
|
|
""" |
|
Biomedical keyword/concept extractor for UMLS lookup. |
|
|
|
- Uses SciSpaCy if available (best for biomedical text). |
|
- Falls back to spaCy 'en_core_web_sm' (less accurate, general English). |
|
- Final fallback: regex keyword extraction. |
|
""" |
|
|
|
import os, httpx, asyncio |
|
from functools import lru_cache |
|
|
|
|
|
# API key for the UMLS services, read once from the environment at import
# time; stays None when the variable is not set.
UMLS_KEY = os.environ.get("UMLS_KEY")

# Endpoint that _get_ticket posts the API key to.
_AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"

# Endpoint that lookup_umls sends its search query to.
_SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
|
|
|
|
|
@lru_cache(maxsize=None)
def _load_spacy_pipeline(model_names: tuple[str, ...], needs_scispacy: bool):
    """Return the first loadable spaCy pipeline from *model_names*, else ``None``.

    The result (including ``None``) is cached per argument combination so the
    model is read from disk at most once per process instead of on every
    extraction call. A model installed while the process is running is
    therefore only picked up after a restart.
    """
    try:
        if needs_scispacy:
            # Imported only as an availability check for the SciSpaCy tier.
            import scispacy  # noqa: F401
        import spacy
    except Exception:
        # Deliberate best-effort: any import problem just disables this tier.
        return None

    for model_name in model_names:
        try:
            return spacy.load(model_name)
        except Exception:
            # Model not installed or incompatible; try the next candidate.
            continue
    return None


def _unique_entities(nlp, text: str, min_length: int) -> list[str]:
    """Run *nlp* on *text* and return its stripped entity texts, de-duplicated.

    Entities shorter than *min_length* are dropped. Order is that of first
    occurrence in the document. Returns ``[]`` if the pipeline raises.
    """
    try:
        doc = nlp(text)
    except Exception:
        return []
    stripped = (ent.text.strip() for ent in doc.ents)
    return list(dict.fromkeys(s for s in stripped if len(s) >= min_length))


def extract_umls_concepts(text: str, min_length: int = 3) -> list[str]:
    """Extract candidate biomedical concepts (for UMLS lookup) from *text*.

    Tiers are tried in order and the first one that yields anything wins:

    1. SciSpaCy (``en_ner_bionlp13cg_md``, then ``en_core_sci_sm``).
    2. spaCy ``en_core_web_sm``.
    3. Regex tokenisation on letters, digits and hyphens.

    Args:
        text: Input text. Empty or ``None`` input yields ``[]``.
        min_length: Minimum number of characters a concept must have.

    Returns:
        Unique concept strings in order of first appearance.
    """
    if not text:
        return []

    import re

    tiers = (
        (("en_ner_bionlp13cg_md", "en_core_sci_sm"), True),
        (("en_core_web_sm",), False),
    )
    for model_names, needs_scispacy in tiers:
        nlp = _load_spacy_pipeline(model_names, needs_scispacy)
        if nlp is None:
            continue
        found = _unique_entities(nlp, text, min_length)
        if found:
            return found

    # Final fallback: plain word-like tokens, de-duplicated in reading order.
    tokens = re.findall(r"\b[a-zA-Z0-9\-]+\b", text)
    return list(dict.fromkeys(t for t in tokens if len(t) >= min_length))
|
|
|
|
|
async def _get_ticket() -> str | None:
    """Fetch a UMLS service ticket, or ``None`` if it cannot be obtained.

    Two requests are made with one client:

    1. POST the API key to ``_AUTH_URL``. The response body is expected to
       contain an ``action="..."`` attribute whose value is the URL for the
       next step.
    2. POST to that URL to receive the service ticket as the response text.

    Returns:
        The ticket text, or ``None`` when ``UMLS_KEY`` is unset, either
        request fails or returns an error status, the ``action`` URL is
        missing, or the ticket body is empty.
    """
    if not UMLS_KEY:
        return None

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            tgt_response = await client.post(_AUTH_URL, data={"apikey": UMLS_KEY})
            tgt_response.raise_for_status()

            _, marker, remainder = tgt_response.text.partition('action="')
            ticket_url = remainder.partition('"')[0]
            if not marker or not ticket_url:
                return None

            st_response = await client.post(
                ticket_url, data={"service": "http://umlsks.nlm.nih.gov"}
            )
            # Without this check an HTTP error page would be returned as if
            # it were a valid ticket.
            st_response.raise_for_status()

            ticket = st_response.text.strip()
            return ticket or None
    except Exception:
        # Deliberate best-effort: callers treat None as "no auth available".
        return None
|
|
|
# Upper bound on remembered terms (same size the previous lru_cache used).
_LOOKUP_CACHE_MAX = 512

# term -> result dict, kept in least-recently-used-first order.
_lookup_cache: dict[str, dict] = {}


def _empty_umls_result(term: str) -> dict:
    """Return the placeholder result used when no UMLS data is available."""
    return {"term": term, "cui": None, "name": None, "definition": None}


async def lookup_umls(term: str) -> dict:
    """Look up *term* in UMLS and return ``{term, cui, name, definition}``.

    If authentication or the search request fails, ``cui``, ``name`` and
    ``definition`` are all ``None`` (safe for UI); ``term`` is always echoed
    back.

    Results of successful requests are cached (up to ``_LOOKUP_CACHE_MAX``
    terms, least recently used evicted first). Failures are not cached so a
    transient auth/quota problem does not stick. The cache is explicit
    because ``functools.lru_cache`` on a coroutine function stores the
    coroutine object itself, which cannot be awaited a second time.

    Args:
        term: Search string sent to the UMLS search endpoint.

    Returns:
        A fresh dict per call, so callers may mutate it freely.
    """
    cached = _lookup_cache.get(term)
    if cached is not None:
        # Re-insert to mark the entry as most recently used.
        _lookup_cache[term] = _lookup_cache.pop(term)
        return dict(cached)

    ticket = await _get_ticket()
    if not ticket:
        return _empty_umls_result(term)

    params = {"string": term, "ticket": ticket, "pageSize": 1}
    try:
        async with httpx.AsyncClient(timeout=8) as client:
            response = await client.get(_SEARCH_URL, params=params)
            response.raise_for_status()
            items = response.json().get("result", {}).get("results", [])
            hit = items[0] if items else {}
            result = {
                "term": term,
                "cui": hit.get("ui"),
                "name": hit.get("name"),
                # NOTE(review): "rootSource" looks like a source identifier
                # rather than a definition text -- confirm against the API.
                "definition": hit.get("rootSource"),
            }
    except Exception:
        # Deliberate best-effort: network/parse errors degrade to "no data".
        return _empty_umls_result(term)

    # Another coroutine may have stored the same term while we were awaiting.
    _lookup_cache.pop(term, None)
    if len(_lookup_cache) >= _LOOKUP_CACHE_MAX:
        _lookup_cache.pop(next(iter(_lookup_cache)))
    _lookup_cache[term] = result
    return dict(result)


# Kept for callers that used the lru_cache-provided ``cache_clear`` hook.
lookup_umls.cache_clear = _lookup_cache.clear
|
|