|
import os |
|
import re |
|
import httpx |
|
import asyncio |
|
from functools import lru_cache |
|
from pathlib import Path |
|
from typing import List, Optional, Dict, Any |
|
|
|
|
|
|
|
|
|
# UMLS (Unified Medical Language System) credentials and endpoints.
# The API key is read from the UMLS_KEY environment variable; it may be
# None, in which case ticket acquisition (and therefore every lookup)
# degrades gracefully to empty results.
UMLS_API_KEY = os.getenv("UMLS_KEY")

# CAS-style authentication endpoint: POSTing the API key here yields a
# ticket-granting ticket used to mint per-request service tickets.
UMLS_AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"

# REST search endpoint for the "current" UMLS release.
UMLS_SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
|
|
|
|
|
|
|
|
|
class UMLSResult(Dict[str, Optional[str]]):
    """A single UMLS lookup result.

    Behaves exactly like a plain ``dict``; the subclass exists only to
    give the mapping a descriptive name in type annotations.

    Expected keys: ``term``, ``cui``, ``name``, ``definition``.
    """
|
|
|
|
|
|
|
|
|
@lru_cache(maxsize=None)
def _load_spacy_model(model_name: str):
    """Load a spaCy pipeline by name, memoizing per model name.

    The spaCy import is deferred so that code paths which never touch
    NER do not pay the import cost.
    """
    import spacy

    pipeline = spacy.load(model_name)
    return pipeline
|
|
|
@lru_cache(maxsize=None)
def _load_scispacy_model():
    """Return the best available SciSpaCy pipeline.

    Prefers the biomedical NER model (``en_ner_bionlp13cg_md``); if that
    cannot be loaded for any reason, falls back to the small scientific
    model (``en_core_sci_sm``).
    """
    for preferred in ("en_ner_bionlp13cg_md",):
        try:
            return _load_spacy_model(preferred)
        except Exception:
            # Model not installed / failed to load — try the fallback.
            continue
    return _load_spacy_model("en_core_sci_sm")
|
|
|
@lru_cache(maxsize=None)
def _load_general_spacy():
    """Load and memoize the general-purpose English spaCy model."""
    model = _load_spacy_model("en_core_web_sm")
    return model
|
|
|
|
|
|
|
|
|
def _extract_entities(nlp, text: str, min_length: int) -> List[str]: |
|
""" |
|
Run a spaCy nlp pipeline over text and return unique entity texts |
|
of at least min_length. |
|
""" |
|
doc = nlp(text) |
|
ents = {ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_length} |
|
return list(ents) |
|
|
|
|
|
def _regex_fallback(text: str, min_length: int) -> List[str]: |
|
""" |
|
Simple regex-based token extraction for fallback. |
|
""" |
|
tokens = re.findall(r"\b[a-zA-Z0-9\-]+\b", text) |
|
return list({t for t in tokens if len(t) >= min_length}) |
|
|
|
|
|
def extract_umls_concepts(text: str, min_length: int = 3) -> List[str]:
    """Extract candidate biomedical concept strings from *text*.

    Extraction strategies are tried from most to least specialized:

    1. SciSpaCy (``en_ner_bionlp13cg_md`` or ``en_core_sci_sm``)
    2. General-purpose spaCy NER (``en_core_web_sm``)
    3. Regex tokenization

    The first strategy that yields at least one entity wins; a strategy
    that fails to load or raises is silently skipped.  Always returns a
    list of unique strings (possibly empty).
    """
    for loader in (_load_scispacy_model, _load_general_spacy):
        try:
            found = _extract_entities(loader(), text, min_length)
        except Exception:
            # Model unavailable or pipeline error — fall through to the
            # next, less specialized strategy.
            continue
        if found:
            return found

    # Last resort: plain token extraction, which never raises.
    return _regex_fallback(text, min_length)
|
|
|
|
|
|
|
|
|
async def _get_umls_ticket() -> Optional[str]:
    """Obtain a UMLS service ticket for subsequent queries.

    Performs the two-step CAS flow: POST the API key to get a
    ticket-granting ticket (TGT), then POST the service URL to the TGT
    endpoint to receive the service ticket.  Returns ``None`` when no
    API key is configured or any step of the exchange fails.
    """
    if not UMLS_API_KEY:
        return None

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            auth_resp = await client.post(
                UMLS_AUTH_URL, data={"apikey": UMLS_API_KEY}
            )
            auth_resp.raise_for_status()
            # The CAS response is an HTML form; the TGT endpoint is the
            # form's action attribute.  NOTE(review): string-splitting is
            # fragile — an HTML parser would be sturdier.
            tgt_endpoint = auth_resp.text.split('action="')[1].split('"')[0]
            ticket_resp = await client.post(
                tgt_endpoint, data={"service": "http://umlsks.nlm.nih.gov"}
            )
            return ticket_resp.text
    except Exception:
        # Best effort: any network/HTTP/parse failure means "no ticket".
        return None
|
|
|
|
|
# Manual result cache.  functools.lru_cache MUST NOT be used on an async
# function: it caches the coroutine object itself, and a coroutine can
# only be awaited once — the second cache hit would raise
# "RuntimeError: cannot reuse already awaited coroutine".
_LOOKUP_CACHE: Dict[str, Dict[str, Optional[str]]] = {}
_LOOKUP_CACHE_MAX = 512  # same bound the old lru_cache(maxsize=512) intended


async def lookup_umls(term: str) -> UMLSResult:
    """
    Look up a term in the UMLS API.

    Returns a dict containing the original term, its CUI, preferred name,
    and definition.  On failure or quota issues, returns all values except
    'term' as None.

    Successful lookups are cached (bounded to ``_LOOKUP_CACHE_MAX``
    entries); failures are *not* cached, so a later call may retry.
    """
    cached = _LOOKUP_CACHE.get(term)
    if cached is not None:
        return cached

    ticket = await _get_umls_ticket()
    if not ticket:
        # No API key or authentication failed — empty result, not cached.
        return {"term": term, "cui": None, "name": None, "definition": None}

    params = {"string": term, "ticket": ticket, "pageSize": 1}
    try:
        async with httpx.AsyncClient(timeout=8) as client:
            resp = await client.get(UMLS_SEARCH_URL, params=params)
            resp.raise_for_status()
            results = resp.json().get("result", {}).get("results", [])
            first = results[0] if results else {}
            result = {
                "term": term,
                "cui": first.get("ui"),
                "name": first.get("name"),
                # The search endpoint rarely returns a definition; fall
                # back to the source vocabulary so callers get something.
                "definition": first.get("definition") or first.get("rootSource"),
            }
    except Exception:
        # Network / HTTP / JSON errors all degrade to an empty result.
        return {"term": term, "cui": None, "name": None, "definition": None}

    # Crude bound: flush wholesale when full instead of tracking LRU order.
    if len(_LOOKUP_CACHE) >= _LOOKUP_CACHE_MAX:
        _LOOKUP_CACHE.clear()
    _LOOKUP_CACHE[term] = result
    return result
|
|