File size: 5,235 Bytes
4f7b321 1fdaf04 4f7b321 f6de761 4f7b321 55cf8ec 8d292e0 4f7b321 f6de761 4f7b321 f6de761 4f7b321 1fdaf04 4f7b321 1fdaf04 4f7b321 1fdaf04 4f7b321 1fdaf04 4f7b321 1fdaf04 4f7b321 1fdaf04 4f7b321 8d292e0 4f7b321 8d292e0 4f7b321 8d292e0 4f7b321 55cf8ec 4f7b321 8d292e0 4f7b321 8d292e0 4f7b321 8d292e0 1fdaf04 8d292e0 4f7b321 1fdaf04 4f7b321 1fdaf04 8d292e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import os
import re
import httpx
import asyncio
from functools import lru_cache
from pathlib import Path
from typing import List, Optional, Dict, Any
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
UMLS_API_KEY = os.getenv("UMLS_KEY")
UMLS_AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
UMLS_SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
# ---------------------------------------------------------------------------
# Named types
# ---------------------------------------------------------------------------
class UMLSResult(Dict[str, Optional[str]]):
"""
Represents a single UMLS lookup result.
Keys: term, cui, name, definition
"""
pass
# ---------------------------------------------------------------------------
# NLP model loading with caching
# ---------------------------------------------------------------------------
@lru_cache(maxsize=None)
def _load_spacy_model(model_name: str):
import spacy
return spacy.load(model_name)
@lru_cache(maxsize=None)
def _load_scispacy_model():
# Prefer the BioNLP model; fall back to the smaller sci model
try:
return _load_spacy_model("en_ner_bionlp13cg_md")
except Exception:
return _load_spacy_model("en_core_sci_sm")
@lru_cache(maxsize=None)
def _load_general_spacy():
return _load_spacy_model("en_core_web_sm")
# ---------------------------------------------------------------------------
# Concept extraction utilities
# ---------------------------------------------------------------------------
def _extract_entities(nlp, text: str, min_length: int) -> List[str]:
"""
Run a spaCy nlp pipeline over text and return unique entity texts
of at least min_length.
"""
doc = nlp(text)
ents = {ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_length}
return list(ents)
def _regex_fallback(text: str, min_length: int) -> List[str]:
"""
Simple regex-based token extraction for fallback.
"""
tokens = re.findall(r"\b[a-zA-Z0-9\-]+\b", text)
return list({t for t in tokens if len(t) >= min_length})
def extract_umls_concepts(text: str, min_length: int = 3) -> List[str]:
"""
Extract biomedical concepts from text in priority order:
1. SciSpaCy (en_ner_bionlp13cg_md or en_core_sci_sm)
2. spaCy general NER (en_core_web_sm)
3. Regex tokens
Guaranteed to return a list of unique strings.
"""
# 1) SciSpaCy pipeline
try:
scispacy_nlp = _load_scispacy_model()
entities = _extract_entities(scispacy_nlp, text, min_length)
if entities:
return entities
except ImportError:
# SciSpaCy not installed
pass
except Exception:
# Unexpected failure in scispacy
pass
# 2) General spaCy pipeline
try:
general_nlp = _load_general_spacy()
entities = _extract_entities(general_nlp, text, min_length)
if entities:
return entities
except Exception:
pass
# 3) Regex fallback
return _regex_fallback(text, min_length)
# ---------------------------------------------------------------------------
# UMLS API integration
# ---------------------------------------------------------------------------
async def _get_umls_ticket() -> Optional[str]:
"""
Obtain a UMLS service ticket for subsequent queries.
Returns None if API key is missing or authentication fails.
"""
if not UMLS_API_KEY:
return None
try:
async with httpx.AsyncClient(timeout=10) as client:
response = await client.post(
UMLS_AUTH_URL, data={"apikey": UMLS_API_KEY}
)
response.raise_for_status()
tgt_url = response.text.split('action="')[1].split('"')[0]
service_resp = await client.post(
tgt_url, data={"service": "http://umlsks.nlm.nih.gov"}
)
return service_resp.text
except Exception:
return None
@lru_cache(maxsize=512)
async def lookup_umls(term: str) -> UMLSResult:
"""
Look up a term in the UMLS API.
Returns a dict containing the original term, its CUI, preferred name, and definition.
On failure or quota issues, returns all values except 'term' as None.
"""
ticket = await _get_umls_ticket()
if not ticket:
return {"term": term, "cui": None, "name": None, "definition": None}
params = {"string": term, "ticket": ticket, "pageSize": 1}
try:
async with httpx.AsyncClient(timeout=8) as client:
resp = await client.get(UMLS_SEARCH_URL, params=params)
resp.raise_for_status()
results = resp.json().get("result", {}).get("results", [])
first = results[0] if results else {}
return {
"term": term,
"cui": first.get("ui"),
"name": first.get("name"),
"definition": first.get("definition") or first.get("rootSource"),
}
except Exception:
return {"term": term, "cui": None, "name": None, "definition": None}
|