Update mcp/umls.py
Browse files- mcp/umls.py +36 -146
mcp/umls.py
CHANGED
@@ -1,152 +1,42 @@
|
|
1 |
-
|
2 |
-
import
|
3 |
-
import httpx
|
4 |
-
import asyncio
|
5 |
from functools import lru_cache
|
6 |
-
from pathlib import Path
|
7 |
-
from typing import List, Optional, Dict, Any
|
8 |
|
9 |
-
# ---------------------------------------------------------------------------
|
10 |
-
# Configuration
|
11 |
-
# ---------------------------------------------------------------------------
|
12 |
UMLS_API_KEY = os.getenv("UMLS_KEY")
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
# ---------------------------------------------------------------------------
|
27 |
-
# NLP model loading with caching
|
28 |
-
# ---------------------------------------------------------------------------
|
29 |
-
@lru_cache(maxsize=None)
|
30 |
-
def _load_spacy_model(model_name: str):
|
31 |
-
import spacy
|
32 |
-
return spacy.load(model_name)
|
33 |
-
|
34 |
-
@lru_cache(maxsize=None)
def _load_scispacy_model():
    """Return a biomedical spaCy pipeline, memoised after the first success.

    ``en_ner_bionlp13cg_md`` is tried first. If loading it raises any
    ``Exception``, ``en_core_sci_sm`` is loaded instead; an error from that
    second attempt propagates to the caller.
    """
    preferred, fallback = "en_ner_bionlp13cg_md", "en_core_sci_sm"
    try:
        pipeline = _load_spacy_model(preferred)
    except Exception:
        pipeline = _load_spacy_model(fallback)
    return pipeline
|
41 |
-
|
42 |
-
@lru_cache(maxsize=None)
def _load_general_spacy():
    """Return the ``en_core_web_sm`` spaCy pipeline, memoised after first load."""
    general_pipeline = _load_spacy_model("en_core_web_sm")
    return general_pipeline
|
45 |
-
|
46 |
-
# ---------------------------------------------------------------------------
|
47 |
-
# Concept extraction utilities
|
48 |
-
# ---------------------------------------------------------------------------
|
49 |
-
def _extract_entities(nlp, text: str, min_length: int) -> List[str]:
|
50 |
-
"""
|
51 |
-
Run a spaCy nlp pipeline over text and return unique entity texts
|
52 |
-
of at least min_length.
|
53 |
-
"""
|
54 |
-
doc = nlp(text)
|
55 |
-
ents = {ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_length}
|
56 |
-
return list(ents)
|
57 |
-
|
58 |
-
|
59 |
-
def _regex_fallback(text: str, min_length: int) -> List[str]:
|
60 |
-
"""
|
61 |
-
Simple regex-based token extraction for fallback.
|
62 |
-
"""
|
63 |
-
tokens = re.findall(r"\b[a-zA-Z0-9\-]+\b", text)
|
64 |
-
return list({t for t in tokens if len(t) >= min_length})
|
65 |
-
|
66 |
-
|
67 |
-
def extract_umls_concepts(text: str, min_length: int = 3) -> List[str]:
    """Extract candidate biomedical concepts from *text*.

    Extractors are tried in priority order, and the first one that yields
    at least one entity wins:

    1. SciSpaCy (``en_ner_bionlp13cg_md``, else ``en_core_sci_sm``)
    2. General spaCy NER (``en_core_web_sm``)
    3. Regex tokens

    Any exception from loading or running a spaCy pipeline (a missing
    package, a missing model, a runtime failure) is swallowed and the
    next stage is tried. The regex stage always runs last.

    Returns:
        A list of unique strings, each at least *min_length* characters.
    """
    for load_pipeline in (_load_scispacy_model, _load_general_spacy):
        try:
            found = _extract_entities(load_pipeline(), text, min_length)
        except Exception:
            # Best effort: this stage is unavailable, move on to the next.
            continue
        if found:
            return found

    # No pipeline produced anything usable.
    return _regex_fallback(text, min_length)
|
100 |
-
|
101 |
-
# ---------------------------------------------------------------------------
|
102 |
-
# UMLS API integration
|
103 |
-
# ---------------------------------------------------------------------------
|
104 |
-
async def _get_umls_ticket() -> Optional[str]:
    """Obtain a UMLS service ticket for subsequent queries.

    The flow is two POSTs: the API key is sent to ``UMLS_AUTH_URL``, the
    ``action="..."`` URL is pulled out of that response body, and the
    service name is then posted to that URL. The body of the second
    response is returned as the ticket.

    Returns:
        The ticket text, or ``None`` when ``UMLS_API_KEY`` is unset or any
        step fails (network error, non-2xx status, unexpected response
        body).
    """
    if not UMLS_API_KEY:
        return None

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            response = await client.post(
                UMLS_AUTH_URL, data={"apikey": UMLS_API_KEY}
            )
            response.raise_for_status()
            # The URL for the second step is embedded in the response as
            # an action="..." attribute.
            tgt_url = response.text.split('action="')[1].split('"')[0]
            service_resp = await client.post(
                tgt_url, data={"service": "http://umlsks.nlm.nih.gov"}
            )
            # Without this check an HTTP error body would be handed back
            # to the caller as if it were a valid ticket.
            service_resp.raise_for_status()
            return service_resp.text
    except Exception:
        # Deliberately best effort: callers treat None as "no ticket".
        return None
|
125 |
-
|
126 |
|
127 |
@lru_cache(maxsize=512)
|
128 |
-
async def lookup_umls(term: str) ->
|
129 |
-
|
130 |
-
Look up a term in the UMLS API.
|
131 |
-
Returns a dict containing the original term, its CUI, preferred name, and definition.
|
132 |
-
On failure or quota issues, returns all values except 'term' as None.
|
133 |
-
"""
|
134 |
-
ticket = await _get_umls_ticket()
|
135 |
-
if not ticket:
|
136 |
-
return {"term": term, "cui": None, "name": None, "definition": None}
|
137 |
-
|
138 |
params = {"string": term, "ticket": ticket, "pageSize": 1}
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# mcp/umls.py
|
2 |
+
import os, httpx
|
|
|
|
|
3 |
from functools import lru_cache
|
|
|
|
|
4 |
|
|
|
|
|
|
|
5 |
UMLS_API_KEY = os.getenv("UMLS_KEY")
|
6 |
+
AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
|
7 |
+
SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
|
8 |
+
CONTENT_URL = "https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{cui}"
|
9 |
+
|
10 |
+
async def _get_ticket() -> str:
    """Fetch a UMLS service ticket using ``UMLS_API_KEY``.

    Posts the API key to ``AUTH_URL``, takes the URL found in the
    ``action="..."`` attribute of the response body, posts the service
    name to that URL, and returns the body of that second response.

    Raises:
        httpx.HTTPStatusError: If either response has an error status.
        IndexError: If the first response contains no ``action="`` marker.
    """
    async with httpx.AsyncClient(timeout=10) as client:
        key_response = await client.post(AUTH_URL, data={"apikey": UMLS_API_KEY})
        key_response.raise_for_status()

        after_marker = key_response.text.split('action="')[1]
        ticket_url = after_marker.split('"')[0]

        ticket_response = await client.post(
            ticket_url, data={"service": "http://umlsks.nlm.nih.gov"}
        )
        ticket_response.raise_for_status()
        return ticket_response.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
# Completed lookups keyed by term, least recently used first. This replaces
# ``@lru_cache`` on the coroutine function: ``lru_cache`` would store the
# coroutine *object*, and awaiting that a second time raises
# ``RuntimeError: cannot reuse already awaited coroutine``.
_LOOKUP_CACHE: dict = {}
_LOOKUP_CACHE_MAX = 512


def _empty_lookup(term: str) -> dict:
    """Return the no-match result for *term*, with the same keys as a hit."""
    return {"term": term, "cui": None, "name": None, "definition": "", "types": []}


async def _fetch_umls_concept(term: str) -> dict:
    """Query UMLS for *term* and build the result dict (no caching)."""
    async with httpx.AsyncClient(timeout=10) as client:
        # UTS service tickets are single-use, so every request gets its own.
        search = await client.get(
            SEARCH_URL,
            params={"string": term, "ticket": await _get_ticket(), "pageSize": 1},
        )
        search.raise_for_status()
        hits = search.json().get("result", {}).get("results", [])
        cui = hits[0].get("ui") if hits else None
        # The search endpoint can signal "nothing found" with a placeholder
        # row whose ui is the literal string "NONE".
        if not cui or cui == "NONE":
            return _empty_lookup(term)
        name = hits[0].get("name")

        concept = await client.get(
            CONTENT_URL.format(cui=cui), params={"ticket": await _get_ticket()}
        )
        concept.raise_for_status()
        entry = concept.json().get("result", {})
        types = [t["name"] for t in entry.get("semanticTypes") or []]

        # In the concept payload ``definitions`` is a URI to a separate
        # listing (or the string "NONE"), not an inline list. Follow the
        # link when present, and still accept an inline list defensively.
        definitions = entry.get("definitions")
        if isinstance(definitions, str) and definitions.startswith("http"):
            listing = await client.get(
                definitions, params={"ticket": await _get_ticket()}
            )
            listing.raise_for_status()
            definitions = listing.json().get("result", [])
        definition = ""
        if isinstance(definitions, list) and definitions:
            definition = definitions[0].get("value", "")

        return {
            "term": term,
            "cui": cui,
            "name": name,
            "definition": definition,
            "types": types,
        }


async def lookup_umls(term: str) -> dict:
    """Look up *term* in the UMLS REST API, caching completed results.

    Args:
        term: The string to search for.

    Returns:
        A dict with keys ``term``, ``cui``, ``name``, ``definition`` and
        ``types``. When UMLS has no match, ``cui`` and ``name`` are
        ``None``, ``definition`` is ``""`` and ``types`` is ``[]``.

    Raises:
        httpx.HTTPStatusError: If any UMLS request returns an error status.
            Failed lookups are not cached.
    """
    cached = _LOOKUP_CACHE.get(term)
    if cached is not None:
        # Re-insert so this entry becomes the most recently used.
        _LOOKUP_CACHE[term] = _LOOKUP_CACHE.pop(term)
        return dict(cached)

    result = await _fetch_umls_concept(term)
    _LOOKUP_CACHE[term] = result
    if len(_LOOKUP_CACHE) > _LOOKUP_CACHE_MAX:
        # dicts iterate in insertion order, so the first key is the oldest.
        del _LOOKUP_CACHE[next(iter(_LOOKUP_CACHE))]
    return dict(result)


# Keep the ``cache_clear`` hook that ``lru_cache`` used to provide.
lookup_umls.cache_clear = _LOOKUP_CACHE.clear
|