mgbam committed on
Commit
5951d5e
·
verified ·
1 Parent(s): 3f9a9ea

Update mcp/umls.py

Browse files
Files changed (1) hide show
  1. mcp/umls.py +36 -146
mcp/umls.py CHANGED
@@ -1,152 +1,42 @@
1
- import os
2
- import re
3
- import httpx
4
- import asyncio
5
  from functools import lru_cache
6
- from pathlib import Path
7
- from typing import List, Optional, Dict, Any
8
 
9
- # ---------------------------------------------------------------------------
10
- # Configuration
11
- # ---------------------------------------------------------------------------
12
  UMLS_API_KEY = os.getenv("UMLS_KEY")
13
- UMLS_AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
14
- UMLS_SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
15
-
16
- # ---------------------------------------------------------------------------
17
- # Named types
18
- # ---------------------------------------------------------------------------
19
- class UMLSResult(Dict[str, Optional[str]]):
20
- """
21
- Represents a single UMLS lookup result.
22
- Keys: term, cui, name, definition
23
- """
24
- pass
25
-
26
- # ---------------------------------------------------------------------------
27
- # NLP model loading with caching
28
- # ---------------------------------------------------------------------------
29
- @lru_cache(maxsize=None)
30
- def _load_spacy_model(model_name: str):
31
- import spacy
32
- return spacy.load(model_name)
33
-
34
- @lru_cache(maxsize=None)
35
- def _load_scispacy_model():
36
- # Prefer the BioNLP model; fall back to the smaller sci model
37
- try:
38
- return _load_spacy_model("en_ner_bionlp13cg_md")
39
- except Exception:
40
- return _load_spacy_model("en_core_sci_sm")
41
-
42
- @lru_cache(maxsize=None)
43
- def _load_general_spacy():
44
- return _load_spacy_model("en_core_web_sm")
45
-
46
- # ---------------------------------------------------------------------------
47
- # Concept extraction utilities
48
- # ---------------------------------------------------------------------------
49
- def _extract_entities(nlp, text: str, min_length: int) -> List[str]:
50
- """
51
- Run a spaCy nlp pipeline over text and return unique entity texts
52
- of at least min_length.
53
- """
54
- doc = nlp(text)
55
- ents = {ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_length}
56
- return list(ents)
57
-
58
-
59
- def _regex_fallback(text: str, min_length: int) -> List[str]:
60
- """
61
- Simple regex-based token extraction for fallback.
62
- """
63
- tokens = re.findall(r"\b[a-zA-Z0-9\-]+\b", text)
64
- return list({t for t in tokens if len(t) >= min_length})
65
-
66
-
67
- def extract_umls_concepts(text: str, min_length: int = 3) -> List[str]:
68
- """
69
- Extract biomedical concepts from text in priority order:
70
- 1. SciSpaCy (en_ner_bionlp13cg_md or en_core_sci_sm)
71
- 2. spaCy general NER (en_core_web_sm)
72
- 3. Regex tokens
73
-
74
- Guaranteed to return a list of unique strings.
75
- """
76
- # 1) SciSpaCy pipeline
77
- try:
78
- scispacy_nlp = _load_scispacy_model()
79
- entities = _extract_entities(scispacy_nlp, text, min_length)
80
- if entities:
81
- return entities
82
- except ImportError:
83
- # SciSpaCy not installed
84
- pass
85
- except Exception:
86
- # Unexpected failure in scispacy
87
- pass
88
-
89
- # 2) General spaCy pipeline
90
- try:
91
- general_nlp = _load_general_spacy()
92
- entities = _extract_entities(general_nlp, text, min_length)
93
- if entities:
94
- return entities
95
- except Exception:
96
- pass
97
-
98
- # 3) Regex fallback
99
- return _regex_fallback(text, min_length)
100
-
101
- # ---------------------------------------------------------------------------
102
- # UMLS API integration
103
- # ---------------------------------------------------------------------------
104
- async def _get_umls_ticket() -> Optional[str]:
105
- """
106
- Obtain a UMLS service ticket for subsequent queries.
107
- Returns None if API key is missing or authentication fails.
108
- """
109
- if not UMLS_API_KEY:
110
- return None
111
-
112
- try:
113
- async with httpx.AsyncClient(timeout=10) as client:
114
- response = await client.post(
115
- UMLS_AUTH_URL, data={"apikey": UMLS_API_KEY}
116
- )
117
- response.raise_for_status()
118
- tgt_url = response.text.split('action="')[1].split('"')[0]
119
- service_resp = await client.post(
120
- tgt_url, data={"service": "http://umlsks.nlm.nih.gov"}
121
- )
122
- return service_resp.text
123
- except Exception:
124
- return None
125
-
126
 
127
  @lru_cache(maxsize=512)
128
- async def lookup_umls(term: str) -> UMLSResult:
129
- """
130
- Look up a term in the UMLS API.
131
- Returns a dict containing the original term, its CUI, preferred name, and definition.
132
- On failure or quota issues, returns all values except 'term' as None.
133
- """
134
- ticket = await _get_umls_ticket()
135
- if not ticket:
136
- return {"term": term, "cui": None, "name": None, "definition": None}
137
-
138
  params = {"string": term, "ticket": ticket, "pageSize": 1}
139
- try:
140
- async with httpx.AsyncClient(timeout=8) as client:
141
- resp = await client.get(UMLS_SEARCH_URL, params=params)
142
- resp.raise_for_status()
143
- results = resp.json().get("result", {}).get("results", [])
144
- first = results[0] if results else {}
145
- return {
146
- "term": term,
147
- "cui": first.get("ui"),
148
- "name": first.get("name"),
149
- "definition": first.get("definition") or first.get("rootSource"),
150
- }
151
- except Exception:
152
- return {"term": term, "cui": None, "name": None, "definition": None}
 
 
 
 
 
 
 
1
+ # mcp/umls.py
2
+ import os, httpx
 
 
3
  from functools import lru_cache
 
 
4
 
 
 
 
5
  UMLS_API_KEY = os.getenv("UMLS_KEY")
6
+ AUTH_URL = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
7
+ SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
8
+ CONTENT_URL = "https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{cui}"
9
+
10
async def _get_ticket() -> str:
    """Obtain a UMLS service ticket via the two-step API-key handshake.

    First posts the API key to ``AUTH_URL`` and extracts the URL found in
    the ``action="..."`` attribute of the response body. Then posts the
    service name to that URL and returns the body of the second response.

    Returns:
        The text of the second response, which callers pass as the
        ``ticket`` query parameter.

    Raises:
        RuntimeError: If ``UMLS_KEY`` is not configured, or if the first
            response contains no ``action="..."`` URL to post to.
        httpx.HTTPStatusError: If either request returns an error status.
    """
    if not UMLS_API_KEY:
        # Fail fast with an actionable message rather than sending a
        # request that carries no usable key.
        raise RuntimeError("UMLS_KEY environment variable is not set")

    async with httpx.AsyncClient(timeout=10) as c:
        r1 = await c.post(AUTH_URL, data={"apikey": UMLS_API_KEY})
        r1.raise_for_status()

        # The ticket-granting URL is embedded as action="<url>" in the body.
        # partition() never raises, so a malformed body is reported
        # explicitly instead of surfacing as a bare IndexError.
        _, marker, rest = r1.text.partition('action="')
        tgt = rest.partition('"')[0]
        if not marker or not tgt:
            raise RuntimeError(
                "UMLS auth response did not contain a ticket-granting URL"
            )

        r2 = await c.post(tgt, data={"service": "http://umlsks.nlm.nih.gov"})
        r2.raise_for_status()
        return r2.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
# Bounded in-memory cache of completed lookups, keyed by search term.
# functools.lru_cache must not wrap an ``async def``: it would store the
# coroutine object, and awaiting the cached coroutine a second time raises
# "RuntimeError: cannot reuse already awaited coroutine".
# Dicts preserve insertion order, so re-inserting on every hit keeps the
# least recently used entry at the front for eviction.
_LOOKUP_CACHE: dict = {}
_LOOKUP_CACHE_MAX = 512


async def lookup_umls(term: str) -> dict:
    """Look up ``term`` in UMLS and return its top search hit.

    Searches ``SEARCH_URL`` for the term (page size 1), then fetches the
    matching concept from ``CONTENT_URL`` to collect its semantic type
    names and first definition. Completed results are cached per term
    (up to 512 entries, least recently used evicted first). Lookups that
    raise are not cached, so a transient failure can be retried.

    Args:
        term: The search string sent to the UMLS search endpoint.

    Returns:
        ``{"term": term}`` when the search yields no usable hit; otherwise
        a dict with keys ``term``, ``cui``, ``name``, ``definition`` (empty
        string when none is available) and ``types`` (list of semantic type
        names). The returned dict is the cached object and should be
        treated as read-only.

    Raises:
        httpx.HTTPStatusError: If the search or content request returns an
            error status.
        Exception: Whatever ``_get_ticket`` raises when authentication
            fails.
    """
    cached = _LOOKUP_CACHE.pop(term, None)
    if cached is not None:
        _LOOKUP_CACHE[term] = cached  # re-insert as most recently used
        return cached

    ticket = await _get_ticket()
    params = {"string": term, "ticket": ticket, "pageSize": 1}
    async with httpx.AsyncClient(timeout=10) as c:
        r = await c.get(SEARCH_URL, params=params)
        r.raise_for_status()
        items = r.json().get("result", {}).get("results", [])

        cui = name = None
        if items:
            itm = items[0]
            cui, name = itm.get("ui"), itm.get("name")

        # NOTE(review): some UMLS API versions appear to signal "no results"
        # with a placeholder hit whose ui is "NONE" - confirm. Either way,
        # a missing/placeholder CUI must not be used to build CONTENT_URL.
        if not cui or cui == "NONE":
            result = {"term": term}
        else:
            # NOTE(review): the same ticket is reused for this second
            # request; confirm the service accepts a ticket more than once.
            r2 = await c.get(
                CONTENT_URL.format(cui=cui), params={"ticket": ticket}
            )
            r2.raise_for_status()
            entry = r2.json().get("result", {})

            types = [
                t["name"]
                for t in entry.get("semanticTypes") or []
                if isinstance(t, dict) and "name" in t
            ]

            # Indexing [0] unconditionally crashed on an empty list and on
            # non-list values, so only read an inline definition when the
            # payload really is a non-empty list of dicts.
            # NOTE(review): the CUI endpoint may return "definitions" as a
            # URL string rather than an inline list - confirm; if so, a
            # follow-up request is needed to populate this field.
            definition = ""
            definitions = entry.get("definitions")
            if (
                isinstance(definitions, list)
                and definitions
                and isinstance(definitions[0], dict)
            ):
                definition = definitions[0].get("value", "")

            result = {
                "term": term,
                "cui": cui,
                "name": name,
                "definition": definition,
                "types": types,
            }

    _LOOKUP_CACHE[term] = result
    if len(_LOOKUP_CACHE) > _LOOKUP_CACHE_MAX:
        # Evict the least recently used entry (first key in insertion order).
        del _LOOKUP_CACHE[next(iter(_LOOKUP_CACHE))]
    return result