Spaces:

mgbam
/

MCP_Res

Building on CPU Upgrade

App Files Files Community

mgbam committed 3 days ago

Commit

1fdaf04

verified ·

1 Parent(s): f6de761

Update mcp/umls.py

Browse files

Files changed (1) hide show

mcp/umls.py +67 -39

mcp/umls.py CHANGED Viewed

@@ -1,66 +1,94 @@
 # mcp/umls.py
 """
-Async UMLS lookup via NLM REST API.
-Graceful fallback: always returns a dict with all keys even on failure.
 """
-import os
-import httpx
 from functools import lru_cache
-UMLS_KEY    = os.getenv("UMLS_KEY")          # Should be set as a secret (Hugging Face/colab/etc)
 _AUTH_URL   = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
 _SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
-# ── Internal helper: get a single-use service ticket ──────────────
-async def _get_ticket() -> str | None:
     """
-    Return a single-use UMLS service ticket, or None if auth fails.
     """
     if not UMLS_KEY:
         return None
     try:
-        async with httpx.AsyncClient(timeout=10) as client:
-            # 1. Get ticket-granting ticket (TGT)
-            tgt_resp = await client.post(_AUTH_URL, data={"apikey": UMLS_KEY})
-            tgt_resp.raise_for_status()
-            tgt_url = tgt_resp.text.split('action="')[1].split('"')[0]
-            # 2. Exchange for single-use service ticket
-            st_resp = await client.post(tgt_url, data={"service": "http://umlsks.nlm.nih.gov"})
-            return st_resp.text.strip()
     except Exception:
         return None
-# ── Public API: async, returns safe dict ──────────────────────────
 @lru_cache(maxsize=512)
 async def lookup_umls(term: str) -> dict:
     """
-    Lookup a UMLS concept by text term.
-    Returns dict with {term, cui, name, definition}, always.
-    If quota/auth fails: all values None except term.
     """
     ticket = await _get_ticket()
     if not ticket:
         return {"term": term, "cui": None, "name": None, "definition": None}
-    params = {
-        "string": term,
-        "ticket": ticket,
-        "pageSize": 1,
-    }
     try:
-        async with httpx.AsyncClient(timeout=8) as client:
-            resp = await client.get(_SEARCH_URL, params=params)
-            resp.raise_for_status()
-            items = resp.json().get("result", {}).get("results", [])
-            if items:
-                hit = items[0]
-                return {
-                    "term": term,
-                    "cui": hit.get("ui"),
-                    "name": hit.get("name"),
-                    "definition": hit.get("rootSource"),
-                }
-            else:
-                return {"term": term, "cui": None, "name": None, "definition": None}
     except Exception:
         return {"term": term, "cui": None, "name": None, "definition": None}

 # mcp/umls.py
 """
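+Biomedical concept extraction and async UMLS lookup via the NLM REST API.
+
+Concept extraction tries, in order:
+- SciSpaCy models, if installed (best for biomedical text).
+- spaCy 'en_core_web_sm' (general English, less accurate).
+- Regex keyword extraction as the final fallback.
+
+UMLS lookups always return a dict with all keys, even on failure.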
 """
+import os, httpx, asyncio
 from functools import lru_cache
+# UMLS API config
+UMLS_KEY    = os.getenv("UMLS_KEY")
 _AUTH_URL   = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
 _SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"
+# ------- Robust concept extractor ------------------------------------------
@lru_cache(maxsize=None)
def _load_first_available_model(model_names: tuple[str, ...], needs_scispacy: bool = False):
    """Return the first spaCy pipeline in *model_names* that loads, else None.

    The outcome (including a ``None`` miss) is cached, so each model is read
    from disk at most once per process instead of on every extraction call.
    """
    try:
        if needs_scispacy:
            # Kept from the original, which imported scispacy before loading
            # the biomedical models; presumably required for them to load.
            import scispacy  # noqa: F401
        import spacy
    except Exception:
        # Best effort: a missing or broken install just disables this tier.
        return None
    for name in model_names:
        try:
            return spacy.load(name)
        except Exception:
            # Model not installed or incompatible; try the next candidate.
            continue
    return None


def _unique_concepts(candidates, min_length: int) -> list[str]:
    """Strip, length-filter and de-duplicate *candidates*, keeping first-seen order."""
    stripped = (candidate.strip() for candidate in candidates)
    # dict.fromkeys de-duplicates while preserving insertion order, so the
    # result is stable across runs (a plain set is not).
    return list(dict.fromkeys(s for s in stripped if len(s) >= min_length))


def extract_umls_concepts(text: str, min_length: int = 3) -> list[str]:
    """Extract candidate biomedical concepts (for UMLS lookup) from *text*.

    Extraction tiers are tried in priority order and the first tier that
    yields at least one concept wins:

    1. SciSpaCy: ``en_ner_bionlp13cg_md``, falling back to ``en_core_sci_sm``.
    2. spaCy general-English NER: ``en_core_web_sm``.
    3. Regex keyword extraction (runs of letters, digits and hyphens).

    Args:
        text: Free text to analyse. Empty or ``None`` input yields ``[]``.
        min_length: Minimum length (after stripping) for a concept to be kept.

    Returns:
        De-duplicated concepts in order of first appearance.
    """
    import re

    if not text:
        return []

    # (candidate model names, whether scispacy must be importable)
    model_priority = (
        (("en_ner_bionlp13cg_md", "en_core_sci_sm"), True),
        (("en_core_web_sm",), False),
    )
    for model_names, needs_scispacy in model_priority:
        nlp = _load_first_available_model(model_names, needs_scispacy)
        if nlp is None:
            continue
        try:
            concepts = _unique_concepts((ent.text for ent in nlp(text).ents), min_length)
        except Exception:
            # Best effort: a failing pipeline must not break extraction;
            # fall through to the next tier.
            continue
        if concepts:
            return concepts

    # Final fallback: plain keyword extraction.
    return _unique_concepts(re.findall(r"\b[a-zA-Z0-9\-]+\b", text), min_length)
+# ------- UMLS API lookup (same as before, safe for missing/invalid key) ----
async def _get_ticket() -> str | None:
    """Return a single-use UMLS service ticket, or None if one cannot be obtained.

    Two requests are made: the API key is posted to ``_AUTH_URL`` to obtain a
    ticket-granting response, then the ``action`` URL found in that response
    is posted to in exchange for the service ticket.

    Returns:
        The service ticket with surrounding whitespace removed, or ``None``
        when ``UMLS_KEY`` is unset, either request fails, the first response
        has no ``action="..."`` attribute, or the ticket body is empty.
    """
    if not UMLS_KEY:
        return None
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            tgt_resp = await client.post(_AUTH_URL, data={"apikey": UMLS_KEY})
            tgt_resp.raise_for_status()
            # The ticket-granting URL is taken from the first action="..."
            # attribute in the response body.
            action_url = tgt_resp.text.split('action="')[1].split('"')[0]
            st_resp = await client.post(
                action_url, data={"service": "http://umlsks.nlm.nih.gov"}
            )
            # Without this check an HTTP error body would be handed back to
            # the caller as if it were a ticket.
            st_resp.raise_for_status()
            ticket = st_resp.text.strip()
            return ticket or None
    except Exception:
        # Deliberate best effort: any auth/network/parse failure means "no ticket".
        return None
 @lru_cache(maxsize=512)
 async def lookup_umls(term: str) -> dict:
     """
+    Return {term,cui,name,definition}.
+    If auth/quota fails → returns all keys as None (safe for UI).
     """
     ticket = await _get_ticket()
     if not ticket:
         return {"term": term, "cui": None, "name": None, "definition": None}
+    params = {"string": term, "ticket": ticket, "pageSize": 1}
     try:
+        async with httpx.AsyncClient(timeout=8) as c:
+            r = await c.get(_SEARCH_URL, params=params)
+            r.raise_for_status()
+            items = r.json().get("result", {}).get("results", [])
+            hit = items[0] if items else {}
+            return {
+                "term": term,
+                "cui": hit.get("ui"),
+                "name": hit.get("name"),
+                "definition": hit.get("rootSource"),
+            }
     except Exception:
         return {"term": term, "cui": None, "name": None, "definition": None}