File size: 3,208 Bytes
f6de761
 
1fdaf04
 
 
 
 
f6de761
 
1fdaf04
55cf8ec
 
1fdaf04
 
f6de761
 
8d292e0
1fdaf04
 
f6de761
1fdaf04
 
f6de761
1fdaf04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d292e0
 
 
1fdaf04
 
 
 
 
 
8d292e0
 
 
55cf8ec
8d292e0
 
1fdaf04
 
8d292e0
 
 
 
1fdaf04
 
8d292e0
1fdaf04
 
 
 
 
 
 
 
 
 
 
8d292e0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# mcp/umls.py
"""
Biomedical keyword/concept extractor for UMLS lookup.

- Uses SciSpaCy if available (best for biomedical text).
- Falls back to spaCy 'en_core_web_sm' (less accurate, general English).
- Final fallback: regex keyword extraction.
"""

import os, httpx, asyncio
from functools import lru_cache

# UMLS API config
UMLS_KEY    = os.getenv("UMLS_KEY")
_AUTH_URL   = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
_SEARCH_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"

# ------- Robust concept extractor ------------------------------------------
# Model names tried for each extraction tier, in priority order.
_BIOMEDICAL_MODELS = ("en_ner_bionlp13cg_md", "en_core_sci_sm")
_GENERAL_MODELS = ("en_core_web_sm",)


@lru_cache(maxsize=None)
def _load_first_pipeline(model_names: tuple[str, ...], needs_scispacy: bool):
    """
    Return the first spaCy pipeline in *model_names* that loads, or None.

    The result (including a None "nothing available" outcome) is memoized so
    the models are read from disk at most once per process instead of on
    every extraction call.
    """
    try:
        if needs_scispacy:
            # Imported for its presence only; the name itself is not used.
            import scispacy  # noqa: F401
        import spacy
    except Exception:
        return None
    for model_name in model_names:
        try:
            return spacy.load(model_name)
        except Exception:
            # Model missing or incompatible: try the next candidate.
            continue
    return None


def _unique_min_length(candidates, min_length: int) -> list[str]:
    """De-duplicate *candidates* in first-seen order, dropping short strings."""
    return list(dict.fromkeys(c for c in candidates if len(c) >= min_length))


def extract_umls_concepts(text: str, min_length: int = 3) -> list[str]:
    """
    Extract biomedical concepts (for UMLS) from text.
    Priority: SciSpaCy -> spaCy -> regex.

    Args:
        text: Free text to scan. Empty/None input yields an empty list.
        min_length: Minimum number of characters (after stripping) a
            concept must have to be kept.

    Returns:
        Unique concept strings in order of first appearance. A tier's
        result is used only if it produced at least one entity; otherwise
        the next tier is tried.
    """
    if not text:
        return []

    tiers = ((_BIOMEDICAL_MODELS, True), (_GENERAL_MODELS, False))
    for model_names, needs_scispacy in tiers:
        nlp = _load_first_pipeline(model_names, needs_scispacy)
        if nlp is None:
            continue
        try:
            entities = _unique_min_length(
                (ent.text.strip() for ent in nlp(text).ents), min_length
            )
        except Exception:
            # Deliberate best-effort: a failing NER tier must not break the
            # caller, so fall through to the next tier.
            continue
        if entities:
            return entities

    # Final fallback: Regex keywords
    import re
    return _unique_min_length(re.findall(r"\b[a-zA-Z0-9\-]+\b", text), min_length)

# ------- UMLS API lookup (returns None-filled results on missing/invalid key) ----
async def _get_ticket() -> str | None:
    """
    Request a UMLS service ticket using the configured API key.

    Two POSTs are made: the first sends UMLS_KEY to _AUTH_URL, the second
    goes to the URL found in that response's action="..." attribute.

    Returns:
        The ticket text, or None when UMLS_KEY is unset, either request
        fails (network error, non-2xx status, unexpected response body),
        or the ticket body is empty.
    """
    if not UMLS_KEY:
        return None
    try:
        async with httpx.AsyncClient(timeout=10) as c:
            tgt = await c.post(_AUTH_URL, data={"apikey": UMLS_KEY})
            tgt.raise_for_status()
            # The response body is expected to contain action="<url>"; a
            # missing attribute raises IndexError, handled below.
            action = tgt.text.split('action="')[1].split('"')[0]
            st = await c.post(action, data={"service": "http://umlsks.nlm.nih.gov"})
            # Without this check an HTTP error page would be returned and
            # later sent to the search endpoint as if it were a ticket.
            st.raise_for_status()
            return st.text.strip() or None
    except Exception:
        # Deliberate best-effort: callers treat None as "no ticket".
        return None

# Result cache for lookup_umls. functools.lru_cache cannot be used on an
# ``async def``: it would store the coroutine object, and awaiting the same
# coroutine a second time raises RuntimeError.
_UMLS_CACHE_MAX = 512
_UMLS_CACHE: dict[str, dict] = {}


async def lookup_umls(term: str) -> dict:
    """
    Return {term,cui,name,definition}.
    If auth/quota fails → returns all keys as None (safe for UI).

    Args:
        term: Search string sent to the UMLS search endpoint.

    Returns:
        A new dict with keys "term", "cui", "name" and "definition".
        "definition" carries the hit's ``rootSource`` field. All values
        except "term" are None when no ticket could be obtained, the request
        fails, or the search has no hit.

    Completed lookups are cached (up to 512 terms, least recently used
    evicted first). Failed lookups are not cached, so they are retried on
    the next call.
    """
    cached = _UMLS_CACHE.pop(term, None)
    if cached is not None:
        # Re-insert so this term becomes the most recently used entry.
        _UMLS_CACHE[term] = cached
        return dict(cached)

    ticket = await _get_ticket()
    if not ticket:
        return {"term": term, "cui": None, "name": None, "definition": None}

    params = {"string": term, "ticket": ticket, "pageSize": 1}
    try:
        async with httpx.AsyncClient(timeout=8) as c:
            r = await c.get(_SEARCH_URL, params=params)
            r.raise_for_status()
            items = r.json().get("result", {}).get("results", [])
            hit = items[0] if items else {}
            # NOTE(review): the UMLS search API is understood to report
            # "no results" as a placeholder item whose ui is "NONE";
            # treat that as no hit rather than a real CUI.
            if hit.get("ui") == "NONE":
                hit = {}
            record = {
                "term": term,
                "cui": hit.get("ui"),
                "name": hit.get("name"),
                "definition": hit.get("rootSource"),
            }
    except Exception:
        return {"term": term, "cui": None, "name": None, "definition": None}

    if len(_UMLS_CACHE) >= _UMLS_CACHE_MAX:
        # Dicts keep insertion order, so the first key is the oldest entry.
        _UMLS_CACHE.pop(next(iter(_UMLS_CACHE)))
    _UMLS_CACHE[term] = record
    # Hand out a copy so callers cannot mutate the cached record.
    return dict(record)


# Kept for callers that relied on the attribute provided by lru_cache.
lookup_umls.cache_clear = _UMLS_CACHE.clear