# mcp/nlp.py
#!/usr/bin/env python3
"""MedGenesis – spaCy helper for lightweight keyword extraction.

Features
~~~~~~~~
* Lazy-loads **`en_core_web_sm`** at first call; the model is cached thereafter.
* If the model is missing, raises an actionable `RuntimeError`; the Dockerfile
  must install it via `python -m spacy download en_core_web_sm` (already part
  of the Dockerfile).
* `extract_keywords` returns **unique named-entity strings** (at least
  `min_len` characters, default 3) stripped of whitespace, preserving
  original casing.
* Falls back to simple noun-chunk extraction when no entities are found,
  which helps with very short abstracts.
"""
from __future__ import annotations

from functools import lru_cache
from typing import List

import spacy

# ---------------------------------------------------------------------
# Model loader (cached)
# ---------------------------------------------------------------------
@lru_cache(maxsize=1)
def _load_model() -> "spacy.language.Language":
    """Load `en_core_web_sm` once; `lru_cache` memoises the instance."""
    try:
        return spacy.load("en_core_web_sm")
    except OSError as e:  # spaCy raises OSError when the model package is absent
        raise RuntimeError(
            "spaCy model 'en_core_web_sm' is not installed. Add\n"
            "    RUN python -m spacy download en_core_web_sm\n"
            "to your Dockerfile build stage."
        ) from e

# ---------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------
def extract_keywords(text: str, *, min_len: int = 3) -> List[str]:
    """Return de-duplicated entity keywords, falling back to noun chunks."""
    nlp = _load_model()
    doc = nlp(text)

    # Primary pass: named entities of sufficient length.
    ents = {
        ent.text.strip()
        for ent in doc.ents
        if len(ent.text.strip()) >= min_len
    }
    if ents:
        return list(ents)

    # Fallback: noun chunks, since very short texts often yield no entities.
    chunks = {
        chunk.text.strip()
        for chunk in doc.noun_chunks
        if len(chunk.text.strip()) >= min_len
    }
    return list(chunks)
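
# ---------------------------------------------------------------------
# Usage sketch (illustrative)
# ---------------------------------------------------------------------
# A minimal smoke test, assuming `en_core_web_sm` is installed in the current
# environment. The sample abstract and the output shown below are illustrative
# only: entity spans vary with the spaCy model version, and ordering is not
# guaranteed because results are collected in a set.
if __name__ == "__main__":
    sample = (
        "BRCA1 mutations increase the risk of breast cancer. "
        "Olaparib, a PARP inhibitor, was approved by the FDA in 2014."
    )
    print(extract_keywords(sample))
    # e.g. ['BRCA1', 'Olaparib', 'PARP', 'FDA', '2014']  (order varies)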