|
|
|
|
|
|
|
"""MedGenesis – spaCy helper for lightweight keyword extraction. |
|
|
|
Features |
|
~~~~~~~~ |
|
* Lazy‑loads **`en_core_web_sm`** at first call; cached thereafter. |
|
* If model missing, raises actionable RuntimeError — Dockerfile must |
|
install via `python -m spacy download en_core_web_sm` (already in Dockerfile). |
|
* `extract_keywords` returns **unique named‑entity strings** (>2 chars) |
|
stripped of whitespace, preserving original casing. |
|
* Adds fallback to simple noun‑chunk extraction when no entities found – |
|
helps very short abstracts. |
|
""" |
|
from __future__ import annotations |
|
|
|
import spacy |
|
from functools import lru_cache |
|
from typing import List |
|
|
|
|
|
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
def _load_model():
    """Load and memoise the small English spaCy pipeline.

    The model is loaded on first call only; ``lru_cache`` keeps the single
    instance alive for the life of the process.

    Raises:
        RuntimeError: if ``en_core_web_sm`` has not been downloaded, with
            the exact Dockerfile line needed to fix the build.
    """
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError as err:
        # Re-raise with an actionable message instead of spaCy's generic one.
        message = (
            "spaCy model 'en_core_web_sm' is not installed. Add\n"
            "  RUN python -m spacy download en_core_web_sm\n"
            "to your Dockerfile build stage."
        )
        raise RuntimeError(message) from err
    return nlp
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_keywords(text: str, *, min_len: int = 3) -> List[str]:
    """Return de-duplicated keywords extracted from *text*.

    Named entities are preferred; when the document yields none (common
    for very short abstracts), noun chunks are used as a fallback.

    Args:
        text: Raw text to analyse.
        min_len: Minimum length (after stripping surrounding whitespace)
            for a keyword to be kept. Keyword-only; defaults to 3.

    Returns:
        Unique keyword strings in order of first appearance, original
        casing preserved.
    """
    nlp = _load_model()
    doc = nlp(text)

    # Dedupe with dict.fromkeys to keep first-seen order: the original
    # set-based version returned keywords in arbitrary (hash-seed dependent)
    # order, making output non-deterministic between runs.
    entities = dict.fromkeys(ent.text.strip() for ent in doc.ents)
    keywords = [kw for kw in entities if len(kw) >= min_len]
    if keywords:
        return keywords

    # Fallback: noun chunks help very short texts with no recognised entities.
    chunks = dict.fromkeys(chunk.text.strip() for chunk in doc.noun_chunks)
    return [kw for kw in chunks if len(kw) >= min_len]
|
|
|
|