#!/usr/bin/env python3
# mcp/nlp.py
"""MedGenesis – spaCy helper for lightweight keyword extraction.
Features
~~~~~~~~
* Lazy‑loads **`en_core_web_sm`** at first call; cached thereafter.
* If model missing, raises actionable RuntimeError — Dockerfile must
install via `python -m spacy download en_core_web_sm` (already in Dockerfile).
* `extract_keywords` returns **unique named‑entity strings** (>2 chars)
stripped of whitespace, preserving original casing.
* Adds fallback to simple noun‑chunk extraction when no entities found –
helps very short abstracts.
"""
from __future__ import annotations

from functools import lru_cache
from typing import List

import spacy

# ---------------------------------------------------------------------
# Model loader (cached)
# ---------------------------------------------------------------------
@lru_cache(maxsize=1)
def _load_model():
    """Load and cache the spaCy pipeline; raise a helpful error if absent."""
    try:
        return spacy.load("en_core_web_sm")
    except OSError as e:
        raise RuntimeError(
            "spaCy model 'en_core_web_sm' is not installed. Add\n"
            "    RUN python -m spacy download en_core_web_sm\n"
            "to your Dockerfile build stage."
        ) from e
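
# Illustrative note (not in the original module): because `_load_model` is
# wrapped in `functools.lru_cache`, tests that stub out spaCy can reset the
# cached pipeline with the standard lru_cache API:
#
#     _load_model.cache_clear()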
# ---------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------
def extract_keywords(text: str, *, min_len: int = 3) -> List[str]:
    """Return de-duplicated entity keywords (noun-chunk fallback)."""
    nlp = _load_model()
    doc = nlp(text)
    ents = {ent.text.strip() for ent in doc.ents if len(ent.text.strip()) >= min_len}
    if ents:
        return list(ents)
    # Fallback: noun chunks when spaCy finds no entities (common for very short texts)
    chunks = {chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) >= min_len}
    return list(chunks)
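
# ---------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): a minimal
# smoke test, assuming the `en_core_web_sm` model is installed. The sample
# sentence is invented for demonstration only.
# ---------------------------------------------------------------------
if __name__ == "__main__":
    sample = "Metformin reduced HbA1c levels in patients with type 2 diabetes."
    print(extract_keywords(sample))
    # Expected: a de-duplicated list of entity strings (e.g. drug or
    # condition mentions); falls back to noun chunks if no entities are found.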