# Medapp / pubmed_utils.py
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline

from config import MY_PUBMED_EMAIL

# Ensure the sentence tokenizer model is available before sent_tokenize is called.
nltk.download('punkt')

# Build a summarization pipeline once at module load (the model weights are cached by transformers).
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn",
)


def search_pubmed(query, max_results=3):
    """
    Searches PubMed via the ESearch endpoint and returns a list of PMIDs for the query.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    return data.get("esearchresult", {}).get("idlist", [])


def fetch_one_abstract(pmid):
    """
    Fetches the abstract for a single PMID via the EFetch endpoint. Returns (pmid, text).
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "retmode": "text",
        "rettype": "abstract",
        "id": pmid,
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params, timeout=30)
    resp.raise_for_status()
    raw_text = resp.text.strip() or "No abstract text found."
    return (pmid, raw_text)


def fetch_pubmed_abstracts(pmids):
    """
    Retrieves abstracts for multiple PMIDs in parallel.
    Returns a dict mapping each PMID to its abstract text (or an error message).
    """
    if not pmids:
        return {}
    results_map = {}
    # Cap the pool at 5 workers to stay well within NCBI's request-rate limits.
    with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
        future_to_pmid = {executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids}
        for future in as_completed(future_to_pmid):
            pmid = future_to_pmid[future]
            try:
                pmid_result, text = future.result()
                results_map[pmid_result] = text
            except Exception as e:
                results_map[pmid] = f"Error: {e}"
    return results_map


def chunk_and_summarize(abstract_text, chunk_size=512):
    """
    Splits a long abstract into sentence-based chunks of roughly `chunk_size`
    words, summarizes each chunk, and joins the partial summaries.
    """
    sentences = sent_tokenize(abstract_text)

    # Group sentences into chunks, using whitespace-split word count as a rough
    # proxy for model tokens so each chunk stays within the summarizer's input limit.
    chunks = []
    current_chunk = []
    current_length = 0
    for sent in sentences:
        tokens_in_sent = len(sent.split())
        # Flush the current chunk before it overflows; the non-empty check avoids
        # emitting an empty chunk when a single sentence exceeds chunk_size.
        if current_chunk and current_length + tokens_in_sent > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sent)
        current_length += tokens_in_sent
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    # Summarize each chunk separately, then concatenate the results.
    summarized_pieces = []
    for c in chunks:
        summary_out = summarizer(c, max_length=100, min_length=30, do_sample=False)
        summarized_pieces.append(summary_out[0]['summary_text'])
    return " ".join(summarized_pieces).strip()