import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

from transformers import pipeline

# config.py must define MY_PUBMED_EMAIL, which is sent to the NCBI E-utilities
# as the contact email for this tool.
from config import MY_PUBMED_EMAIL

# Load the BART summarization pipeline once at import time so every call reuses the same model.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")


def search_pubmed(query, max_results=3):
    """
    Searches PubMed via ESearch and returns a list of PMIDs.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params)
    resp.raise_for_status()
    data = resp.json()
    return data.get("esearchresult", {}).get("idlist", [])


def fetch_one_abstract(pmid):
    """
    Fetches the abstract for a given PMID. Returns (pmid, text).
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "retmode": "text",
        "rettype": "abstract",
        "id": pmid,
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params)
    resp.raise_for_status()
    raw_text = resp.text.strip() or "No abstract text found."
    return (pmid, raw_text)


def fetch_pubmed_abstracts(pmids):
    """
    Parallel retrieval of multiple PMIDs.
    """
    if not pmids:
        return {}

    results_map = {}
    with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
        future_to_pmid = {executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids}
        for future in as_completed(future_to_pmid):
            pmid = future_to_pmid[future]
            try:
                pmid_result, text = future.result()
                results_map[pmid_result] = text
            except Exception as e:
                results_map[pmid] = f"Error: {str(e)}"
    return results_map


def chunk_and_summarize(abstract_text, chunk_size=512):
    """
    Chunk large abstracts by sentence, summarize each chunk, then combine.
    """
    sentences = sent_tokenize(abstract_text)
    chunks = []

    current_chunk = []
    current_length = 0
    for sent in sentences:
        tokens_in_sent = len(sent.split())
        # Start a new chunk once the word budget is exceeded; the guard on
        # current_chunk avoids emitting an empty chunk when a single sentence
        # is longer than chunk_size.
        if current_length + tokens_in_sent > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sent)
        current_length += tokens_in_sent

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    summarized_pieces = []
    for c in chunks:
        summary_out = summarizer(
            c, max_length=100, min_length=30, do_sample=False
        )
        summarized_pieces.append(summary_out[0]['summary_text'])

    return " ".join(summarized_pieces).strip()
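

# A minimal usage sketch, not part of the original module: the query string and
# result count below are illustrative assumptions, included only to show how the
# three helpers fit together.
if __name__ == "__main__":
    pmids = search_pubmed("glioblastoma immunotherapy", max_results=3)
    abstracts = fetch_pubmed_abstracts(pmids)
    for pmid, abstract in abstracts.items():
        print(f"PMID {pmid}")
        print(chunk_and_summarize(abstract))
        print("-" * 60)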