import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

from transformers import pipeline

from config import MY_PUBMED_EMAIL

# Summarization pipeline for PubMed abstracts
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn"
)


def search_pubmed(query, max_results=3):
    """
    Searches PubMed via ESearch. Returns a list of PMIDs.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params)
    resp.raise_for_status()
    data = resp.json()
    return data.get("esearchresult", {}).get("idlist", [])


def fetch_one_abstract(pmid):
    """
    Fetches a single abstract for the given PMID.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "retmode": "text",
        "rettype": "abstract",
        "id": pmid,
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params)
    resp.raise_for_status()
    raw_text = resp.text.strip() or "No abstract text found."
    return (pmid, raw_text)


def fetch_pubmed_abstracts(pmids):
    """
    Fetches multiple abstracts in parallel. Returns {pmid: abstract_text}.
    """
    if not pmids:
        return {}

    results_map = {}
    with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
        future_to_pmid = {
            executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids
        }
        for future in as_completed(future_to_pmid):
            pmid = future_to_pmid[future]
            try:
                pmid_result, text = future.result()
                results_map[pmid_result] = text
            except Exception as e:
                results_map[pmid] = f"Error: {str(e)}"
    return results_map


def chunk_and_summarize(abstract_text, chunk_size=512):
    """
    Splits a large abstract into sentence-based chunks (roughly chunk_size
    words each), summarizes each chunk, then concatenates the summaries.
    """
    sentences = sent_tokenize(abstract_text)

    chunks = []
    current_chunk = []
    current_length = 0
    for sent in sentences:
        tokens_in_sent = len(sent.split())
        # Start a new chunk once the current one would exceed chunk_size words.
        if current_chunk and current_length + tokens_in_sent > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sent)
        current_length += tokens_in_sent
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    summarized_pieces = []
    for c in chunks:
        summary_out = summarizer(
            c,
            max_length=100,
            min_length=30,
            do_sample=False
        )
        summarized_pieces.append(summary_out[0]['summary_text'])

    return " ".join(summarized_pieces).strip()
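

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example tying the helpers together: search PubMed for a query,
# fetch the matching abstracts in parallel, and summarize each one.
# Assumes config.MY_PUBMED_EMAIL is set and network access is available;
# the query string below is a hypothetical example.
if __name__ == "__main__":
    query = "semaglutide cardiovascular outcomes"
    pmids = search_pubmed(query, max_results=3)
    abstracts = fetch_pubmed_abstracts(pmids)
    for pmid, abstract_text in abstracts.items():
        summary = chunk_and_summarize(abstract_text)
        print(f"PMID {pmid}:\n{summary}\n")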