import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline

from config import MY_PUBMED_EMAIL

# Download the sentence tokenizer data that sent_tokenize relies on.
nltk.download('punkt')

# Summarization pipeline for PubMed abstracts
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn"
)

def search_pubmed(query, max_results=3):
    """
    Searches PubMed via ESearch. Returns a list of PMIDs.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    return data.get("esearchresult", {}).get("idlist", [])

def fetch_one_abstract(pmid):
    """
    Fetches a single abstract for the given PMID via EFetch.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "retmode": "text",
        "rettype": "abstract",
        "id": pmid,
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params, timeout=30)
    resp.raise_for_status()
    raw_text = resp.text.strip() or "No abstract text found."
    return (pmid, raw_text)

def fetch_pubmed_abstracts(pmids):
    """
    Fetches multiple abstracts in parallel. Returns {pmid: abstract_text}.
    """
    if not pmids:
        return {}

    results_map = {}
    # Cap the pool at 5 workers so we do not hammer the NCBI endpoints.
    with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
        future_to_pmid = {executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids}
        for future in as_completed(future_to_pmid):
            pmid = future_to_pmid[future]
            try:
                pmid_result, text = future.result()
                results_map[pmid_result] = text
            except Exception as e:
                results_map[pmid] = f"Error: {str(e)}"
    return results_map

def chunk_and_summarize(abstract_text, chunk_size=512):
    """
    Splits a long abstract into sentence-based chunks, summarizes each chunk,
    then concatenates the partial summaries.
    """
    sentences = sent_tokenize(abstract_text)

    # Group sentences into chunks of roughly chunk_size words.
    chunks = []
    current_chunk = []
    current_length = 0
    for sent in sentences:
        # Approximate token count by whitespace-split words.
        tokens_in_sent = len(sent.split())
        if current_length + tokens_in_sent > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sent)
        current_length += tokens_in_sent
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    # Summarize each chunk independently, then stitch the pieces together.
    summarized_pieces = []
    for c in chunks:
        summary_out = summarizer(
            c,
            max_length=100,
            min_length=30,
            do_sample=False
        )
        summarized_pieces.append(summary_out[0]['summary_text'])

    return " ".join(summarized_pieces).strip()