import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline

from config import MY_PUBMED_EMAIL

# Download the sentence-tokenizer model once at import time.
nltk.download('punkt', quiet=True)

# Summarization pipeline for PubMed abstracts
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn"
)

def search_pubmed(query, max_results=3):
    """
    Searches PubMed via ESearch. Returns list of PMIDs.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params, timeout=10)
    resp.raise_for_status()
    data = resp.json()
    return data.get("esearchresult", {}).get("idlist", [])

def fetch_one_abstract(pmid):
    """
    Fetches a single abstract for the given PMID.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "retmode": "text",
        "rettype": "abstract",
        "id": pmid,
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params, timeout=10)
    resp.raise_for_status()
    raw_text = resp.text.strip() or "No abstract text found."
    return (pmid, raw_text)

def fetch_pubmed_abstracts(pmids):
    """
    Parallel fetching of multiple abstracts.
    """
    if not pmids:
        return {}
    results_map = {}
    with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
        future_to_pmid = {executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids}
        for future in as_completed(future_to_pmid):
            pmid = future_to_pmid[future]
            try:
                pmid_result, text = future.result()
                results_map[pmid_result] = text
            except Exception as e:
                results_map[pmid] = f"Error: {str(e)}"
    return results_map

def chunk_and_summarize(abstract_text, chunk_size=512):
    """
    Splits large abstracts into sentence-based chunks, summarizes each chunk,
    then concatenates the summaries. chunk_size is measured in whitespace-
    separated words, a rough proxy for model tokens.
    """
    sentences = sent_tokenize(abstract_text)
    chunks = []

    current_chunk = []
    current_length = 0
    for sent in sentences:
        tokens_in_sent = len(sent.split())
        # Flush the current chunk before it exceeds the size budget, but never
        # emit an empty chunk (e.g. when a single sentence is longer than chunk_size).
        if current_length + tokens_in_sent > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sent)
        current_length += tokens_in_sent

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    summarized_pieces = []
    for c in chunks:
        # Greedy decoding (do_sample=False) keeps the summaries deterministic.
        summary_out = summarizer(
            c,
            max_length=100,
            min_length=30,
            do_sample=False
        )
        summarized_pieces.append(summary_out[0]['summary_text'])

    return " ".join(summarized_pieces).strip()