import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import pipeline

# config.py is expected to provide PUBMED_EMAIL (the contact address sent to
# NCBI E-utilities) and CHUNK_SIZE (approximate number of words per summarization chunk).
from config import PUBMED_EMAIL, CHUNK_SIZE

# Load the summarization model once at import time; the weights are downloaded
# on first use and served from the local cache afterwards.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


def search_pubmed(query, max_results=5):
    """
    Search PubMed for PMIDs matching the query.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        "tool": "MedicalAI",
        "email": PUBMED_EMAIL,
    }
    # A timeout keeps a stalled E-utilities request from hanging the caller.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    return response.json().get("esearchresult", {}).get("idlist", [])


def fetch_abstract(pmid):
    """
    Fetch the abstract for a given PubMed ID.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "text",
        "rettype": "abstract",
        "tool": "MedicalAI",
        "email": PUBMED_EMAIL,
    }
    # A timeout keeps a stalled fetch from blocking its worker thread.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    return response.text.strip()


def fetch_pubmed_abstracts(pmids):
    """
    Fetch multiple PubMed abstracts concurrently.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_pmid = {executor.submit(fetch_abstract, pmid): pmid for pmid in pmids}
        for future in as_completed(future_to_pmid):
            pmid = future_to_pmid[future]
            try:
                results[pmid] = future.result()
            except Exception as e:
                results[pmid] = f"Error fetching PMID {pmid}: {str(e)}"
    return results


def summarize_text(text, chunk_size=CHUNK_SIZE):
    """
    Summarize long text by splitting it into word-limited chunks and
    summarizing each chunk separately.
    """
    sentences = text.split(". ")
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        tokens = len(sentence.split())
        # Start a new chunk once the running word count would exceed chunk_size,
        # but never emit an empty chunk if a single sentence is oversized.
        if current_length + tokens > chunk_size and current_chunk:
            chunks.append(". ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += tokens

    if current_chunk:
        chunks.append(". ".join(current_chunk))

    # truncation=True guards against chunks that still exceed the model's input
    # limit, since whitespace word counts only approximate token counts.
    summaries = [
        summarizer(chunk, max_length=100, min_length=30, truncation=True)[0]["summary_text"]
        for chunk in chunks
    ]
    return " ".join(summaries)
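

# A minimal usage sketch tying the pieces together: search, concurrent fetch,
# then summarization. The query string and result count below are illustrative
# assumptions, not values from the original module; a real run also requires
# config.py to define PUBMED_EMAIL and CHUNK_SIZE, plus network access to NCBI.
if __name__ == "__main__":
    example_query = "aspirin cardiovascular prevention"  # hypothetical query
    pmids = search_pubmed(example_query, max_results=3)
    abstracts = fetch_pubmed_abstracts(pmids)
    for pmid, abstract in abstracts.items():
        print(f"PMID {pmid}:")
        print(summarize_text(abstract))
        print("-" * 40)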