# Medapp / pubmed_utils.py
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from transformers import pipeline

from config import PUBMED_EMAIL, CHUNK_SIZE

# Summarization pipeline, loaded once at import time.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
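
# Optional pattern (not in the original file): bart-large-cnn weighs in at
# roughly 1.6 GB, so constructing the pipeline at import time makes startup
# slow. A lazy getter would defer model loading until the first call:
#
#     _summarizer = None
#     def get_summarizer():
#         global _summarizer
#         if _summarizer is None:
#             _summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
#         return _summarizer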

def search_pubmed(query, max_results=5):
    """
    Search PubMed via the ESearch E-utility and return a list of matching PMIDs.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        "tool": "MedicalAI",
        "email": PUBMED_EMAIL,
    }
    # Time out rather than hang indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    return response.json().get("esearchresult", {}).get("idlist", [])
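
# Usage sketch (hypothetical query; live results vary):
#
#     pmids = search_pubmed("semaglutide cardiovascular outcomes", max_results=3)
#     # -> a list of PMID strings, e.g. ["12345678", ...]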

def fetch_abstract(pmid):
    """
    Fetch the plain-text abstract for a single PubMed ID via the EFetch E-utility.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "text",
        "rettype": "abstract",
        "tool": "MedicalAI",
        "email": PUBMED_EMAIL,
    }
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    return response.text.strip()

def fetch_pubmed_abstracts(pmids):
    """
    Fetch multiple PubMed abstracts concurrently.

    Returns a dict mapping each PMID to its abstract text, or to an error
    message if the fetch failed.
    """
    results = {}
    # NCBI E-utilities allows roughly 3 requests/second without an API key,
    # so keep the worker pool small to stay under that limit.
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_pmid = {executor.submit(fetch_abstract, pmid): pmid for pmid in pmids}
        for future in as_completed(future_to_pmid):
            pmid = future_to_pmid[future]
            try:
                results[pmid] = future.result()
            except Exception as e:
                results[pmid] = f"Error fetching PMID {pmid}: {e}"
    return results

def summarize_text(text, chunk_size=CHUNK_SIZE):
    """
    Summarize long text by splitting it into word-count-limited chunks,
    summarizing each chunk, and joining the partial summaries.
    """
    sentences = text.split(". ")
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        tokens = len(sentence.split())
        # Flush the current chunk before it exceeds the word budget; the guard
        # on current_chunk avoids appending an empty chunk when a single
        # sentence is longer than chunk_size.
        if current_chunk and current_length + tokens > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += tokens
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    # truncation=True keeps over-long chunks within the model's input limit.
    summaries = [
        summarizer(chunk, max_length=100, min_length=30, truncation=True)[0]["summary_text"]
        for chunk in chunks
        if chunk.strip()
    ]
    return " ".join(summaries)