mgbam's picture
Update genesis/api_clients/pubmed_api.py
9fbaf8f verified
raw
history blame
3.36 kB
# genesis/api_clients/pubmed_api.py
import requests
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional
from datetime import datetime
# NCBI E-utilities ESearch endpoint; queried by search_pubmed to obtain PubMed IDs.
PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
# NCBI E-utilities EFetch endpoint; queried by fetch_pubmed_details for article XML.
PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
# Sent as the "api_key" request parameter by both functions below.
NCBI_API_KEY = None # Optional: Set in Hugging Face secrets for higher rate limits
def search_pubmed(query: str, max_results: int = 20, start_date: Optional[str] = None, end_date: Optional[str] = None) -> List[str]:
    """
    Search PubMed for a given query and return a list of PubMed IDs.

    Args:
        query: Search expression, sent as the ESearch ``term`` parameter.
        max_results: Maximum number of IDs to request (ESearch ``retmax``).
        start_date: Optional lower bound on publication date (YYYY/MM/DD).
        end_date: Optional upper bound on publication date (YYYY/MM/DD).
            If only one of the two bounds is given, the other side is left
            open-ended.

    Returns:
        The PubMed IDs from the response, as strings. Empty when the
        response carries no ``esearchresult.idlist``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not answer within the timeout.
    """
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
    }
    # Only send the key when one is configured.
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    if start_date or end_date:
        # ESearch requires mindate and maxdate to be supplied together, so a
        # one-sided range is completed with a bound wide enough to be a no-op
        # instead of silently dropping the caller's filter.
        params["mindate"] = start_date or "1000/01/01"
        params["maxdate"] = end_date or "3000/12/31"
        params["datetype"] = "pdat"
    # Without a timeout a stalled connection would block the caller forever.
    r = requests.get(PUBMED_SEARCH_URL, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    return data.get("esearchresult", {}).get("idlist", [])
def _element_text(element: Optional[ET.Element]) -> str:
    """Return all text inside *element*, including text nested in inline markup."""
    if element is None:
        return ""
    # itertext() also yields text inside child tags such as <i>/<sub>/<sup>,
    # which a plain ``.text`` would cut off at the first child element.
    return "".join(element.itertext()).strip()


def _format_pub_date(pub_date: Optional[ET.Element]) -> str:
    """Build a ``Year[-Month[-Day]]`` string from a PubDate element, or "Unknown"."""
    if pub_date is None:
        return "Unknown"
    year = (pub_date.findtext("Year", "") or "").strip()
    if not year:
        # Some records carry only a free-form MedlineDate (e.g. "2019 Jan-Feb").
        return (pub_date.findtext("MedlineDate", "") or "").strip() or "Unknown"
    parts = [year]
    month = (pub_date.findtext("Month", "") or "").strip()
    day = (pub_date.findtext("Day", "") or "").strip()
    if month:
        parts.append(month)
        if day:
            parts.append(day)
    # Joining only the parts that exist avoids dangling separators ("2020--").
    return "-".join(parts)


def _parse_pubmed_article(article: ET.Element) -> Dict:
    """Convert one PubmedArticle element into the result dictionary."""
    title = _element_text(article.find(".//ArticleTitle")) or "No title"

    abstract_parts = [_element_text(node) for node in article.findall(".//AbstractText")]
    abstract = " ".join(part for part in abstract_parts if part) or "No abstract"

    authors = []
    for a in article.findall(".//Author"):
        last = a.findtext("LastName", "") or ""
        first = a.findtext("ForeName", "") or ""
        if last or first:
            authors.append(f"{first} {last}".strip())
            continue
        # Group authors (consortia) have a CollectiveName instead of a personal name.
        collective = (a.findtext("CollectiveName", "") or "").strip()
        if collective:
            authors.append(collective)

    journal = article.findtext(".//Journal/Title", "Unknown Journal")

    # Prefer the citation's own PMID; fall back to the first PMID in the record.
    pmid = (article.findtext("MedlineCitation/PMID") or article.findtext(".//PMID") or "").strip()

    return {
        "title": title,
        "abstract": abstract,
        "authors": authors,
        "journal": journal,
        "publication_date": _format_pub_date(article.find(".//PubDate")),
        # An empty link is returned rather than a URL ending in "/None/".
        "pubmed_link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else "",
    }


def fetch_pubmed_details(pmid_list: List[str]) -> List[Dict]:
    """
    Fetch detailed metadata for a list of PubMed IDs.

    Args:
        pmid_list: PubMed IDs to look up. An empty list short-circuits
            without making a request.

    Returns:
        One dictionary per PubmedArticle in the response, with the keys
        ``title``, ``abstract``, ``authors``, ``journal``,
        ``publication_date`` and ``pubmed_link``. Missing fields are filled
        with placeholders ("No title", "No abstract", "Unknown Journal",
        "Unknown") instead of causing the article to be dropped.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not answer within the timeout.
        xml.etree.ElementTree.ParseError: The response body is not valid XML.
    """
    if not pmid_list:
        return []
    params = {
        "db": "pubmed",
        "id": ",".join(str(pmid) for pmid in pmid_list),
        "retmode": "xml",
    }
    # Only send the key when one is configured.
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without a timeout a stalled connection would block the caller forever.
    r = requests.get(PUBMED_FETCH_URL, params=params, timeout=30)
    r.raise_for_status()
    # Parse the raw bytes so the XML parser applies the encoding from the
    # document's own declaration rather than a guessed text decoding.
    root = ET.fromstring(r.content)
    return [_parse_pubmed_article(article) for article in root.findall(".//PubmedArticle")]
def search_and_fetch_pubmed(query: str, max_results: int = 20, start_date: Optional[str] = None, end_date: Optional[str] = None) -> List[Dict]:
    """
    Run a PubMed search and return the detail records for the matching IDs.

    Convenience wrapper: the arguments are forwarded to ``search_pubmed`` and
    the resulting ID list is handed straight to ``fetch_pubmed_details``.
    """
    return fetch_pubmed_details(
        search_pubmed(
            query,
            max_results=max_results,
            start_date=start_date,
            end_date=end_date,
        )
    )