mgbam's picture
Update genesis/api_clients/ncbi_api.py
7227edd verified
raw
history blame
3.62 kB
# genesis/api_clients/ncbi_api.py
import os
from typing import Optional
from xml.etree import ElementTree as ET

import requests
# Optional NCBI API key, read once from the environment at import time.
# os.getenv returns None when the NCBI_API_KEY variable is unset.
NCBI_API_KEY = os.getenv("NCBI_API_KEY")
# URL prefix for the E-utilities endpoints (esearch.fcgi / efetch.fcgi) built below.
NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def _add_api_key(params: dict):
    """
    Attach the configured NCBI API key to a request parameter dict.

    The dict is updated in place and also returned, so the call can wrap a
    dict literal directly. When NCBI_API_KEY is unset or empty the dict is
    returned unchanged.
    """
    if not NCBI_API_KEY:
        return params
    params.update(api_key=NCBI_API_KEY)
    return params
def search_pubmed(query: str, max_results: int = 10, *, timeout: float = 30.0):
    """
    Search PubMed via NCBI Entrez ESearch for a query string.

    Args:
        query: Entrez search term, sent as the ``term`` parameter.
        max_results: Maximum number of IDs requested (``retmax``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of PubMed IDs (PMIDs) found under
        ``esearchresult.idlist`` in the JSON response; an empty list when
        that field is absent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    params = _add_api_key({
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
    })
    # An explicit timeout is required: without one, requests waits
    # indefinitely on a stalled connection.
    response = requests.get(
        f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("esearchresult", {}).get("idlist", [])
def _element_text(element):
    """Return all text inside *element*, including text nested in child tags."""
    # Titles and abstracts may contain inline markup (<i>, <sub>, ...);
    # element.text alone would stop at the first child tag.
    return "".join(element.itertext()).strip()


def _parse_pubmed_articles(xml_payload):
    """Convert an EFetch PubMed XML payload into a list of article dicts."""
    root = ET.fromstring(xml_payload)
    articles = []
    for article in root.findall(".//PubmedArticle"):
        title_node = article.find(".//ArticleTitle")
        title = _element_text(title_node) if title_node is not None else ""

        sections = (
            _element_text(node) for node in article.findall(".//AbstractText")
        )
        abstract = " ".join(section for section in sections if section)

        authors = []
        for author in article.findall(".//Author"):
            name = (
                f"{author.findtext('ForeName', '')} "
                f"{author.findtext('LastName', '')}"
            ).strip()
            if not name:
                # Group authors carry a CollectiveName instead of personal names.
                name = author.findtext("CollectiveName", "").strip()
            if name:
                authors.append(name)

        articles.append({
            "pmid": article.findtext(".//PMID", default=""),
            "title": title,
            "abstract": abstract,
            "authors": authors,
        })
    return articles


def fetch_pubmed_details(pmids: list, *, timeout: float = 30.0):
    """
    Fetch abstracts, authors, and metadata for given PMIDs.

    Args:
        pmids: PubMed IDs to look up; items may be strings or integers.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One dict per ``PubmedArticle`` element in the response, with keys
        ``pmid``, ``title``, ``abstract`` and ``authors`` (a list of name
        strings). An empty list when ``pmids`` is empty; no request is
        made in that case.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    if not pmids:
        return []
    params = _add_api_key({
        "db": "pubmed",
        # str() lets callers pass integer PMIDs as well as strings.
        "id": ",".join(str(pmid) for pmid in pmids),
        "retmode": "xml",
    })
    # An explicit timeout is required: without one, requests waits
    # indefinitely on a stalled connection.
    response = requests.get(
        f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    # Parse the raw bytes so the XML parser applies the encoding declared in
    # the document rather than one guessed from the HTTP headers.
    return _parse_pubmed_articles(response.content)
def fetch_gene_info(gene_id: str, *, timeout: float = 30.0):
    """
    Fetch gene metadata from NCBI Gene database.

    Args:
        gene_id: Gene identifier, sent as the EFetch ``id`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as a string of XML. It is returned unparsed so
        that downstream pipeline code can decide how to parse it.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    params = _add_api_key({
        "db": "gene",
        "id": gene_id,
        "retmode": "xml",
    })
    # An explicit timeout is required: without one, requests waits
    # indefinitely on a stalled connection.
    response = requests.get(
        f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    return response.text
def search_gene_by_symbol(
    symbol: str,
    organism: Optional[str] = None,
    max_results: int = 5,
    *,
    timeout: float = 30.0,
):
    """
    Search for a gene by symbol, optionally filtered by organism.

    Args:
        symbol: Text to search for; sent unqualified (no field tag).
        organism: When given, ``AND <organism>[Organism]`` is appended to
            the search term.
        max_results: Maximum number of IDs requested (``retmax``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of Gene database IDs found under ``esearchresult.idlist``
        in the JSON response; an empty list when that field is absent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    term = f"{symbol} AND {organism}[Organism]" if organism else symbol
    params = _add_api_key({
        "db": "gene",
        "term": term,
        "retmax": max_results,
        "retmode": "json",
    })
    # An explicit timeout is required: without one, requests waits
    # indefinitely on a stalled connection.
    response = requests.get(
        f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("esearchresult", {}).get("idlist", [])
def fetch_protein_info(protein_id: str, *, timeout: float = 30.0):
    """
    Fetch protein metadata from NCBI Protein database.

    Args:
        protein_id: Protein identifier, sent as the EFetch ``id`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as a string of XML, unparsed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    params = _add_api_key({
        "db": "protein",
        "id": protein_id,
        "retmode": "xml",
    })
    # An explicit timeout is required: without one, requests waits
    # indefinitely on a stalled connection.
    response = requests.get(
        f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    return response.text
def search_protein_by_name(
    name: str,
    organism: Optional[str] = None,
    max_results: int = 5,
    *,
    timeout: float = 30.0,
):
    """
    Search for proteins by name, optionally filtered by organism.

    Args:
        name: Text to search for; sent unqualified (no field tag).
        organism: When given, ``AND <organism>[Organism]`` is appended to
            the search term.
        max_results: Maximum number of IDs requested (``retmax``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of Protein database IDs found under
        ``esearchresult.idlist`` in the JSON response; an empty list when
        that field is absent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    term = f"{name} AND {organism}[Organism]" if organism else name
    params = _add_api_key({
        "db": "protein",
        "term": term,
        "retmax": max_results,
        "retmode": "json",
    })
    # An explicit timeout is required: without one, requests waits
    # indefinitely on a stalled connection.
    response = requests.get(
        f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("esearchresult", {}).get("idlist", [])