Spaces:
Sleeping
Sleeping
# genesis/api_clients/ncbi_api.py
import os
from typing import Optional
from xml.etree import ElementTree as ET

import requests
# Optional NCBI API key taken from the environment; None when the variable is unset.
NCBI_API_KEY = os.environ.get("NCBI_API_KEY")
# Root URL that every E-utilities endpoint in this module is appended to.
NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
def _add_api_key(params: dict):
    """
    Attach the configured NCBI API key to a request-parameter dict.

    The dict is modified in place (an ``api_key`` entry is set when
    ``NCBI_API_KEY`` is truthy) and the same dict is returned, so the call
    can wrap a dict literal directly.
    """
    if not NCBI_API_KEY:
        # No key configured: send the request without an api_key parameter.
        return params
    params["api_key"] = NCBI_API_KEY
    return params
def search_pubmed(query: str, max_results: int = 10, timeout: float = 30.0):
    """
    Search PubMed via NCBI Entrez (esearch) for a query string.

    Args:
        query: Search term, sent as the ``term`` parameter.
        max_results: Upper bound on returned IDs, sent as ``retmax``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of PubMed IDs (PMIDs) found under
        ``esearchresult.idlist`` in the JSON reply, or an empty list when
        that key is absent.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
    """
    params = _add_api_key({
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
    })
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the caller.
    response = requests.get(
        f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("esearchresult", {}).get("idlist", [])
def _pubmed_article_to_dict(article):
    """Convert one ``<PubmedArticle>`` element into a dict of its key fields."""
    # itertext() gathers all text inside the element, including text within
    # or after inline child elements; ``.text`` / ``findtext`` stop at the
    # first child and would silently truncate such titles and abstracts.
    title_el = article.find(".//ArticleTitle")
    title = "".join(title_el.itertext()) if title_el is not None else ""
    abstract = " ".join(
        "".join(section.itertext())
        for section in article.findall(".//AbstractText")
    )

    authors = []
    for author in article.findall(".//Author"):
        name = (
            f"{author.findtext('ForeName', '')} "
            f"{author.findtext('LastName', '')}"
        ).strip()
        if not name:
            # Authors without personal-name parts (e.g. consortia) may carry
            # a CollectiveName instead.
            name = (author.findtext("CollectiveName") or "").strip()
        if name:
            # Skip entries that yield no usable name rather than emit "".
            authors.append(name)

    return {
        "pmid": article.findtext(".//PMID", default=""),
        "title": title,
        "abstract": abstract,
        "authors": authors,
    }


def fetch_pubmed_details(pmids: list, timeout: float = 30.0):
    """
    Fetch abstracts, authors, and metadata for given PMIDs (efetch).

    Args:
        pmids: PubMed IDs to look up; items may be strings or integers.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per ``<PubmedArticle>`` in the reply, each
        holding ``pmid``, ``title``, ``abstract`` (all AbstractText sections
        joined by single spaces) and ``authors`` (list of display names).
        An empty list is returned without any request when ``pmids`` is empty.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
        xml.etree.ElementTree.ParseError: The reply is not well-formed XML.
    """
    if not pmids:
        return []

    params = _add_api_key({
        "db": "pubmed",
        # str() lets callers pass integer IDs without a join() TypeError.
        "id": ",".join(str(pmid) for pmid in pmids),
        "retmode": "xml",
    })
    response = requests.get(
        f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()

    # Parse the raw bytes so the XML parser applies the encoding named in the
    # document itself rather than the charset requests guessed for ``.text``.
    root = ET.fromstring(response.content)
    return [
        _pubmed_article_to_dict(article)
        for article in root.findall(".//PubmedArticle")
    ]
def fetch_gene_info(gene_id: str, timeout: float = 30.0):
    """
    Fetch gene metadata from the NCBI Gene database (efetch).

    Args:
        gene_id: Identifier sent as the ``id`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text. XML is requested (``retmode=xml``) and
        is returned unparsed; downstream parsing is left to the pipeline.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
    """
    params = _add_api_key({
        "db": "gene",
        "id": gene_id,
        "retmode": "xml",
    })
    # An explicit timeout prevents an unresponsive server from blocking
    # the caller forever.
    response = requests.get(
        f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    return response.text
def search_gene_by_symbol(
    symbol: str,
    organism: Optional[str] = None,
    max_results: int = 5,
    timeout: float = 30.0,
):
    """
    Search the NCBI Gene database for a symbol, optionally filtered by organism.

    Args:
        symbol: Search term for the gene.
        organism: When given, ``" AND <organism>[Organism]"`` is appended to
            the search term.
        max_results: Upper bound on returned IDs, sent as ``retmax``
            (defaults to the previously hard-coded 5).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of IDs found under ``esearchresult.idlist`` in the JSON
        reply, or an empty list when that key is absent.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
    """
    term = symbol
    if organism:
        term += f" AND {organism}[Organism]"

    params = _add_api_key({
        "db": "gene",
        "term": term,
        "retmax": max_results,
        "retmode": "json",
    })
    # An explicit timeout prevents an unresponsive server from blocking
    # the caller forever.
    response = requests.get(
        f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("esearchresult", {}).get("idlist", [])
def fetch_protein_info(protein_id: str, timeout: float = 30.0):
    """
    Fetch protein metadata from the NCBI Protein database (efetch).

    Args:
        protein_id: Identifier sent as the ``id`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text. XML is requested (``retmode=xml``) and
        is returned unparsed.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
    """
    params = _add_api_key({
        "db": "protein",
        "id": protein_id,
        "retmode": "xml",
    })
    # An explicit timeout prevents an unresponsive server from blocking
    # the caller forever.
    response = requests.get(
        f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    return response.text
def search_protein_by_name(
    name: str,
    organism: Optional[str] = None,
    max_results: int = 5,
    timeout: float = 30.0,
):
    """
    Search the NCBI Protein database by name, optionally filtered by organism.

    Args:
        name: Search term for the protein.
        organism: When given, ``" AND <organism>[Organism]"`` is appended to
            the search term.
        max_results: Upper bound on returned IDs, sent as ``retmax``
            (defaults to the previously hard-coded 5).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of IDs found under ``esearchresult.idlist`` in the JSON
        reply, or an empty list when that key is absent.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
    """
    term = name
    if organism:
        term += f" AND {organism}[Organism]"

    params = _add_api_key({
        "db": "protein",
        "term": term,
        "retmax": max_results,
        "retmode": "json",
    })
    # An explicit timeout prevents an unresponsive server from blocking
    # the caller forever.
    response = requests.get(
        f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    payload = response.json()
    return payload.get("esearchresult", {}).get("idlist", [])