# genesis/api_clients/ncbi_api.py
"""Client helpers for the NCBI Entrez E-utilities HTTP API.

Wraps the ``esearch`` and ``efetch`` endpoints for the ``pubmed``, ``gene``
and ``protein`` databases. If the ``NCBI_API_KEY`` environment variable is
set when this module is imported, the key is added to every request.
"""
import os
from typing import Optional
from xml.etree import ElementTree as ET

import requests

NCBI_API_KEY = os.getenv("NCBI_API_KEY")
NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# (connect, read) timeouts in seconds. ``requests`` waits indefinitely when no
# timeout is given, so a stalled connection would otherwise hang the caller.
_REQUEST_TIMEOUT = (10, 60)


def _add_api_key(params: dict):
    """
    Adds API key to request params if available.

    The dict is modified in place and also returned, so the call can wrap a
    dict literal.
    """
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    return params


def _get(endpoint: str, params: dict):
    """
    Send a GET request to an E-utilities endpoint and return the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within
            ``_REQUEST_TIMEOUT``.
    """
    response = requests.get(
        f"{NCBI_BASE}/{endpoint}",
        params=_add_api_key(params),
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response


def _esearch_ids(db: str, term: str, retmax: int):
    """
    Run an ``esearch`` query and return the ``idlist`` of the JSON reply.

    Returns an empty list when the reply has no ``esearchresult``/``idlist``.
    """
    payload = _get(
        "esearch.fcgi",
        {"db": db, "term": term, "retmax": retmax, "retmode": "json"},
    ).json()
    return payload.get("esearchresult", {}).get("idlist", [])


def _with_organism(term: str, organism: Optional[str]):
    """
    Append an ``[Organism]`` filter to a search term when one is given.
    """
    if organism:
        return f"{term} AND {organism}[Organism]"
    return term


def _element_text(element):
    """
    Return all text inside an element, or "" when the element is missing.

    ``element.text`` stops at the first child tag, which would truncate
    titles and abstracts that carry inline markup such as ``<i>`` or
    ``<sub>``; ``itertext`` walks the whole subtree instead.
    """
    if element is None:
        return ""
    return "".join(element.itertext())


def _author_name(author):
    """
    Build a display name for an ``Author`` element.

    Uses "ForeName LastName"; falls back to ``CollectiveName`` for group
    authors, which have no personal name parts.
    """
    fore_name = author.findtext("ForeName", "")
    last_name = author.findtext("LastName", "")
    personal = f"{fore_name} {last_name}".strip()
    return personal or author.findtext("CollectiveName", "").strip()


def _parse_pubmed_article(article):
    """
    Convert one ``PubmedArticle`` element into a plain dict.
    """
    abstract_parts = [
        _element_text(part) for part in article.findall(".//AbstractText")
    ]
    return {
        "pmid": article.findtext(".//PMID", default=""),
        "title": _element_text(article.find(".//ArticleTitle")),
        "abstract": " ".join(abstract_parts),
        "authors": [_author_name(a) for a in article.findall(".//Author")],
    }


def search_pubmed(query: str, max_results: int = 10):
    """
    Search PubMed via NCBI Entrez for a query string.

    Args:
        query: Entrez search term.
        max_results: value sent as ``retmax``.

    Returns:
        A list of PubMed IDs (PMIDs) as returned in the JSON reply.

    Raises:
        requests.RequestException: on HTTP errors or timeouts.
    """
    return _esearch_ids("pubmed", query, max_results)


def fetch_pubmed_details(pmids: list):
    """
    Fetch abstracts, authors, and metadata for given PMIDs.

    Args:
        pmids: PMIDs as strings or integers. A single PMID passed on its own
            (not wrapped in a list) is accepted as well.

    Returns:
        One dict per ``PubmedArticle`` in the reply, with the keys ``pmid``,
        ``title``, ``abstract`` and ``authors`` (a list of names). An empty
        list is returned, without any request, when ``pmids`` is empty.

    Raises:
        requests.RequestException: on HTTP errors or timeouts.
        xml.etree.ElementTree.ParseError: if the reply is not valid XML.
    """
    if not pmids:
        return []
    # A bare string would otherwise be joined character by character.
    if isinstance(pmids, (str, int)):
        pmids = [pmids]

    response = _get(
        "efetch.fcgi",
        {
            "db": "pubmed",
            "id": ",".join(str(pmid) for pmid in pmids),
            "retmode": "xml",
        },
    )
    # Parse the raw bytes so the XML parser applies the encoding declared in
    # the document instead of a charset guessed from the HTTP headers.
    root = ET.fromstring(response.content)
    return [
        _parse_pubmed_article(article)
        for article in root.findall(".//PubmedArticle")
    ]


def fetch_gene_info(gene_id: str):
    """
    Fetch gene metadata from NCBI Gene database.

    Returns:
        The raw XML reply as text.

    Raises:
        requests.RequestException: on HTTP errors or timeouts.
    """
    # XML - downstream parsing can be done in pipeline
    return _get(
        "efetch.fcgi", {"db": "gene", "id": gene_id, "retmode": "xml"}
    ).text


def search_gene_by_symbol(symbol: str, organism: Optional[str] = None):
    """
    Search for a gene by symbol, optionally filtered by organism.

    Returns:
        Up to five Gene IDs as returned in the JSON reply.

    Raises:
        requests.RequestException: on HTTP errors or timeouts.
    """
    return _esearch_ids("gene", _with_organism(symbol, organism), 5)


def fetch_protein_info(protein_id: str):
    """
    Fetch protein metadata from NCBI Protein database.

    Returns:
        The raw XML reply as text.

    Raises:
        requests.RequestException: on HTTP errors or timeouts.
    """
    return _get(
        "efetch.fcgi", {"db": "protein", "id": protein_id, "retmode": "xml"}
    ).text


def search_protein_by_name(name: str, organism: Optional[str] = None):
    """
    Search for proteins by name, optionally filtered by organism.

    Returns:
        Up to five Protein IDs as returned in the JSON reply.

    Raises:
        requests.RequestException: on HTTP errors or timeouts.
    """
    return _esearch_ids("protein", _with_organism(name, organism), 5)