# genesis/api_clients/ncbi_api.py
"""Small client for the NCBI E-utilities HTTP endpoints.

Wraps ``esearch.fcgi``, ``esummary.fcgi`` and ``efetch.fcgi`` and builds
plain-dict results for the pubmed, gene, protein and structure databases.
All requests go through the shared module-level ``session``.
"""
import os
from datetime import datetime, timezone
from typing import Dict, List

import requests

NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Optional — speeds up requests. When set, it is sent as ``api_key`` on
# every request made by this module.
NCBI_API_KEY = os.getenv("NCBI_API_KEY")
# Seconds to wait on a request before giving up. Without an explicit
# timeout, requests will wait indefinitely on a stalled connection.
NCBI_TIMEOUT = 30

session = requests.Session()


# -------------------------
# Internal request helpers
# -------------------------
def _eutils_get(endpoint: str, params: Dict) -> requests.Response:
    """Send a GET to ``{NCBI_BASE}/{endpoint}`` and return the response.

    Adds ``api_key`` when ``NCBI_API_KEY`` is set, applies ``NCBI_TIMEOUT``
    and raises ``requests.HTTPError`` on a 4xx/5xx status.
    """
    query = dict(params)  # copy so the caller's dict is never mutated
    if NCBI_API_KEY:
        query["api_key"] = NCBI_API_KEY
    response = session.get(
        f"{NCBI_BASE}/{endpoint}", params=query, timeout=NCBI_TIMEOUT
    )
    response.raise_for_status()
    return response


def _eutils_summary(db: str, ids: List[str]) -> Dict:
    """Return the ``result`` object of an esummary JSON response ({} if absent)."""
    params = {"db": db, "id": ",".join(ids), "retmode": "json"}
    return _eutils_get("esummary.fcgi", params).json().get("result", {})


def _split_fasta_records(fasta_text: str) -> List[str]:
    """Split multi-record FASTA text into one string per record.

    A new record starts at every line beginning with ``>``. Blank lines are
    dropped and each returned record ends with a single newline.
    """
    records: List[str] = []
    current: List[str] = []
    for line in fasta_text.splitlines():
        if line.startswith(">") and current:
            records.append("\n".join(current) + "\n")
            current = []
        if line.strip():
            current.append(line)
    if current:
        records.append("\n".join(current) + "\n")
    return records


# -------------------------
# Generic NCBI Search
# -------------------------
def ncbi_search(db: str, term: str, retmax: int = 10) -> List[str]:
    """Search an NCBI database and return a list of IDs.

    Args:
        db: Database name passed to esearch as ``db``.
        term: Query string passed as ``term``.
        retmax: Value passed as ``retmax``.

    Returns:
        The ``esearchresult.idlist`` value of the JSON response, or ``[]``
        when either key is missing.

    Raises:
        requests.HTTPError: On a 4xx/5xx response. Other ``requests``
            exceptions (connection errors, timeouts) propagate unchanged.
    """
    params = {
        "db": db,
        "term": term,
        "retmode": "json",
        "retmax": retmax,
    }
    payload = _eutils_get("esearch.fcgi", params).json()
    return payload.get("esearchresult", {}).get("idlist", [])


# -------------------------
# Generic NCBI Fetch
# -------------------------
def ncbi_fetch(db: str, ids: List[str], rettype: str = "abstract",
               retmode: str = "text") -> str:
    """Fetch detailed records from an NCBI database.

    Args:
        db: Database name passed to efetch as ``db``.
        ids: Record IDs; sent comma-joined as ``id``.
        rettype: Value passed as ``rettype``.
        retmode: Value passed as ``retmode``.

    Returns:
        The response body as text, or ``""`` when ``ids`` is empty (no
        request is made in that case).

    Raises:
        requests.HTTPError: On a 4xx/5xx response. Other ``requests``
            exceptions (connection errors, timeouts) propagate unchanged.
    """
    if not ids:
        # Nothing to fetch; avoid sending a request with an empty ``id``.
        return ""
    params = {
        "db": db,
        "id": ",".join(ids),
        "rettype": rettype,
        "retmode": retmode,
    }
    return _eutils_get("efetch.fcgi", params).text


# -------------------------
# PubMed Literature Search
# -------------------------
def search_pubmed(term: str, retmax: int = 5) -> List[Dict]:
    """Search PubMed for biomedical literature.

    Runs an esearch on ``pubmed`` followed by one esummary for the IDs found.

    Returns:
        One dict per ID, in esearch order, with keys ``title``, ``authors``
        (list of names), ``pubdate``, ``journal``, ``uid`` and ``link``.
        Fields absent from the summary are ``None`` (``authors`` is ``[]``).
        ``[]`` when the search finds no IDs.

    Raises:
        requests.HTTPError: On a 4xx/5xx response from either request.
    """
    ids = ncbi_search("pubmed", term, retmax)
    if not ids:
        return []

    records = _eutils_summary("pubmed", ids)
    papers = []
    for pid in ids:
        rec = records.get(pid, {})
        papers.append({
            "title": rec.get("title"),
            # Skip author entries that carry no "name" instead of raising.
            "authors": [a["name"] for a in rec.get("authors", []) if "name" in a],
            "pubdate": rec.get("pubdate"),
            "journal": rec.get("fulljournalname"),
            "uid": pid,
            "link": f"https://pubmed.ncbi.nlm.nih.gov/{pid}/",
        })
    return papers


# -------------------------
# Gene Search
# -------------------------
def search_genes(term: str, retmax: int = 5) -> List[Dict]:
    """Search NCBI Gene database for gene information.

    Runs an esearch on ``gene`` followed by one esummary for the IDs found.

    Returns:
        One dict per ID, in esearch order, with keys ``symbol``,
        ``description``, ``organism`` (the summary's
        ``organism.scientificname``), ``uid`` and ``link``. Fields absent
        from the summary are ``None``. ``[]`` when the search finds no IDs.

    Raises:
        requests.HTTPError: On a 4xx/5xx response from either request.
    """
    ids = ncbi_search("gene", term, retmax)
    if not ids:
        return []

    records = _eutils_summary("gene", ids)
    genes = []
    for gid in ids:
        rec = records.get(gid, {})
        # ``or {}`` also covers an "organism" key that is present but null.
        organism = rec.get("organism") or {}
        genes.append({
            "symbol": rec.get("name"),
            "description": rec.get("description"),
            "organism": organism.get("scientificname"),
            "uid": gid,
            "link": f"https://www.ncbi.nlm.nih.gov/gene/{gid}",
        })
    return genes


# -------------------------
# Protein Search
# -------------------------
def search_proteins(term: str, retmax: int = 5) -> List[Dict]:
    """Search NCBI Protein database for protein sequences.

    Runs an esearch on ``protein`` and one efetch (``rettype="fasta"``) for
    all IDs found, then splits the combined FASTA text into records.

    Returns:
        One ``{"id": ..., "fasta": ...}`` dict per ID, in esearch order.
        ``[]`` when the search finds no IDs.

    Raises:
        requests.HTTPError: On a 4xx/5xx response from either request.
    """
    ids = ncbi_search("protein", term, retmax)
    if not ids:
        return []

    fasta_data = ncbi_fetch("protein", ids, rettype="fasta", retmode="text")
    records = _split_fasta_records(fasta_data)
    if len(records) != len(ids):
        # Cannot pair records with IDs reliably; keep the whole response on
        # every entry rather than guess.
        return [{"id": pid, "fasta": fasta_data} for pid in ids]
    # NOTE(review): pairing is positional — assumes efetch returns records in
    # the order the IDs were requested; confirm against the E-utilities docs.
    return [{"id": pid, "fasta": record} for pid, record in zip(ids, records)]


# -------------------------
# Structure Search
# -------------------------
def fetch_ncbi_structure(term: str, retmax: int = 5) -> List[Dict]:
    """Search NCBI Structure database and return structure metadata.

    Runs an esearch on ``structure`` followed by one esummary for the IDs
    found.

    Returns:
        One dict per ID, in esearch order, with keys ``structure_id``,
        ``title``, ``organism``, ``release_date`` and ``link``. Fields absent
        from the summary are ``None``. ``[]`` when the search finds no IDs.

    Raises:
        requests.HTTPError: On a 4xx/5xx response from either request.
    """
    ids = ncbi_search("structure", term, retmax)
    if not ids:
        return []

    records = _eutils_summary("structure", ids)
    structures = []
    for sid in ids:
        rec = records.get(sid, {})
        structures.append({
            "structure_id": sid,
            "title": rec.get("title"),
            "organism": rec.get("organism"),
            "release_date": rec.get("releasedate"),
            "link": f"https://www.ncbi.nlm.nih.gov/structure/{sid}",
        })
    return structures


# -------------------------
# Build Cross-Database Profile
# -------------------------
def ncbi_cross_profile(term: str) -> Dict:
    """Given a term, pull literature, genes, proteins, and structures.

    Calls ``search_pubmed``, ``search_genes``, ``search_proteins`` and
    ``fetch_ncbi_structure`` one after another; an exception from any of
    them propagates and no partial profile is returned.

    Returns:
        A dict with keys ``term``, ``timestamp`` (naive UTC time in ISO-8601
        form), ``literature``, ``genes``, ``proteins`` and ``structures``.
    """
    # Same naive-UTC string as datetime.utcnow().isoformat(), without calling
    # the deprecated utcnow().
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "term": term,
        "timestamp": timestamp,
        "literature": search_pubmed(term, retmax=5),
        "genes": search_genes(term, retmax=5),
        "proteins": search_proteins(term, retmax=2),
        "structures": fetch_ncbi_structure(term, retmax=3),
    }