# mgbam's picture
# Update genesis/api_clients/ncbi_api.py
# 39e5a96 verified
# genesis/api_clients/ncbi_api.py
import os
import requests
from typing import List, Dict
from datetime import datetime
# Base URL that the endpoint names (esearch.fcgi, efetch.fcgi, esummary.fcgi)
# are appended to by the functions below.
NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Read once at import time; when set, it is sent as the ``api_key`` parameter.
NCBI_API_KEY = os.getenv("NCBI_API_KEY") # Optional — speeds up requests
# Single Session object shared by the request helpers in this module.
session = requests.Session()
# -------------------------
# Generic NCBI Search
# -------------------------
def ncbi_search(db: str, term: str, retmax: int = 10, timeout: float = 30.0) -> List[str]:
    """
    Search an NCBI database and return a list of IDs.

    Args:
        db: Database name, sent as the ``db`` query parameter.
        term: Query text, sent as the ``term`` query parameter.
        retmax: Upper bound on the number of IDs requested.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Returns:
        The ``esearchresult.idlist`` value of the JSON response, or an empty
        list when the term is blank or that key is missing.

    Raises:
        requests.HTTPError: If the response carries an error status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # A blank term carries no query, so skip the network round trip entirely.
    if term is None or not str(term).strip():
        return []

    params = {
        "db": db,
        "term": term,
        "retmode": "json",
        "retmax": retmax,
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY

    # requests applies no timeout by default, so an unresponsive server would
    # otherwise block the caller indefinitely.
    response = session.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    response.raise_for_status()
    return response.json().get("esearchresult", {}).get("idlist", [])
# -------------------------
# Generic NCBI Fetch
# -------------------------
def ncbi_fetch(
    db: str,
    ids: List[str],
    rettype: str = "abstract",
    retmode: str = "text",
    timeout: float = 30.0,
) -> str:
    """
    Fetch detailed records from an NCBI database.

    Args:
        db: Database name, sent as the ``db`` query parameter.
        ids: Record identifiers; they are joined with commas into ``id``.
        rettype: Value for the ``rettype`` query parameter.
        retmode: Value for the ``retmode`` query parameter.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Returns:
        The raw response body as text, or an empty string when ``ids`` is
        empty (no request is made in that case).

    Raises:
        requests.HTTPError: If the response carries an error status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Nothing to fetch: avoid sending a request with an empty ``id`` value.
    if not ids:
        return ""

    params = {
        "db": db,
        # str() lets callers pass integer identifiers as well as strings.
        "id": ",".join(str(record_id) for record_id in ids),
        "rettype": rettype,
        "retmode": retmode,
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY

    # requests applies no timeout by default, so an unresponsive server would
    # otherwise block the caller indefinitely.
    response = session.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    response.raise_for_status()
    return response.text
# -------------------------
# PubMed Literature Search
# -------------------------
def search_pubmed(term: str, retmax: int = 5) -> List[Dict]:
    """
    Search PubMed for biomedical literature.

    Looks up matching IDs with ``ncbi_search`` and then requests their
    summaries from the ``esummary.fcgi`` endpoint.

    Args:
        term: Query text passed to the ID search.
        retmax: Maximum number of IDs to request from the search.

    Returns:
        One dict per ID, in search order, with the keys ``title``,
        ``authors``, ``pubdate``, ``journal``, ``uid`` and ``link``. Fields
        missing from the summary come back as ``None`` (``authors`` as an
        empty list). An empty list is returned when the search finds no IDs.

    Raises:
        requests.HTTPError: If the summary response carries an error status.
        requests.Timeout: If the summary request gets no answer in time.
    """
    ids = ncbi_search("pubmed", term, retmax)
    if not ids:
        return []

    params = {
        "db": "pubmed",
        "id": ",".join(ids),
        "retmode": "json",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY

    # requests applies no timeout by default; bound the wait so a stalled
    # server cannot hang the caller.
    response = session.get(f"{NCBI_BASE}/esummary.fcgi", params=params, timeout=30)
    response.raise_for_status()
    records = response.json().get("result", {})

    papers = []
    for pid in ids:
        rec = records.get(pid, {})
        papers.append({
            "title": rec.get("title"),
            # Skip author entries without a name instead of raising KeyError.
            "authors": [
                author["name"]
                for author in rec.get("authors") or []
                if isinstance(author, dict) and author.get("name")
            ],
            "pubdate": rec.get("pubdate"),
            "journal": rec.get("fulljournalname"),
            "uid": pid,
            "link": f"https://pubmed.ncbi.nlm.nih.gov/{pid}/",
        })
    return papers
# -------------------------
# Gene Search
# -------------------------
def search_genes(term: str, retmax: int = 5) -> List[Dict]:
    """
    Search NCBI Gene database for gene information.

    Looks up matching IDs with ``ncbi_search`` and then requests their
    summaries from the ``esummary.fcgi`` endpoint.

    Args:
        term: Query text passed to the ID search.
        retmax: Maximum number of IDs to request from the search.

    Returns:
        One dict per ID, in search order, with the keys ``symbol``,
        ``description``, ``organism``, ``uid`` and ``link``. Fields missing
        from the summary come back as ``None``. An empty list is returned
        when the search finds no IDs.

    Raises:
        requests.HTTPError: If the summary response carries an error status.
        requests.Timeout: If the summary request gets no answer in time.
    """
    ids = ncbi_search("gene", term, retmax)
    if not ids:
        return []

    params = {
        "db": "gene",
        "id": ",".join(ids),
        "retmode": "json",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY

    # requests applies no timeout by default; bound the wait so a stalled
    # server cannot hang the caller.
    response = session.get(f"{NCBI_BASE}/esummary.fcgi", params=params, timeout=30)
    response.raise_for_status()
    records = response.json().get("result", {})

    genes = []
    for gid in ids:
        rec = records.get(gid, {})
        # ``or {}`` also covers an "organism" key that is present but null.
        organism = rec.get("organism") or {}
        genes.append({
            "symbol": rec.get("name"),
            "description": rec.get("description"),
            "organism": organism.get("scientificname"),
            "uid": gid,
            "link": f"https://www.ncbi.nlm.nih.gov/gene/{gid}",
        })
    return genes
# -------------------------
# Protein Search
# -------------------------
def _split_fasta(text: str) -> List[str]:
    """Split multi-record FASTA text into one string per ``>`` header."""
    records: List[List[str]] = []
    for raw_line in text.splitlines():
        line = raw_line.rstrip()
        if not line:
            continue  # blank separator lines carry no data
        if line.startswith(">"):
            records.append([line])
        elif records:
            # Sequence line belonging to the most recent header.
            records[-1].append(line)
    return ["\n".join(lines) for lines in records]


def search_proteins(term: str, retmax: int = 5) -> List[Dict]:
    """
    Search NCBI Protein database for protein sequences.

    Looks up matching IDs with ``ncbi_search`` and downloads their FASTA
    text in a single ``ncbi_fetch`` call.

    Args:
        term: Query text passed to the ID search.
        retmax: Maximum number of IDs to request from the search.

    Returns:
        One ``{"id": ..., "fasta": ...}`` dict per ID, in search order. When
        the fetched text splits into exactly one FASTA record per ID, each
        entry holds its own record; otherwise every entry holds the complete
        fetched text. An empty list is returned when the search finds no IDs.
    """
    ids = ncbi_search("protein", term, retmax)
    if not ids:
        return []

    fasta_data = ncbi_fetch("protein", ids, rettype="fasta", retmode="text")
    records = _split_fasta(fasta_data)

    if len(records) != len(ids):
        # Records cannot be matched to IDs one-to-one, so keep the complete
        # text on every entry rather than guess at the pairing.
        return [{"id": pid, "fasta": fasta_data} for pid in ids]

    # NOTE(review): pairing is positional — assumes the fetch returns records
    # in the same order as the requested IDs; confirm against the service docs.
    return [{"id": pid, "fasta": record} for pid, record in zip(ids, records)]
# -------------------------
# Structure Search
# -------------------------
def fetch_ncbi_structure(term: str, retmax: int = 5) -> List[Dict]:
    """
    Search NCBI Structure database and return structure metadata.

    Looks up matching IDs with ``ncbi_search`` and then requests their
    summaries from the ``esummary.fcgi`` endpoint.

    Args:
        term: Query text passed to the ID search.
        retmax: Maximum number of IDs to request from the search.

    Returns:
        One dict per ID, in search order, with the keys ``structure_id``,
        ``title``, ``organism``, ``release_date`` and ``link``. Fields
        missing from the summary come back as ``None``. An empty list is
        returned when the search finds no IDs.

    Raises:
        requests.HTTPError: If the summary response carries an error status.
        requests.Timeout: If the summary request gets no answer in time.
    """
    ids = ncbi_search("structure", term, retmax)
    if not ids:
        return []

    params = {
        "db": "structure",
        "id": ",".join(ids),
        "retmode": "json",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY

    # requests applies no timeout by default; bound the wait so a stalled
    # server cannot hang the caller.
    response = session.get(f"{NCBI_BASE}/esummary.fcgi", params=params, timeout=30)
    response.raise_for_status()
    records = response.json().get("result", {})

    # NOTE(review): the summary keys read below ("title", "organism",
    # "releasedate") are unverified here — confirm they match what the
    # structure database actually returns.
    structures = []
    for sid in ids:
        rec = records.get(sid, {})
        structures.append({
            "structure_id": sid,
            "title": rec.get("title"),
            "organism": rec.get("organism"),
            "release_date": rec.get("releasedate"),
            "link": f"https://www.ncbi.nlm.nih.gov/structure/{sid}",
        })
    return structures
# -------------------------
# Build Cross-Database Profile
# -------------------------
def ncbi_cross_profile(term: str) -> Dict:
    """
    Given a term, pull literature, genes, proteins, and structures for unified output.

    Args:
        term: Query text forwarded unchanged to each of the four searches.

    Returns:
        A dict with the keys ``term``, ``timestamp`` (current UTC time as a
        naive ISO-8601 string), ``literature``, ``genes``, ``proteins`` and
        ``structures``. The last four hold the results of ``search_pubmed``,
        ``search_genes``, ``search_proteins`` and ``fetch_ncbi_structure``.
    """
    # Local import keeps this block self-contained.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Taking an aware UTC
    # "now" and dropping tzinfo yields the same naive ISO string as before.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return {
        "term": term,
        "timestamp": timestamp,
        "literature": search_pubmed(term, retmax=5),
        "genes": search_genes(term, retmax=5),
        "proteins": search_proteins(term, retmax=2),
        "structures": fetch_ncbi_structure(term, retmax=3),
    }