Spaces:

mgbam
/

Synthetic_Biology

Sleeping

App Files Files Community

mgbam committed 8 days ago

Commit

800d67f

verified ·

1 Parent(s): 2c00ea4

Update genesis/api_clients/ncbi_api.py

Browse files

Files changed (1) hide show

genesis/api_clients/ncbi_api.py +97 -122

genesis/api_clients/ncbi_api.py CHANGED Viewed

@@ -1,178 +1,153 @@
 # genesis/api_clients/ncbi_api.py
 import os
 import requests
-import xml.etree.ElementTree as ET
 from typing import List, Dict
 NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
-NCBI_API_KEY = os.getenv("NCBI_API_KEY")  # Optional, set in Hugging Face / .env
 # -------------------------
-# Gene Search
 # -------------------------
def search_gene(query: str, max_results: int = 5, timeout: float = 30.0) -> List[Dict]:
    """
    Search NCBI Gene for matching gene entries.

    Runs an ESearch against the ``gene`` database and hands the matching
    IDs to ``fetch_gene_details``.

    Args:
        query: Search term sent to ESearch as ``term``.
        max_results: Maximum number of IDs to request (``retmax``).
        timeout: Seconds to wait for the ESearch response. Applies only to
            the request issued here, not to the follow-up detail fetch.

    Returns:
        Whatever ``fetch_gene_details`` returns for the matched IDs.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx HTTP status.
    """
    params = {
        "db": "gene",
        "term": query,
        "retmax": max_results,
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout a stalled connection blocks forever.
    r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    # Parse the raw bytes so the XML parser applies the document's own
    # encoding declaration instead of a guessed text decoding.
    root = ET.fromstring(r.content)
    ids = [elem.text for elem in root.findall(".//Id") if elem.text]
    return fetch_gene_details(ids)
def fetch_gene_details(gene_ids: List[str], timeout: float = 30.0) -> List[Dict]:
    """
    Fetch detailed information for NCBI Gene IDs.

    Args:
        gene_ids: Gene IDs to look up. An empty list short-circuits to ``[]``
            without any network request.
        timeout: Seconds to wait for the EFetch response.

    Returns:
        One dict per ``Entrezgene`` element in the response, with the keys
        ``gene_id``, ``symbol``, ``description`` and ``url``. A field whose
        element is missing or empty is returned as ``""``.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx HTTP status.
    """
    if not gene_ids:
        return []
    params = {
        "db": "gene",
        "id": ",".join(gene_ids),
        "retmode": "xml",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout a stalled connection blocks forever.
    r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    genes = []
    # Parse the raw bytes so the XML parser applies the declared encoding.
    root = ET.fromstring(r.content)
    for gene in root.findall(".//Entrezgene"):
        # findtext yields None for a missing element; normalise to "".
        gene_id = gene.findtext(".//Gene-track_geneid") or ""
        genes.append({
            "gene_id": gene_id,
            "symbol": gene.findtext(".//Gene-ref_locus") or "",
            "description": gene.findtext(".//Gene-ref_desc") or "",
            # Only build a link when there is an ID to link to.
            "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}" if gene_id else "",
        })
    return genes
 # -------------------------
-# Protein Search
 # -------------------------
def search_protein(query: str, max_results: int = 5, timeout: float = 30.0) -> List[Dict]:
    """
    Search NCBI Protein for matching entries.

    Runs an ESearch against the ``protein`` database and hands the matching
    IDs to ``fetch_protein_details``.

    Args:
        query: Search term sent to ESearch as ``term``.
        max_results: Maximum number of IDs to request (``retmax``).
        timeout: Seconds to wait for the ESearch response. Applies only to
            the request issued here, not to the follow-up detail fetch.

    Returns:
        Whatever ``fetch_protein_details`` returns for the matched IDs.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx HTTP status.
    """
    params = {
        "db": "protein",
        "term": query,
        "retmax": max_results,
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout a stalled connection blocks forever.
    r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    # Parse the raw bytes so the XML parser applies the declared encoding.
    root = ET.fromstring(r.content)
    ids = [elem.text for elem in root.findall(".//Id") if elem.text]
    return fetch_protein_details(ids)
def fetch_protein_details(protein_ids: List[str], timeout: float = 30.0) -> List[Dict]:
    """
    Fetch detailed information for NCBI Protein IDs.

    Args:
        protein_ids: Protein IDs to look up. An empty list short-circuits to
            ``[]`` without any network request.
        timeout: Seconds to wait for the EFetch response.

    Returns:
        One dict per ``TSeq`` element in the response, with the keys
        ``accession``, ``definition``, ``length`` (kept as the string found
        in the XML) and ``url``. A field whose element is missing or empty
        is returned as ``""``.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx HTTP status.
    """
    if not protein_ids:
        return []
    params = {
        "db": "protein",
        "id": ",".join(protein_ids),
        # TinySeq XML (the TSeq_* elements parsed below) is what EFetch
        # returns for rettype=fasta + retmode=xml; with retmode=xml alone
        # the response uses a different schema and no TSeq element matches.
        "rettype": "fasta",
        "retmode": "xml",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout a stalled connection blocks forever.
    r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    proteins = []
    # Parse the raw bytes so the XML parser applies the declared encoding.
    root = ET.fromstring(r.content)
    for seq in root.findall(".//TSeq"):
        # findtext yields None for a missing element; normalise to "".
        accession = seq.findtext(".//TSeq_accver") or ""
        proteins.append({
            "accession": accession,
            "definition": seq.findtext(".//TSeq_defline") or "",
            "length": seq.findtext(".//TSeq_length") or "",
            # Only build a link when there is an accession to link to.
            "url": f"https://www.ncbi.nlm.nih.gov/protein/{accession}" if accession else "",
        })
    return proteins
 # -------------------------
-# Sequence Search (Nucleotide)
 # -------------------------
def search_nucleotide(query: str, max_results: int = 5, timeout: float = 30.0) -> List[Dict]:
    """
    Search NCBI Nucleotide for DNA/RNA sequences.

    Runs an ESearch against the ``nucleotide`` database and hands the
    matching IDs to ``fetch_nucleotide_details``.

    Args:
        query: Search term sent to ESearch as ``term``.
        max_results: Maximum number of IDs to request (``retmax``).
        timeout: Seconds to wait for the ESearch response. Applies only to
            the request issued here, not to the follow-up detail fetch.

    Returns:
        Whatever ``fetch_nucleotide_details`` returns for the matched IDs.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx HTTP status.
    """
    params = {
        "db": "nucleotide",
        "term": query,
        "retmax": max_results,
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout a stalled connection blocks forever.
    r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    # Parse the raw bytes so the XML parser applies the declared encoding.
    root = ET.fromstring(r.content)
    ids = [elem.text for elem in root.findall(".//Id") if elem.text]
    return fetch_nucleotide_details(ids)
def fetch_nucleotide_details(nuc_ids: List[str], timeout: float = 30.0) -> List[Dict]:
    """
    Fetch detailed information for NCBI Nucleotide IDs.

    Args:
        nuc_ids: Nucleotide IDs to look up. An empty list short-circuits to
            ``[]`` without any network request.
        timeout: Seconds to wait for the EFetch response.

    Returns:
        One dict per ``TSeq`` element in the response, with the keys
        ``accession``, ``definition``, ``length`` (kept as the string found
        in the XML) and ``url``. A field whose element is missing or empty
        is returned as ``""``.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx HTTP status.
    """
    if not nuc_ids:
        return []
    params = {
        "db": "nucleotide",
        "id": ",".join(nuc_ids),
        # TinySeq XML (the TSeq_* elements parsed below) is what EFetch
        # returns for rettype=fasta + retmode=xml; with retmode=xml alone
        # the response uses a different schema and no TSeq element matches.
        "rettype": "fasta",
        "retmode": "xml",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout a stalled connection blocks forever.
    r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    sequences = []
    # Parse the raw bytes so the XML parser applies the declared encoding.
    root = ET.fromstring(r.content)
    for seq in root.findall(".//TSeq"):
        # findtext yields None for a missing element; normalise to "".
        accession = seq.findtext(".//TSeq_accver") or ""
        sequences.append({
            "accession": accession,
            "definition": seq.findtext(".//TSeq_defline") or "",
            "length": seq.findtext(".//TSeq_length") or "",
            # Only build a link when there is an accession to link to.
            "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" if accession else "",
        })
    return sequences
 # -------------------------
-# Cross-Domain Integration
 # -------------------------
def entity_context(query: str) -> Dict:
    """
    Collect gene, protein, nucleotide, literature and drug results for a query.

    The same ``query`` string is passed unchanged to every lookup. The
    result dict has the keys ``genes``, ``proteins``, ``nucleotides``,
    ``literature`` and ``related_drugs``.
    """
    # Imported lazily to avoid import cycles between the API client modules.
    from genesis.api_clients import pubmed_api, chembl_api

    # Lookups run in the same order as the keys appear in the result.
    gene_hits = search_gene(query)
    protein_hits = search_protein(query)
    nucleotide_hits = search_nucleotide(query)
    papers = pubmed_api.search_pubmed(query)
    drugs = chembl_api.search_molecule(query)

    context = {}
    context["genes"] = gene_hits
    context["proteins"] = protein_hits
    context["nucleotides"] = nucleotide_hits
    context["literature"] = papers
    context["related_drugs"] = drugs
    return context

 # genesis/api_clients/ncbi_api.py
 import os
 import requests
 from typing import List, Dict
+from datetime import datetime
 NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
+NCBI_API_KEY = os.getenv("NCBI_API_KEY")  # Optional — speeds up requests
+session = requests.Session()
 # -------------------------
+# Generic NCBI Search
 # -------------------------
def ncbi_search(db: str, term: str, retmax: int = 10, timeout: float = 30.0) -> List[str]:
    """
    Search an NCBI database and return a list of IDs.

    Args:
        db: Name of the NCBI database to query (sent as ``db``).
        term: Search term (sent as ``term``).
        retmax: Maximum number of IDs to request.
        timeout: Seconds to wait for the ESearch response.

    Returns:
        The ``esearchresult.idlist`` array from the JSON response, or an
        empty list when that key is absent.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx HTTP status.
    """
    params = {
        "db": db,
        "term": term,
        "retmode": "json",
        "retmax": retmax,
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout a stalled connection blocks forever.
    r = session.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    return r.json().get("esearchresult", {}).get("idlist", [])
 # -------------------------
+# Generic NCBI Fetch
 # -------------------------
def ncbi_fetch(
    db: str,
    ids: List[str],
    rettype: str = "abstract",
    retmode: str = "text",
    timeout: float = 30.0,
) -> str:
    """
    Fetch detailed records from an NCBI database.

    Args:
        db: Name of the NCBI database to query (sent as ``db``).
        ids: Record IDs to fetch; joined with commas into one ``id`` value.
        rettype: EFetch ``rettype`` value.
        retmode: EFetch ``retmode`` value.
        timeout: Seconds to wait for the EFetch response.

    Returns:
        The response body as text, or ``""`` when ``ids`` is empty (no
        request is sent in that case).

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx HTTP status.
    """
    # An empty ID list would otherwise send a pointless request with id="".
    if not ids:
        return ""
    params = {
        "db": db,
        "id": ",".join(ids),
        "rettype": rettype,
        "retmode": retmode,
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout a stalled connection blocks forever.
    r = session.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    return r.text
+# -------------------------
+# PubMed Literature Search
+# -------------------------
def search_pubmed(term: str, retmax: int = 5, timeout: float = 30.0) -> List[Dict]:
    """
    Search PubMed for biomedical literature.

    Looks up matching IDs with ``ncbi_search`` and then requests their
    summaries from ESummary.

    Args:
        term: Search term passed to ``ncbi_search``.
        retmax: Maximum number of IDs to request from ``ncbi_search``.
        timeout: Seconds to wait for the ESummary response. Applies only to
            the request issued here, not to the preceding ID search.

    Returns:
        One dict per ID, in the order the IDs were returned, with the keys
        ``title``, ``authors``, ``pubdate``, ``journal``, ``uid`` and
        ``link``. Fields missing from the summary are ``None`` (``authors``
        is then an empty list). Returns ``[]`` when the search finds no IDs.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx HTTP status.
    """
    ids = ncbi_search("pubmed", term, retmax)
    if not ids:
        return []
    params = {
        "db": "pubmed",
        "id": ",".join(ids),
        "retmode": "json",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout a stalled connection blocks forever.
    r = session.get(f"{NCBI_BASE}/esummary.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    records = r.json().get("result", {})
    papers = []
    for pid in ids:
        rec = records.get(pid, {})
        # Skip author entries without a name instead of raising KeyError;
        # `or []` also covers an explicit null authors value.
        authors = [a["name"] for a in rec.get("authors") or [] if a.get("name")]
        papers.append({
            "title": rec.get("title"),
            "authors": authors,
            "pubdate": rec.get("pubdate"),
            "journal": rec.get("fulljournalname"),
            "uid": pid,
            "link": f"https://pubmed.ncbi.nlm.nih.gov/{pid}/",
        })
    return papers
 # -------------------------
+# Gene Search
 # -------------------------
def search_genes(term: str, retmax: int = 5, timeout: float = 30.0) -> List[Dict]:
    """
    Search NCBI Gene database for gene information.

    Looks up matching IDs with ``ncbi_search`` and then requests their
    summaries from ESummary.

    Args:
        term: Search term passed to ``ncbi_search``.
        retmax: Maximum number of IDs to request from ``ncbi_search``.
        timeout: Seconds to wait for the ESummary response. Applies only to
            the request issued here, not to the preceding ID search.

    Returns:
        One dict per ID, in the order the IDs were returned, with the keys
        ``symbol``, ``description``, ``organism``, ``uid`` and ``link``.
        Fields missing from the summary are ``None``. Returns ``[]`` when
        the search finds no IDs.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx HTTP status.
    """
    ids = ncbi_search("gene", term, retmax)
    if not ids:
        return []
    params = {
        "db": "gene",
        "id": ",".join(ids),
        "retmode": "json",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout a stalled connection blocks forever.
    r = session.get(f"{NCBI_BASE}/esummary.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    records = r.json().get("result", {})
    genes = []
    for gid in ids:
        rec = records.get(gid, {})
        # `or {}` also covers an explicit null organism value.
        organism = rec.get("organism") or {}
        genes.append({
            "symbol": rec.get("name"),
            "description": rec.get("description"),
            "organism": organism.get("scientificname"),
            "uid": gid,
            "link": f"https://www.ncbi.nlm.nih.gov/gene/{gid}",
        })
    return genes
+# -------------------------
+# Protein Search
+# -------------------------
def search_proteins(term: str, retmax: int = 5) -> List[Dict]:
    """
    Search NCBI Protein database for protein sequences.

    Looks up matching IDs with ``ncbi_search`` and fetches their FASTA text
    in a single ``ncbi_fetch`` call.

    Args:
        term: Search term passed to ``ncbi_search``.
        retmax: Maximum number of IDs to request.

    Returns:
        One ``{"id": ..., "fasta": ...}`` dict per ID. When the response
        splits into exactly one FASTA record per ID, each dict carries only
        its own record; otherwise every dict carries the full response text.
        Returns ``[]`` when the search finds no IDs.
    """
    ids = ncbi_search("protein", term, retmax)
    if not ids:
        return []
    fasta_data = ncbi_fetch("protein", ids, rettype="fasta", retmode="text")
    records = _split_fasta_records(fasta_data)
    if len(records) != len(ids):
        # Records cannot be paired with IDs reliably; keep the whole text on
        # every entry rather than guess.
        return [{"id": pid, "fasta": fasta_data} for pid in ids]
    # NOTE(review): positional pairing assumes EFetch returns records in the
    # same order as the requested IDs - confirm against the E-utilities docs.
    return [{"id": pid, "fasta": record} for pid, record in zip(ids, records)]


def _split_fasta_records(fasta_text: str) -> List[str]:
    """Split multi-record FASTA text into one string per ``>`` header."""
    records = []
    current = []
    for line in fasta_text.splitlines():
        if line.startswith(">") and current:
            # A new header closes the record collected so far.
            records.append("\n".join(current))
            current = []
        if line.strip():
            current.append(line)
    if current:
        records.append("\n".join(current))
    return records
 # -------------------------
+# Build Cross-Database Profile
 # -------------------------
def ncbi_cross_profile(term: str) -> Dict:
    """
    Given a term, pull literature, genes, and proteins for unified output.

    Args:
        term: Search term passed unchanged to each lookup.

    Returns:
        A dict with the keys ``term``, ``timestamp`` (current UTC time as a
        naive ISO-8601 string), ``literature`` (up to 5 PubMed results),
        ``genes`` (up to 5 Gene results) and ``proteins`` (up to 2 Protein
        results).
    """
    # Local import: the module-level import only brings in ``datetime``.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Taking an aware UTC
    # time and dropping tzinfo yields the same naive ISO string as before.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "term": term,
        "timestamp": timestamp,
        "literature": search_pubmed(term, retmax=5),
        "genes": search_genes(term, retmax=5),
        "proteins": search_proteins(term, retmax=2),
    }