mgbam committed on
Commit
800d67f
·
verified ·
1 Parent(s): 2c00ea4

Update genesis/api_clients/ncbi_api.py

Browse files
Files changed (1) hide show
  1. genesis/api_clients/ncbi_api.py +97 -122
genesis/api_clients/ncbi_api.py CHANGED
@@ -1,178 +1,153 @@
1
  # genesis/api_clients/ncbi_api.py
2
  import os
3
  import requests
4
- import xml.etree.ElementTree as ET
5
  from typing import List, Dict
 
6
 
7
  NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
8
- NCBI_API_KEY = os.getenv("NCBI_API_KEY") # Optional, set in Hugging Face / .env
 
 
9
 
10
  # -------------------------
11
- # Gene Search
12
  # -------------------------
13
- def search_gene(query: str, max_results: int = 5) -> List[Dict]:
14
  """
15
- Search NCBI Gene for matching gene entries.
16
  """
17
  params = {
18
- "db": "gene",
19
- "term": query,
20
- "retmax": max_results,
21
- "api_key": NCBI_API_KEY
22
  }
23
- r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params)
24
- r.raise_for_status()
25
- ids = [elem.text for elem in ET.fromstring(r.text).findall(".//Id")]
26
-
27
- return fetch_gene_details(ids)
28
-
29
- def fetch_gene_details(gene_ids: List[str]) -> List[Dict]:
30
- """
31
- Fetch detailed information for NCBI Gene IDs.
32
- """
33
- if not gene_ids:
34
- return []
35
 
36
- params = {
37
- "db": "gene",
38
- "id": ",".join(gene_ids),
39
- "retmode": "xml",
40
- "api_key": NCBI_API_KEY
41
- }
42
- r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params)
43
  r.raise_for_status()
44
 
45
- genes = []
46
- root = ET.fromstring(r.text)
47
- for gene in root.findall(".//Entrezgene"):
48
- gene_id_elem = gene.find(".//Gene-track_geneid")
49
- gene_symbol_elem = gene.find(".//Gene-ref_locus")
50
- gene_desc_elem = gene.find(".//Gene-ref_desc")
51
-
52
- genes.append({
53
- "gene_id": gene_id_elem.text if gene_id_elem is not None else "",
54
- "symbol": gene_symbol_elem.text if gene_symbol_elem is not None else "",
55
- "description": gene_desc_elem.text if gene_desc_elem is not None else "",
56
- "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id_elem.text}" if gene_id_elem is not None else ""
57
- })
58
-
59
- return genes
60
 
61
  # -------------------------
62
- # Protein Search
63
  # -------------------------
64
- def search_protein(query: str, max_results: int = 5) -> List[Dict]:
65
  """
66
- Search NCBI Protein for matching entries.
67
  """
68
  params = {
69
- "db": "protein",
70
- "term": query,
71
- "retmax": max_results,
72
- "api_key": NCBI_API_KEY
73
  }
74
- r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params)
 
 
 
75
  r.raise_for_status()
76
- ids = [elem.text for elem in ET.fromstring(r.text).findall(".//Id")]
77
 
78
- return fetch_protein_details(ids)
79
 
80
- def fetch_protein_details(protein_ids: List[str]) -> List[Dict]:
 
 
 
81
  """
82
- Fetch detailed information for NCBI Protein IDs.
83
  """
84
- if not protein_ids:
 
85
  return []
86
 
87
  params = {
88
- "db": "protein",
89
- "id": ",".join(protein_ids),
90
- "retmode": "xml",
91
- "api_key": NCBI_API_KEY
92
  }
93
- r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params)
 
 
 
94
  r.raise_for_status()
95
 
96
- proteins = []
97
- root = ET.fromstring(r.text)
98
- for seq in root.findall(".//TSeq"):
99
- acc_elem = seq.find(".//TSeq_accver")
100
- def_elem = seq.find(".//TSeq_defline")
101
- len_elem = seq.find(".//TSeq_length")
102
-
103
- proteins.append({
104
- "accession": acc_elem.text if acc_elem is not None else "",
105
- "definition": def_elem.text if def_elem is not None else "",
106
- "length": len_elem.text if len_elem is not None else "",
107
- "url": f"https://www.ncbi.nlm.nih.gov/protein/{acc_elem.text}" if acc_elem is not None else ""
108
  })
109
-
110
- return proteins
111
 
112
  # -------------------------
113
- # Sequence Search (Nucleotide)
114
  # -------------------------
115
- def search_nucleotide(query: str, max_results: int = 5) -> List[Dict]:
116
  """
117
- Search NCBI Nucleotide for DNA/RNA sequences.
118
  """
 
 
 
 
119
  params = {
120
- "db": "nucleotide",
121
- "term": query,
122
- "retmax": max_results,
123
- "api_key": NCBI_API_KEY
124
  }
125
- r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params)
 
 
 
126
  r.raise_for_status()
127
- ids = [elem.text for elem in ET.fromstring(r.text).findall(".//Id")]
128
 
129
- return fetch_nucleotide_details(ids)
 
 
 
 
 
 
 
 
 
 
 
130
 
131
- def fetch_nucleotide_details(nuc_ids: List[str]) -> List[Dict]:
 
 
 
132
  """
133
- Fetch detailed information for NCBI Nucleotide IDs.
134
  """
135
- if not nuc_ids:
 
136
  return []
137
 
138
- params = {
139
- "db": "nucleotide",
140
- "id": ",".join(nuc_ids),
141
- "retmode": "xml",
142
- "api_key": NCBI_API_KEY
143
- }
144
- r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params)
145
- r.raise_for_status()
146
-
147
- sequences = []
148
- root = ET.fromstring(r.text)
149
- for seq in root.findall(".//TSeq"):
150
- acc_elem = seq.find(".//TSeq_accver")
151
- def_elem = seq.find(".//TSeq_defline")
152
- len_elem = seq.find(".//TSeq_length")
153
-
154
- sequences.append({
155
- "accession": acc_elem.text if acc_elem is not None else "",
156
- "definition": def_elem.text if def_elem is not None else "",
157
- "length": len_elem.text if len_elem is not None else "",
158
- "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{acc_elem.text}" if acc_elem is not None else ""
159
- })
160
-
161
- return sequences
162
 
163
  # -------------------------
164
- # Cross-Domain Integration
165
  # -------------------------
166
- def entity_context(query: str) -> Dict:
167
  """
168
- Return gene, protein, and sequence info linked to PubMed and ChEMBL.
169
  """
170
- from genesis.api_clients import pubmed_api, chembl_api # Lazy import to avoid cycles
171
-
172
  return {
173
- "genes": search_gene(query),
174
- "proteins": search_protein(query),
175
- "nucleotides": search_nucleotide(query),
176
- "literature": pubmed_api.search_pubmed(query),
177
- "related_drugs": chembl_api.search_molecule(query)
178
  }
 
1
  # genesis/api_clients/ncbi_api.py
2
  import os
3
  import requests
 
4
  from typing import List, Dict
5
+ from datetime import datetime
6
 
7
  NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
8
+ NCBI_API_KEY = os.getenv("NCBI_API_KEY") # Optional speeds up requests
9
+
10
+ session = requests.Session()
11
 
12
  # -------------------------
13
+ # Generic NCBI Search
14
  # -------------------------
15
+ def ncbi_search(db: str, term: str, retmax: int = 10) -> List[str]:
16
  """
17
+ Search an NCBI database and return a list of IDs.
18
  """
19
  params = {
20
+ "db": db,
21
+ "term": term,
22
+ "retmode": "json",
23
+ "retmax": retmax
24
  }
25
+ if NCBI_API_KEY:
26
+ params["api_key"] = NCBI_API_KEY
 
 
 
 
 
 
 
 
 
 
27
 
28
+ r = session.get(f"{NCBI_BASE}/esearch.fcgi", params=params)
 
 
 
 
 
 
29
  r.raise_for_status()
30
 
31
+ return r.json().get("esearchresult", {}).get("idlist", [])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  # -------------------------
34
+ # Generic NCBI Fetch
35
  # -------------------------
36
+ def ncbi_fetch(db: str, ids: List[str], rettype: str = "abstract", retmode: str = "text") -> str:
37
  """
38
+ Fetch detailed records from an NCBI database.
39
  """
40
  params = {
41
+ "db": db,
42
+ "id": ",".join(ids),
43
+ "rettype": rettype,
44
+ "retmode": retmode
45
  }
46
+ if NCBI_API_KEY:
47
+ params["api_key"] = NCBI_API_KEY
48
+
49
+ r = session.get(f"{NCBI_BASE}/efetch.fcgi", params=params)
50
  r.raise_for_status()
 
51
 
52
+ return r.text
53
 
54
+ # -------------------------
55
+ # PubMed Literature Search
56
+ # -------------------------
57
+ def search_pubmed(term: str, retmax: int = 5) -> List[Dict]:
58
  """
59
+ Search PubMed for biomedical literature.
60
  """
61
+ ids = ncbi_search("pubmed", term, retmax)
62
+ if not ids:
63
  return []
64
 
65
  params = {
66
+ "db": "pubmed",
67
+ "id": ",".join(ids),
68
+ "retmode": "json"
 
69
  }
70
+ if NCBI_API_KEY:
71
+ params["api_key"] = NCBI_API_KEY
72
+
73
+ r = session.get(f"{NCBI_BASE}/esummary.fcgi", params=params)
74
  r.raise_for_status()
75
 
76
+ records = r.json().get("result", {})
77
+ papers = []
78
+ for pid in ids:
79
+ rec = records.get(pid, {})
80
+ papers.append({
81
+ "title": rec.get("title"),
82
+ "authors": [a["name"] for a in rec.get("authors", [])],
83
+ "pubdate": rec.get("pubdate"),
84
+ "journal": rec.get("fulljournalname"),
85
+ "uid": pid,
86
+ "link": f"https://pubmed.ncbi.nlm.nih.gov/{pid}/"
 
87
  })
88
+ return papers
 
89
 
90
  # -------------------------
91
+ # Gene Search
92
  # -------------------------
93
+ def search_genes(term: str, retmax: int = 5) -> List[Dict]:
94
  """
95
+ Search NCBI Gene database for gene information.
96
  """
97
+ ids = ncbi_search("gene", term, retmax)
98
+ if not ids:
99
+ return []
100
+
101
  params = {
102
+ "db": "gene",
103
+ "id": ",".join(ids),
104
+ "retmode": "json"
 
105
  }
106
+ if NCBI_API_KEY:
107
+ params["api_key"] = NCBI_API_KEY
108
+
109
+ r = session.get(f"{NCBI_BASE}/esummary.fcgi", params=params)
110
  r.raise_for_status()
 
111
 
112
+ records = r.json().get("result", {})
113
+ genes = []
114
+ for gid in ids:
115
+ rec = records.get(gid, {})
116
+ genes.append({
117
+ "symbol": rec.get("name"),
118
+ "description": rec.get("description"),
119
+ "organism": rec.get("organism", {}).get("scientificname"),
120
+ "uid": gid,
121
+ "link": f"https://www.ncbi.nlm.nih.gov/gene/{gid}"
122
+ })
123
+ return genes
124
 
125
+ # -------------------------
126
+ # Protein Search
127
+ # -------------------------
128
+ def search_proteins(term: str, retmax: int = 5) -> List[Dict]:
129
  """
130
+ Search NCBI Protein database for protein sequences.
131
  """
132
+ ids = ncbi_search("protein", term, retmax)
133
+ if not ids:
134
  return []
135
 
136
+ fasta_data = ncbi_fetch("protein", ids, rettype="fasta", retmode="text")
137
+ proteins = [{"id": pid, "fasta": fasta_data} for pid in ids]
138
+ return proteins
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  # -------------------------
141
+ # Build Cross-Database Profile
142
  # -------------------------
143
+ def ncbi_cross_profile(term: str) -> Dict:
144
  """
145
+ Given a term, pull literature, genes, and proteins for unified output.
146
  """
 
 
147
  return {
148
+ "term": term,
149
+ "timestamp": datetime.utcnow().isoformat(),
150
+ "literature": search_pubmed(term, retmax=5),
151
+ "genes": search_genes(term, retmax=5),
152
+ "proteins": search_proteins(term, retmax=2)
153
  }