mgbam committed on
Commit
eea1b53
·
verified ·
1 Parent(s): 02711ba

Update genesis/api_clients/ncbi_api.py

Browse files
Files changed (1) hide show
  1. genesis/api_clients/ncbi_api.py +143 -85
genesis/api_clients/ncbi_api.py CHANGED
@@ -1,120 +1,178 @@
1
  # genesis/api_clients/ncbi_api.py
2
  import os
3
  import requests
4
- from typing import Dict, List, Optional
 
5
 
6
- NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
7
- NCBI_API_KEY = os.getenv("NCBI_API_KEY") # Optional for higher rate limits
8
 
9
  # -------------------------
10
- # Core Utilities
11
  # -------------------------
12
def ncbi_request(endpoint: str, params: Dict, timeout: float = 30.0) -> requests.Response:
    """
    Send a GET request to an NCBI E-utilities endpoint.

    Args:
        endpoint: Script name appended to ``NCBI_BASE`` (e.g. ``"esearch.fcgi"``).
        params: Query-string parameters. The mapping is copied, never mutated.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` of a successful call.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # Work on a copy so the optional API key never leaks into the caller's dict.
    query = dict(params)
    if NCBI_API_KEY:
        query["api_key"] = NCBI_API_KEY
    # Without an explicit timeout, requests can wait forever on a stalled connection.
    response = requests.get(f"{NCBI_BASE}{endpoint}", params=query, timeout=timeout)
    response.raise_for_status()
    return response
22
-
23
- # -------------------------
24
- # Search Functions
25
- # -------------------------
26
def search_pubmed(query: str, max_results: int = 10) -> List[str]:
    """
    Run an ESearch query against PubMed.

    Returns the list stored under ``esearchresult.idlist`` in the JSON reply,
    or an empty list when that key is absent.
    """
    search_params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
    }
    payload = ncbi_request("esearch.fcgi", search_params).json()
    search_result = payload.get("esearchresult", {})
    return search_result.get("idlist", [])
 
 
 
 
37
 
38
def search_gene(query: str, organism: Optional[str] = None, max_results: int = 10) -> List[str]:
    """
    Run an ESearch query against the NCBI Gene database.

    When ``organism`` is given, the term is narrowed with an
    ``AND <organism>[Organism]`` clause. Returns the list stored under
    ``esearchresult.idlist``, or an empty list when that key is absent.
    """
    # Only append the organism filter when one was supplied.
    search_term = f"{query} AND {organism}[Organism]" if organism else query
    search_params = {
        "db": "gene",
        "term": search_term,
        "retmax": max_results,
        "retmode": "json",
    }
    payload = ncbi_request("esearch.fcgi", search_params).json()
    search_result = payload.get("esearchresult", {})
    return search_result.get("idlist", [])
 
 
 
 
 
 
 
 
52
 
53
def search_protein(query: str, organism: Optional[str] = None, max_results: int = 10) -> List[str]:
    """
    Run an ESearch query against the NCBI Protein database.

    When ``organism`` is given, the term is narrowed with an
    ``AND <organism>[Organism]`` clause. Returns the list stored under
    ``esearchresult.idlist``, or an empty list when that key is absent.
    """
    # Only append the organism filter when one was supplied.
    search_term = f"{query} AND {organism}[Organism]" if organism else query
    search_params = {
        "db": "protein",
        "term": search_term,
        "retmax": max_results,
        "retmode": "json",
    }
    payload = ncbi_request("esearch.fcgi", search_params).json()
    search_result = payload.get("esearchresult", {})
    return search_result.get("idlist", [])
 
 
67
 
68
- # -------------------------
69
- # Fetch Functions
70
- # -------------------------
71
def fetch_summary(db: str, ids: List[str]) -> List[Dict]:
    """
    Fetch ESummary records for the given IDs from any NCBI database.

    Args:
        db: Entrez database name (e.g. ``"gene"``, ``"protein"``).
        ids: Record identifiers; an empty list short-circuits to ``[]``
            without contacting the server.

    Returns:
        One dict per record found in the JSON ``result`` object, in the order
        given by its ``"uids"`` list when that list is present.
    """
    if not ids:
        return []
    response = ncbi_request("esummary.fcgi", {
        "db": db,
        "id": ",".join(ids),
        "retmode": "json",
    })
    result = response.json().get("result", {})
    # "result" mixes a "uids" index list with the per-ID records. Returning
    # every value would hand callers that list as if it were a record.
    uids = result.get("uids") or [key for key in result if key != "uids"]
    return [result[uid] for uid in uids if isinstance(result.get(uid), dict)]
83
-
84
def fetch_fasta(db: str, ids: List[str]) -> Dict[str, str]:
    """
    Fetch FASTA records from NCBI (protein or nucleotide).

    Args:
        db: Entrez database name.
        ids: Record identifiers; an empty list short-circuits to ``{}``
            without contacting the server.

    Returns:
        A mapping from each requested ID to its record text: the defline
        without the leading ``>``, followed by the sequence lines, joined by
        newlines. Blank lines between records are dropped.
    """
    if not ids:
        return {}
    response = ncbi_request("efetch.fcgi", {
        "db": db,
        "id": ",".join(ids),
        "rettype": "fasta",
        "retmode": "text",
    })
    # A record starts only where ">" opens a line. Splitting on every ">"
    # would cut a defline in two whenever its description contains that character.
    records: List[List[str]] = []
    for line in response.text.splitlines():
        if line.startswith(">"):
            records.append([line[1:]])
        elif records and line.strip():
            records[-1].append(line)
    # NOTE(review): pairing is positional and assumes records come back in
    # request order - confirm. zip() avoids an IndexError when the server
    # returns more records than IDs were requested.
    return {seq_id: "\n".join(record) for seq_id, record in zip(ids, records)}
 
 
 
 
 
97
 
98
  # -------------------------
99
- # Specialized Functions
100
  # -------------------------
101
def get_gene_info(gene_id: str) -> Dict:
    """
    Get the ESummary record for a single NCBI Gene ID.

    Returns the first record dict produced by ``fetch_summary("gene", ...)``,
    or ``{}`` when there is none.
    """
    summaries = fetch_summary("gene", [gene_id])
    # Skip non-dict entries (e.g. the ESummary "uids" index list) so the
    # annotated Dict return type always holds.
    return next((entry for entry in summaries if isinstance(entry, dict)), {})
 
 
 
 
 
 
 
 
 
107
 
108
def get_protein_info(protein_id: str) -> Dict:
    """
    Get the ESummary record for a single NCBI Protein ID.

    Returns the first record dict produced by ``fetch_summary("protein", ...)``,
    or ``{}`` when there is none.
    """
    summaries = fetch_summary("protein", [protein_id])
    # Skip non-dict entries (e.g. the ESummary "uids" index list) so the
    # annotated Dict return type always holds.
    return next((entry for entry in summaries if isinstance(entry, dict)), {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
def get_taxonomy_info(tax_id: str) -> Dict:
    """
    Get the ESummary record for a single NCBI Taxonomy ID.

    Returns the first record dict produced by ``fetch_summary("taxonomy", ...)``,
    or ``{}`` when there is none.
    """
    summaries = fetch_summary("taxonomy", [tax_id])
    # Skip non-dict entries (e.g. the ESummary "uids" index list) so the
    # annotated Dict return type always holds.
    return next((entry for entry in summaries if isinstance(entry, dict)), {})
 
 
 
 
 
 
 
 
1
  # genesis/api_clients/ncbi_api.py
2
  import os
3
  import requests
4
+ import xml.etree.ElementTree as ET
5
+ from typing import List, Dict
6
 
7
+ NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
8
+ NCBI_API_KEY = os.getenv("NCBI_API_KEY") # Optional, set in Hugging Face / .env
9
 
10
  # -------------------------
11
+ # Gene Search
12
  # -------------------------
13
def search_gene(query: str, max_results: int = 5, timeout: float = 30.0) -> List[Dict]:
    """
    Search NCBI Gene for matching gene entries.

    Args:
        query: Entrez search term.
        max_results: Value sent as ``retmax`` to ESearch.
        timeout: Seconds to wait for the ESearch response.

    Returns:
        Whatever ``fetch_gene_details`` returns for the IDs that ESearch reports.

    Raises:
        requests.HTTPError: If ESearch answers with an error status.
        requests.Timeout: If ESearch does not answer within ``timeout`` seconds.
    """
    params = {"db": "gene", "term": query, "retmax": max_results}
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout, requests can wait forever on a stalled connection.
    r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    # Parse the raw bytes so the XML declaration, not HTTP charset guessing,
    # decides the encoding.
    root = ET.fromstring(r.content)
    ids = [elem.text for elem in root.findall(".//Id") if elem.text]

    return fetch_gene_details(ids)
28
 
29
def fetch_gene_details(gene_ids: List[str], timeout: float = 30.0) -> List[Dict]:
    """
    Fetch detailed information for NCBI Gene IDs.

    Args:
        gene_ids: Gene identifiers; an empty list short-circuits to ``[]``
            without contacting the server.
        timeout: Seconds to wait for the EFetch response.

    Returns:
        One dict per ``Entrezgene`` element with the string keys ``gene_id``,
        ``symbol``, ``description`` and ``url``. A field whose element is
        missing or empty is returned as ``""``.

    Raises:
        requests.HTTPError: If EFetch answers with an error status.
        requests.Timeout: If EFetch does not answer within ``timeout`` seconds.
    """
    if not gene_ids:
        return []

    params = {"db": "gene", "id": ",".join(gene_ids), "retmode": "xml"}
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout, requests can wait forever on a stalled connection.
    r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()

    genes = []
    # Parse the raw bytes so the XML declaration decides the encoding.
    for gene in ET.fromstring(r.content).findall(".//Entrezgene"):
        # findtext() returns None for a missing element and may return None/""
        # for an empty one. Normalise both to "" so no ".../gene/None" URL is built.
        gene_id = gene.findtext(".//Gene-track_geneid") or ""
        genes.append({
            "gene_id": gene_id,
            "symbol": gene.findtext(".//Gene-ref_locus") or "",
            "description": gene.findtext(".//Gene-ref_desc") or "",
            "url": f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}" if gene_id else "",
        })

    return genes
60
+
61
+ # -------------------------
62
+ # Protein Search
63
+ # -------------------------
64
def search_protein(query: str, max_results: int = 5, timeout: float = 30.0) -> List[Dict]:
    """
    Search NCBI Protein for matching entries.

    Args:
        query: Entrez search term.
        max_results: Value sent as ``retmax`` to ESearch.
        timeout: Seconds to wait for the ESearch response.

    Returns:
        Whatever ``fetch_protein_details`` returns for the IDs that ESearch reports.

    Raises:
        requests.HTTPError: If ESearch answers with an error status.
        requests.Timeout: If ESearch does not answer within ``timeout`` seconds.
    """
    params = {"db": "protein", "term": query, "retmax": max_results}
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout, requests can wait forever on a stalled connection.
    r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    # Parse the raw bytes so the XML declaration, not HTTP charset guessing,
    # decides the encoding.
    root = ET.fromstring(r.content)
    ids = [elem.text for elem in root.findall(".//Id") if elem.text]

    return fetch_protein_details(ids)
79
+
80
def fetch_protein_details(protein_ids: List[str], timeout: float = 30.0) -> List[Dict]:
    """
    Fetch detailed information for NCBI Protein IDs.

    Args:
        protein_ids: Protein identifiers; an empty list short-circuits to
            ``[]`` without contacting the server.
        timeout: Seconds to wait for the EFetch response.

    Returns:
        One dict per ``TSeq`` element with the string keys ``accession``,
        ``definition``, ``length`` and ``url``. A field whose element is
        missing or empty is returned as ``""``.

    Raises:
        requests.HTTPError: If EFetch answers with an error status.
        requests.Timeout: If EFetch does not answer within ``timeout`` seconds.
    """
    if not protein_ids:
        return []

    params = {
        "db": "protein",
        "id": ",".join(protein_ids),
        # The TSeq elements parsed below are the TinySeq XML format, which
        # EFetch emits only for rettype=fasta combined with retmode=xml.
        "rettype": "fasta",
        "retmode": "xml",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout, requests can wait forever on a stalled connection.
    r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()

    proteins = []
    # Parse the raw bytes so the XML declaration decides the encoding.
    for seq in ET.fromstring(r.content).findall(".//TSeq"):
        # Normalise missing/empty elements to "" so no ".../protein/None" URL is built.
        accession = seq.findtext(".//TSeq_accver") or ""
        proteins.append({
            "accession": accession,
            "definition": seq.findtext(".//TSeq_defline") or "",
            "length": seq.findtext(".//TSeq_length") or "",
            "url": f"https://www.ncbi.nlm.nih.gov/protein/{accession}" if accession else "",
        })

    return proteins
111
 
112
  # -------------------------
113
+ # Sequence Search (Nucleotide)
114
  # -------------------------
115
def search_nucleotide(query: str, max_results: int = 5, timeout: float = 30.0) -> List[Dict]:
    """
    Search NCBI Nucleotide for DNA/RNA sequences.

    Args:
        query: Entrez search term.
        max_results: Value sent as ``retmax`` to ESearch.
        timeout: Seconds to wait for the ESearch response.

    Returns:
        Whatever ``fetch_nucleotide_details`` returns for the IDs that ESearch reports.

    Raises:
        requests.HTTPError: If ESearch answers with an error status.
        requests.Timeout: If ESearch does not answer within ``timeout`` seconds.
    """
    params = {"db": "nucleotide", "term": query, "retmax": max_results}
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout, requests can wait forever on a stalled connection.
    r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    # Parse the raw bytes so the XML declaration, not HTTP charset guessing,
    # decides the encoding.
    root = ET.fromstring(r.content)
    ids = [elem.text for elem in root.findall(".//Id") if elem.text]

    return fetch_nucleotide_details(ids)
130
 
131
def fetch_nucleotide_details(nuc_ids: List[str], timeout: float = 30.0) -> List[Dict]:
    """
    Fetch detailed information for NCBI Nucleotide IDs.

    Args:
        nuc_ids: Nucleotide identifiers; an empty list short-circuits to
            ``[]`` without contacting the server.
        timeout: Seconds to wait for the EFetch response.

    Returns:
        One dict per ``TSeq`` element with the string keys ``accession``,
        ``definition``, ``length`` and ``url``. A field whose element is
        missing or empty is returned as ``""``.

    Raises:
        requests.HTTPError: If EFetch answers with an error status.
        requests.Timeout: If EFetch does not answer within ``timeout`` seconds.
    """
    if not nuc_ids:
        return []

    params = {
        "db": "nucleotide",
        "id": ",".join(nuc_ids),
        # The TSeq elements parsed below are the TinySeq XML format, which
        # EFetch emits only for rettype=fasta combined with retmode=xml.
        "rettype": "fasta",
        "retmode": "xml",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY
    # Without an explicit timeout, requests can wait forever on a stalled connection.
    r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()

    sequences = []
    # Parse the raw bytes so the XML declaration decides the encoding.
    for seq in ET.fromstring(r.content).findall(".//TSeq"):
        # Normalise missing/empty elements to "" so no ".../nuccore/None" URL is built.
        accession = seq.findtext(".//TSeq_accver") or ""
        sequences.append({
            "accession": accession,
            "definition": seq.findtext(".//TSeq_defline") or "",
            "length": seq.findtext(".//TSeq_length") or "",
            "url": f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" if accession else "",
        })

    return sequences
162
+
163
+ # -------------------------
164
+ # Cross-Domain Integration
165
+ # -------------------------
166
def entity_context(query: str) -> Dict:
    """
    Collect gene, protein, nucleotide, PubMed and ChEMBL results for one query.

    The same ``query`` string is passed to each of the five searches, and the
    results are returned under the keys ``genes``, ``proteins``,
    ``nucleotides``, ``literature`` and ``related_drugs``.
    """
    # Imported at call time rather than module load to avoid import cycles.
    from genesis.api_clients import pubmed_api, chembl_api

    # Searches run one after another, in the same order as the keys below.
    context: Dict = {}
    context["genes"] = search_gene(query)
    context["proteins"] = search_protein(query)
    context["nucleotides"] = search_nucleotide(query)
    context["literature"] = pubmed_api.search_pubmed(query)
    context["related_drugs"] = chembl_api.search_molecule(query)
    return context