mgbam committed on
Commit
ee653fa
·
verified ·
1 Parent(s): ae6d61e

Update genesis/api_clients/ncbi_api.py

Browse files
Files changed (1) hide show
  1. genesis/api_clients/ncbi_api.py +60 -91
genesis/api_clients/ncbi_api.py CHANGED
@@ -1,28 +1,28 @@
1
  # genesis/api_clients/ncbi_api.py
2
  import os
3
  import requests
4
- from xml.etree import ElementTree as ET
5
 
6
- NCBI_API_KEY = os.getenv("NCBI_API_KEY")
7
  NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
8
 
9
- def _add_api_key(params: dict):
10
- """
11
- Adds API key to request params if available.
12
- """
13
  if NCBI_API_KEY:
14
  params["api_key"] = NCBI_API_KEY
15
  return params
16
 
17
-
18
- def search_pubmed(query: str, max_results: int = 10):
 
 
19
  """
20
- Search PubMed via NCBI Entrez for a query string.
21
- Returns a list of PubMed IDs (PMIDs).
22
  """
23
  params = _add_api_key({
24
- "db": "pubmed",
25
- "term": query,
26
  "retmax": max_results,
27
  "retmode": "json"
28
  })
@@ -31,107 +31,76 @@ def search_pubmed(query: str, max_results: int = 10):
31
  data = r.json()
32
  return data.get("esearchresult", {}).get("idlist", [])
33
 
34
-
35
- def fetch_pubmed_details(pmids: list):
36
  """
37
- Fetch abstracts, authors, and metadata for given PMIDs.
38
  """
39
- if not pmids:
40
- return []
41
-
42
  params = _add_api_key({
43
- "db": "pubmed",
44
- "id": ",".join(pmids),
45
- "retmode": "xml"
46
  })
47
- r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params)
48
  r.raise_for_status()
 
 
 
 
 
 
49
 
50
- root = ET.fromstring(r.text)
51
- results = []
52
-
53
- for article in root.findall(".//PubmedArticle"):
54
- title = article.findtext(".//ArticleTitle", default="")
55
- abstract = " ".join([abst.text or "" for abst in article.findall(".//AbstractText")])
56
- authors = [
57
- f"{a.findtext('ForeName', '')} {a.findtext('LastName', '')}".strip()
58
- for a in article.findall(".//Author")
59
- ]
60
- pmid = article.findtext(".//PMID", default="")
61
-
62
- results.append({
63
- "pmid": pmid,
64
- "title": title,
65
- "abstract": abstract,
66
- "authors": authors
67
- })
68
-
69
- return results
70
-
71
-
72
- def fetch_gene_info(gene_id: str):
73
  """
74
- Fetch gene metadata from NCBI Gene database.
75
  """
76
  params = _add_api_key({
77
- "db": "gene",
78
- "id": gene_id,
79
- "retmode": "xml"
80
  })
81
  r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params)
82
  r.raise_for_status()
83
- return r.text # XML - downstream parsing can be done in pipeline
84
-
85
 
86
- def search_gene_by_symbol(symbol: str, organism: str = None):
 
 
 
87
  """
88
- Search for a gene by symbol, optionally filtered by organism.
89
  """
90
- term = symbol
91
- if organism:
92
- term += f" AND {organism}[Organism]"
93
-
94
- params = _add_api_key({
95
- "db": "gene",
96
- "term": term,
97
- "retmax": 5,
98
- "retmode": "json"
99
- })
100
- r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params)
101
- r.raise_for_status()
102
- data = r.json()
103
- return data.get("esearchresult", {}).get("idlist", [])
104
-
105
 
106
- def fetch_protein_info(protein_id: str):
107
  """
108
- Fetch protein metadata from NCBI Protein database.
109
  """
110
- params = _add_api_key({
 
111
  "db": "protein",
112
- "id": protein_id,
113
- "retmode": "xml"
114
  })
115
- r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params)
116
  r.raise_for_status()
117
- return r.text
118
 
 
 
 
 
 
 
 
 
119
 
120
- def search_protein_by_name(name: str, organism: str = None):
121
  """
122
- Search for proteins by name, optionally filtered by organism.
123
  """
124
- term = name
125
- if organism:
126
- term += f" AND {organism}[Organism]"
127
-
128
- params = _add_api_key({
129
- "db": "protein",
130
- "term": term,
131
- "retmax": 5,
132
- "retmode": "json"
133
- })
134
- r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params)
135
- r.raise_for_status()
136
- data = r.json()
137
- return data.get("esearchresult", {}).get("idlist", [])
 
1
  # genesis/api_clients/ncbi_api.py
2
  import os
3
  import requests
4
+ from typing import List, Dict, Optional
5
 
6
+ NCBI_API_KEY = os.getenv("NCBI_API_KEY") # Optional, for higher request limits
7
  NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
8
 
9
+ def _add_api_key(params: Dict) -> Dict:
10
+ """Attach API key if available."""
 
 
11
  if NCBI_API_KEY:
12
  params["api_key"] = NCBI_API_KEY
13
  return params
14
 
15
+ # -------------------------
16
+ # SEARCH FUNCTIONS
17
+ # -------------------------
18
+ def search_ncbi(db: str, term: str, max_results: int = 10) -> List[str]:
19
  """
20
+ Search an NCBI database and return a list of IDs.
21
+ db examples: gene, protein, pubmed, taxonomy
22
  """
23
  params = _add_api_key({
24
+ "db": db,
25
+ "term": term,
26
  "retmax": max_results,
27
  "retmode": "json"
28
  })
 
31
  data = r.json()
32
  return data.get("esearchresult", {}).get("idlist", [])
33
 
34
+ def fetch_ncbi_summary(db: str, ids: List[str]) -> List[Dict]:
 
35
  """
36
+ Fetch summaries for a list of IDs from NCBI.
37
  """
 
 
 
38
  params = _add_api_key({
39
+ "db": db,
40
+ "id": ",".join(ids),
41
+ "retmode": "json"
42
  })
43
+ r = requests.get(f"{NCBI_BASE}/esummary.fcgi", params=params)
44
  r.raise_for_status()
45
+ data = r.json()
46
+ summaries = []
47
+ for uid, summary in data.get("result", {}).items():
48
+ if uid != "uids":
49
+ summaries.append(summary)
50
+ return summaries
51
 
52
+ def fetch_ncbi_details(db: str, ids: List[str]) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  """
54
+ Fetch full XML/FASTA/GenBank record for IDs.
55
  """
56
  params = _add_api_key({
57
+ "db": db,
58
+ "id": ",".join(ids),
59
+ "retmode": "text"
60
  })
61
  r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params)
62
  r.raise_for_status()
63
+ return r.text
 
64
 
65
+ # -------------------------
66
+ # GENE + PATHWAY HELPERS
67
+ # -------------------------
68
+ def search_gene(term: str, max_results: int = 10) -> List[Dict]:
69
  """
70
+ Search for genes and return gene IDs + names.
71
  """
72
+ ids = search_ncbi("gene", term, max_results)
73
+ if not ids:
74
+ return []
75
+ summaries = fetch_ncbi_summary("gene", ids)
76
+ return [{"uid": s.get("uid"), "name": s.get("name"), "description": s.get("description")} for s in summaries]
 
 
 
 
 
 
 
 
 
 
77
 
78
+ def get_protein_from_gene(gene_id: str) -> List[Dict]:
79
  """
80
+ Get protein products from a given gene ID.
81
  """
82
+ link_params = _add_api_key({
83
+ "dbfrom": "gene",
84
  "db": "protein",
85
+ "id": gene_id,
86
+ "retmode": "json"
87
  })
88
+ r = requests.get(f"{NCBI_BASE}/elink.fcgi", params=link_params)
89
  r.raise_for_status()
90
+ data = r.json()
91
 
92
+ protein_ids = []
93
+ for linkset in data.get("linksets", []):
94
+ for link in linkset.get("linksetdbs", []):
95
+ protein_ids.extend(link.get("links", []))
96
+
97
+ if not protein_ids:
98
+ return []
99
+ return fetch_ncbi_summary("protein", protein_ids)
100
 
101
+ def search_taxonomy(term: str) -> List[Dict]:
102
  """
103
+ Search taxonomy database for species/strain info.
104
  """
105
+ ids = search_ncbi("taxonomy", term, max_results=5)
106
+ return fetch_ncbi_summary("taxonomy", ids)