mgbam committed on
Commit
02711ba
·
verified ·
1 Parent(s): 79df878

Update genesis/api_clients/pubmed_api.py

Browse files
Files changed (1) hide show
  1. genesis/api_clients/pubmed_api.py +57 -42
genesis/api_clients/pubmed_api.py CHANGED
@@ -1,77 +1,92 @@
1
  # genesis/api_clients/pubmed_api.py
2
  import os
3
  import requests
4
- from typing import List, Dict, Optional
5
- from xml.etree import ElementTree as ET
6
 
7
- NCBI_API_KEY = os.getenv("NCBI_API_KEY") # Optional but increases rate limits
8
  NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
 
9
 
10
- def search_pubmed(query: str, max_results: int = 10) -> List[str]:
 
 
 
11
  """
12
- Search PubMed and return a list of PMIDs.
13
  """
14
  params = {
15
  "db": "pubmed",
16
  "term": query,
17
  "retmax": max_results,
18
- "api_key": NCBI_API_KEY
19
  }
20
  r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params)
21
  r.raise_for_status()
22
  root = ET.fromstring(r.text)
23
- return [id_tag.text for id_tag in root.findall(".//Id")]
24
 
25
- def fetch_pubmed_details(pmids: List[str]) -> List[Dict]:
26
  """
27
- Fetch detailed information for a list of PMIDs.
28
  """
29
- if not pmids:
30
  return []
31
-
32
  params = {
33
  "db": "pubmed",
34
- "id": ",".join(pmids),
35
  "retmode": "xml",
36
- "api_key": NCBI_API_KEY
37
  }
38
  r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params)
39
  r.raise_for_status()
40
-
41
- root = ET.fromstring(r.text)
42
- articles = []
43
 
 
 
44
  for article in root.findall(".//PubmedArticle"):
45
- title = article.findtext(".//ArticleTitle", default="No title")
46
- abstract = " ".join([t.text for t in article.findall(".//AbstractText") if t.text])
47
- journal = article.findtext(".//Title", default="Unknown Journal")
48
- pub_date = article.findtext(".//PubDate/Year", default="Unknown Year")
49
- doi = None
50
- for id_tag in article.findall(".//ArticleId"):
51
- if id_tag.attrib.get("IdType") == "doi":
52
- doi = id_tag.text
53
- authors = []
54
- for author in article.findall(".//Author"):
55
- last = author.findtext("LastName")
56
- fore = author.findtext("ForeName")
57
- if last and fore:
58
- authors.append(f"{fore} {last}")
59
- pmid = article.findtext(".//PMID")
60
  articles.append({
61
- "pmid": pmid,
62
- "title": title,
63
- "abstract": abstract,
64
- "journal": journal,
65
- "pub_date": pub_date,
66
- "doi": doi,
67
  "authors": authors,
68
- "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
69
  })
70
  return articles
71
 
72
- def search_and_fetch(query: str, max_results: int = 10) -> List[Dict]:
 
 
 
73
  """
74
- Convenience function: Search and fetch results in one step.
75
  """
76
- pmids = search_pubmed(query, max_results)
77
- return fetch_pubmed_details(pmids)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # genesis/api_clients/pubmed_api.py
2
  import os
3
  import requests
4
+ import xml.etree.ElementTree as ET
5
+ from typing import List, Dict
6
 
 
7
# Base URL shared by the NCBI E-utilities endpoints called in this module
# (esearch.fcgi and efetch.fcgi are appended to it).
NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Optional API key, set in Hugging Face / .env. PUBMED_API_KEY is preferred;
# NCBI_API_KEY is the name this module read before the rename and is still
# honoured so existing deployments keep their key. An empty value is treated
# as "not set" (None) so that no blank api_key parameter is sent.
PUBMED_API_KEY = os.getenv("PUBMED_API_KEY") or os.getenv("NCBI_API_KEY") or None
9
 
10
+ # -------------------------
11
+ # Core Helpers
12
+ # -------------------------
13
def pubmed_search(query: str, max_results: int = 10, timeout: float = 30.0) -> List[str]:
    """
    Search PubMed for a query and return a list of PubMed IDs.

    Args:
        query: Search expression, sent as the ``term`` parameter of esearch.
        max_results: Upper bound on returned IDs, sent as ``retmax``.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without it the call could block indefinitely on a stalled
            connection.

    Returns:
        The text of every non-empty ``<Id>`` element in the response, in
        document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        # requests leaves out parameters whose value is None, so an unset key
        # simply results in an unauthenticated request.
        "api_key": PUBMED_API_KEY,
    }
    response = requests.get(
        f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    root = ET.fromstring(response.text)
    # Skip empty <Id/> elements so the result really is a list of strings.
    return [id_elem.text for id_elem in root.findall(".//Id") if id_elem.text]
27
 
28
def fetch_details(pubmed_ids: List[str], timeout: float = 30.0) -> List[Dict]:
    """
    Fetch detailed metadata for PubMed articles.

    Args:
        pubmed_ids: PubMed IDs to look up. An empty list short-circuits to
            ``[]`` without any network traffic.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        One dict per ``<PubmedArticle>`` element in the efetch response, with
        the keys ``pmid``, ``title``, ``abstract``, ``authors`` and ``url``.
        Missing values are empty strings (or an empty list for ``authors``),
        never ``None``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout``.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    if not pubmed_ids:
        return []

    params = {
        "db": "pubmed",
        "id": ",".join(pubmed_ids),
        "retmode": "xml",
        # requests leaves out parameters whose value is None.
        "api_key": PUBMED_API_KEY,
    }
    response = requests.get(
        f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()

    root = ET.fromstring(response.text)
    return [_article_to_dict(article) for article in root.findall(".//PubmedArticle")]


def _element_text(elem) -> str:
    """Return all text inside ``elem`` (including nested inline tags), or ""."""
    if elem is None:
        return ""
    # itertext() also yields text that follows inline markup such as <i> or
    # <sub>; elem.text alone stops at the first child element.
    return "".join(elem.itertext()).strip()


def _author_names(article) -> List[str]:
    """Return "LastName ForeName" for every author that has both parts."""
    names = []
    for author in article.findall(".//Author"):
        last = _element_text(author.find("LastName"))
        fore = _element_text(author.find("ForeName"))
        # An element can exist but be empty; skipping those avoids entries
        # such as "None None".
        if last and fore:
            names.append(f"{last} {fore}")
    return names


def _article_to_dict(article) -> Dict:
    """Convert one ``<PubmedArticle>`` element into the result dict."""
    pmid = _element_text(article.find(".//PMID"))
    # Structured abstracts are split over several <AbstractText> sections;
    # keep all of them rather than only the first.
    sections = (
        _element_text(section)
        for section in article.findall(".//Abstract/AbstractText")
    )
    abstract = " ".join(text for text in sections if text)
    return {
        "pmid": pmid,
        "title": _element_text(article.find(".//ArticleTitle")),
        "abstract": abstract,
        "authors": _author_names(article),
        "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else "",
    }
64
 
65
+ # -------------------------
66
+ # High-Level Search + Fetch
67
+ # -------------------------
68
def search_pubmed(query: str, max_results: int = 10) -> List[Dict]:
    """
    Run a PubMed search and return structured metadata for the hits.

    Looks up the matching PubMed IDs with ``pubmed_search`` and passes them
    straight to ``fetch_details``.
    """
    return fetch_details(pubmed_search(query, max_results))
74
+
75
+ # -------------------------
76
+ # Cross-Domain Integration
77
+ # -------------------------
78
def literature_to_entities(query: str) -> Dict:
    """
    Collect PubMed articles together with related drugs, genes and ontology terms.

    The same query string is passed to ``search_pubmed`` and to the ChEMBL,
    NCBI and BioPortal client modules. The result dict has the keys
    ``query``, ``articles``, ``related_drugs``, ``related_genes`` and
    ``ontology_mappings``; the shape of the last three is whatever the
    respective client functions return.
    """
    # Imported inside the function (not at module level) to avoid import cycles.
    from genesis.api_clients import chembl_api, ncbi_api, bioportal_api

    articles = search_pubmed(query)
    related_drugs = chembl_api.search_molecule(query)
    related_genes = ncbi_api.search_gene(query)
    ontology_mappings = bioportal_api.search_and_map(query)

    return {
        "query": query,
        "articles": articles,
        "related_drugs": related_drugs,
        "related_genes": related_genes,
        "ontology_mappings": ontology_mappings,
    }