mgbam committed on
Commit
026bd38
·
verified ·
1 Parent(s): 53ae019

Update genesis/api_clients/pubmed_api.py

Browse files
Files changed (1) hide show
  1. genesis/api_clients/pubmed_api.py +38 -51
genesis/api_clients/pubmed_api.py CHANGED
@@ -1,90 +1,77 @@
1
  # genesis/api_clients/pubmed_api.py
2
- import requests
3
  import os
4
- from typing import List, Dict
5
-
6
# Optional: NCBI API key for higher rate limits (None when the variable is unset)
NCBI_API_KEY = os.environ.get("NCBI_API_KEY")

# NCBI E-utilities endpoints used by this module
PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
PUBMED_SUMMARY_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
13
 
14
  def search_pubmed(query: str, max_results: int = 10) -> List[str]:
15
  """
16
- Search PubMed for a query and return a list of PubMed IDs (PMIDs).
17
  """
18
  params = {
19
  "db": "pubmed",
20
  "term": query,
21
- "retmode": "json",
22
- "retmax": max_results
23
  }
24
- if NCBI_API_KEY:
25
- params["api_key"] = NCBI_API_KEY
26
-
27
- r = requests.get(PUBMED_SEARCH_URL, params=params)
28
  r.raise_for_status()
29
- data = r.json()
30
- return data.get("esearchresult", {}).get("idlist", [])
31
-
32
 
33
  def fetch_pubmed_details(pmids: List[str]) -> List[Dict]:
34
  """
35
- Fetch details (title, authors, journal, abstract) for a list of PMIDs.
36
  """
37
  if not pmids:
38
  return []
39
-
40
  params = {
41
  "db": "pubmed",
42
  "id": ",".join(pmids),
43
- "retmode": "xml"
 
44
  }
45
- if NCBI_API_KEY:
46
- params["api_key"] = NCBI_API_KEY
47
-
48
- r = requests.get(PUBMED_FETCH_URL, params=params)
49
  r.raise_for_status()
50
- return parse_pubmed_xml(r.text)
51
-
52
-
53
def parse_pubmed_xml(xml_text: str) -> List[Dict]:
    """
    Parse PubMed XML into structured data.

    Args:
        xml_text: XML document text containing zero or more
            ``PubmedArticle`` elements.

    Returns:
        One dict per ``PubmedArticle`` with the keys ``pmid``, ``title``,
        ``abstract``, ``journal`` and ``authors`` (a list of display names).
        Text fields that are missing from the record are empty strings.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
    """
    import xml.etree.ElementTree as ET

    root = ET.fromstring(xml_text)
    articles = []

    for article in root.findall(".//PubmedArticle"):
        title = _flatten_text(article.find(".//ArticleTitle"))

        # An abstract may be split into several AbstractText sections; keep
        # the non-empty ones and join them with single spaces.
        sections = (_flatten_text(node) for node in article.findall(".//AbstractText"))
        abstract_text = " ".join(section for section in sections if section)

        journal = article.findtext(".//Journal/Title", default="")

        authors = []
        for author in article.findall(".//Author"):
            fore = (author.findtext("ForeName", "") or "").strip()
            last = (author.findtext("LastName", "") or "").strip()
            # Join only the parts that exist so a missing fore or last name
            # does not leave a stray leading or trailing space.
            name = " ".join(part for part in (fore, last) if part)
            if name:
                authors.append(name)

        pmid = article.findtext(".//PMID", default="")

        articles.append({
            "pmid": pmid,
            "title": title,
            "abstract": abstract_text,
            "journal": journal,
            "authors": authors,
        })

    return articles


def _flatten_text(node) -> str:
    """Return all text inside *node*, including text nested in inline markup."""
    if node is None:
        return ""
    # Element.text stops at the first child element (e.g. <sub>, <i>);
    # itertext() walks the whole subtree so nothing is truncated.
    return "".join(node.itertext()).strip()
83
 
84
-
85
def pubmed_summary(query: str, max_results: int = 10) -> List[Dict]:
    """
    Run a PubMed search and fetch the details of its hits in one call.

    The query and result limit are passed to ``search_pubmed``; the PMIDs it
    returns are handed straight to ``fetch_pubmed_details``.
    """
    return fetch_pubmed_details(search_pubmed(query, max_results))
 
1
  # genesis/api_clients/pubmed_api.py
 
2
  import os
3
+ import requests
4
+ from typing import List, Dict, Optional
5
+ from xml.etree import ElementTree as ET
 
 
 
6
 
7
# Common base path for the NCBI E-utilities endpoints used below.
NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# Optional but increases rate limits; None when the variable is unset.
NCBI_API_KEY = os.environ.get("NCBI_API_KEY")
9
 
10
  def search_pubmed(query: str, max_results: int = 10) -> List[str]:
11
  """
12
+ Search PubMed and return a list of PMIDs.
13
  """
14
  params = {
15
  "db": "pubmed",
16
  "term": query,
17
+ "retmax": max_results,
18
+ "api_key": NCBI_API_KEY
19
  }
20
+ r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params)
 
 
 
21
  r.raise_for_status()
22
+ root = ET.fromstring(r.text)
23
+ return [id_tag.text for id_tag in root.findall(".//Id")]
 
24
 
25
  def fetch_pubmed_details(pmids: List[str]) -> List[Dict]:
26
  """
27
+ Fetch detailed information for a list of PMIDs.
28
  """
29
  if not pmids:
30
  return []
31
+
32
  params = {
33
  "db": "pubmed",
34
  "id": ",".join(pmids),
35
+ "retmode": "xml",
36
+ "api_key": NCBI_API_KEY
37
  }
38
+ r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params)
 
 
 
39
  r.raise_for_status()
 
 
 
 
 
 
 
 
 
 
40
 
41
+ root = ET.fromstring(r.text)
42
+ articles = []
43
+
44
  for article in root.findall(".//PubmedArticle"):
45
+ title = article.findtext(".//ArticleTitle", default="No title")
46
+ abstract = " ".join([t.text for t in article.findall(".//AbstractText") if t.text])
47
+ journal = article.findtext(".//Title", default="Unknown Journal")
48
+ pub_date = article.findtext(".//PubDate/Year", default="Unknown Year")
49
+ doi = None
50
+ for id_tag in article.findall(".//ArticleId"):
51
+ if id_tag.attrib.get("IdType") == "doi":
52
+ doi = id_tag.text
53
  authors = []
54
  for author in article.findall(".//Author"):
55
+ last = author.findtext("LastName")
56
+ fore = author.findtext("ForeName")
57
+ if last and fore:
58
  authors.append(f"{fore} {last}")
59
+ pmid = article.findtext(".//PMID")
 
 
60
  articles.append({
61
  "pmid": pmid,
62
  "title": title,
63
+ "abstract": abstract,
64
  "journal": journal,
65
+ "pub_date": pub_date,
66
+ "doi": doi,
67
+ "authors": authors,
68
+ "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
69
  })
 
70
  return articles
71
 
72
def search_and_fetch(query: str, max_results: int = 10) -> List[Dict]:
    """
    Convenience wrapper that searches and then fetches in a single call.

    The query and result limit are passed to ``search_pubmed``; the PMIDs it
    returns are handed straight to ``fetch_pubmed_details``.
    """
    return fetch_pubmed_details(search_pubmed(query, max_results))