mgbam committed on
Commit
ec0d077
·
verified ·
1 Parent(s): 27cd148

Update genesis/api_clients/pubmed_api.py

Browse files
Files changed (1) hide show
  1. genesis/api_clients/pubmed_api.py +63 -35
genesis/api_clients/pubmed_api.py CHANGED
@@ -1,31 +1,33 @@
1
  # genesis/api_clients/pubmed_api.py
2
  import requests
3
- from xml.etree import ElementTree as ET
 
4
 
5
- NCBI_EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
 
6
 
7
- def search_pubmed(query: str, max_results: int = 10, api_key: str = None):
 
8
  """
9
- Search PubMed for a given query and return a list of PMIDs.
10
  """
11
  params = {
12
  "db": "pubmed",
13
  "term": query,
14
- "retmode": "json",
15
  "retmax": max_results,
 
 
 
16
  }
17
- if api_key:
18
- params["api_key"] = api_key
19
-
20
- response = requests.get(f"{NCBI_EUTILS_BASE}/esearch.fcgi", params=params)
21
  response.raise_for_status()
22
  data = response.json()
23
  return data.get("esearchresult", {}).get("idlist", [])
24
 
25
 
26
- def fetch_pubmed_details(pmids: list, api_key: str = None):
27
  """
28
- Fetch detailed article data for given PMIDs.
29
  """
30
  if not pmids:
31
  return []
@@ -33,42 +35,68 @@ def fetch_pubmed_details(pmids: list, api_key: str = None):
33
  params = {
34
  "db": "pubmed",
35
  "id": ",".join(pmids),
36
- "retmode": "xml"
 
37
  }
38
- if api_key:
39
- params["api_key"] = api_key
40
-
41
- response = requests.get(f"{NCBI_EUTILS_BASE}/efetch.fcgi", params=params)
42
  response.raise_for_status()
43
- return parse_pubmed_xml(response.text)
44
-
45
 
46
- def parse_pubmed_xml(xml_text: str):
47
- """
48
- Parse PubMed XML into structured dicts.
49
- """
50
- root = ET.fromstring(xml_text)
51
  articles = []
52
 
53
  for article in root.findall(".//PubmedArticle"):
54
- title_el = article.find(".//ArticleTitle")
55
- abstract_el = article.find(".//AbstractText")
56
- journal_el = article.find(".//Journal/Title")
57
- year_el = article.find(".//PubDate/Year")
 
 
 
58
 
59
  articles.append({
60
- "title": title_el.text if title_el is not None else None,
61
- "abstract": abstract_el.text if abstract_el is not None else None,
62
- "journal": journal_el.text if journal_el is not None else None,
63
- "year": year_el.text if year_el is not None else None,
 
 
64
  })
65
 
66
  return articles
67
 
68
 
69
- def search_and_fetch(query: str, max_results: int = 5, api_key: str = None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  """
71
- Convenience function: search and fetch article details.
72
  """
73
- pmids = search_pubmed(query, max_results=max_results, api_key=api_key)
74
- return fetch_pubmed_details(pmids, api_key=api_key)
 
1
  # genesis/api_clients/pubmed_api.py
2
  import requests
3
+ import xml.etree.ElementTree as ET
4
+ from datetime import datetime
5
 
6
+ NCBI_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
7
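+ EMAIL = "[email protected]" # NOTE(review): address appears redacted by email obfuscation; set a real contact address, which NCBI asks E-utilities clients to send with requests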
8
 
9
+
10
def search_pubmed(query: str, max_results: int = 10):
    """
    Search PubMed for a given query and return PubMed IDs (PMIDs).

    Args:
        query: Search expression sent to ESearch as the ``term`` parameter.
        max_results: Maximum number of IDs to request (``retmax``).

    Returns:
        The list found at ``esearchresult.idlist`` in the JSON response,
        or an empty list when that key is absent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        # requests percent-encodes parameter values itself, so the
        # pre-encoded form "pub+date" would arrive with a literal plus sign.
        "sort": "pub_date",
        "email": EMAIL,
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(
        f"{NCBI_BASE_URL}/esearch.fcgi", params=params, timeout=30
    )
    response.raise_for_status()
    data = response.json()
    return data.get("esearchresult", {}).get("idlist", [])
26
 
27
 
28
def fetch_pubmed_details(pmids: list):
    """
    Retrieve details (title, abstract, authors, etc.) for a list of PMIDs.

    Args:
        pmids: PubMed IDs to look up; items are converted with ``str`` so
            both string and integer IDs are accepted.

    Returns:
        A list with one dict per ``PubmedArticle`` element in the response,
        with keys ``pmid``, ``title``, ``abstract``, ``authors``,
        ``pub_date`` and ``url``. An empty list is returned without any
        request when ``pmids`` is empty.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    if not pmids:
        return []

    params = {
        "db": "pubmed",
        "id": ",".join(str(pmid) for pmid in pmids),
        "retmode": "xml",
        "email": EMAIL,
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(
        f"{NCBI_BASE_URL}/efetch.fcgi", params=params, timeout=30
    )
    response.raise_for_status()

    # Parse the raw bytes so the XML declaration's encoding is honoured
    # instead of whatever charset requests guessed for ``response.text``.
    root = ET.fromstring(response.content)
    articles = []

    for article in root.findall(".//PubmedArticle"):
        # itertext() keeps text that follows inline markup such as <i> or
        # <sup>; ``.text`` alone stops at the first child element.
        title_elem = article.find(".//ArticleTitle")
        title = "".join(title_elem.itertext()) if title_elem is not None else ""
        if not title:
            title = "No title available"

        sections = [
            "".join(section.itertext())
            for section in article.findall(".//AbstractText")
        ]
        abstract = " ".join(section for section in sections if section)

        pub_date = extract_pub_date(article)
        authors = extract_authors(article)
        pmid = article.findtext(".//PMID", default="")

        articles.append({
            "pmid": pmid,
            "title": title,
            "abstract": abstract,
            "authors": authors,
            "pub_date": pub_date,
            "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
        })

    return articles
66
 
67
 
68
def extract_pub_date(article_element):
    """
    Extract publication date from a PubMed XML element.

    Args:
        article_element: An ElementTree element searched for its first
            ``PubDate`` descendant.

    Returns:
        ``"YYYY-MM-DD"`` when year, month and day form a valid date (a
        missing month or day defaults to 1, and the month may be numeric or
        an English name/abbreviation such as ``"Mar"``); the year text when
        the parts do not form a valid date; the ``MedlineDate`` text when
        there is no ``Year``; ``"Unknown"`` when nothing usable is present.
    """
    date_elem = article_element.find(".//PubDate")
    if date_elem is None:
        return "Unknown"

    year = (date_elem.findtext("Year") or "").strip()
    if not year:
        # Without a Year element, fall back to the free-text MedlineDate.
        medline_date = (date_elem.findtext("MedlineDate") or "").strip()
        return medline_date or "Unknown"

    month_abbreviations = (
        "jan", "feb", "mar", "apr", "may", "jun",
        "jul", "aug", "sep", "oct", "nov", "dec",
    )
    month_text = (date_elem.findtext("Month") or "01").strip()
    day_text = (date_elem.findtext("Day") or "01").strip()

    try:
        # Months may be spelled out ("Mar") rather than numeric, which a
        # plain int() conversion would reject.
        month_key = month_text[:3].lower()
        if month_key in month_abbreviations:
            month = month_abbreviations.index(month_key) + 1
        else:
            month = int(month_text)
        return datetime(int(year), month, int(day_text)).strftime("%Y-%m-%d")
    except ValueError:
        # Non-numeric or out-of-range parts: report the year text alone.
        return year
82
+
83
+
84
def extract_authors(article_element):
    """
    Extract authors from a PubMed XML element.

    Args:
        article_element: An ElementTree element searched for ``Author``
            descendants.

    Returns:
        A list of display names in document order: ``"ForeName LastName"``
        when both parts exist, the last name alone when there is no fore
        name, or the ``CollectiveName`` for group authors. Authors with
        none of these are skipped.
    """
    authors = []
    for author in article_element.findall(".//Author"):
        last_name = (author.findtext("LastName") or "").strip()
        fore_name = (author.findtext("ForeName") or "").strip()
        if last_name:
            # Keep single-name authors instead of dropping them.
            authors.append(f"{fore_name} {last_name}" if fore_name else last_name)
            continue
        # Group/consortium authors carry a CollectiveName instead.
        collective_name = (author.findtext("CollectiveName") or "").strip()
        if collective_name:
            authors.append(collective_name)
    return authors
95
+
96
+
97
def search_and_fetch(query: str, max_results: int = 5):
    """
    Run a PubMed search and return the details of the matching articles.

    The query and result cap are handed to ``search_pubmed``; the IDs it
    returns are passed straight to ``fetch_pubmed_details``, whose result is
    returned unchanged.
    """
    return fetch_pubmed_details(search_pubmed(query, max_results))