mgbam committed on
Commit
9fbaf8f
·
verified ·
1 Parent(s): 5bfa97f

Update genesis/api_clients/pubmed_api.py

Browse files
Files changed (1) hide show
  1. genesis/api_clients/pubmed_api.py +62 -70
genesis/api_clients/pubmed_api.py CHANGED
@@ -1,102 +1,94 @@
1
  # genesis/api_clients/pubmed_api.py
2
  import requests
3
  import xml.etree.ElementTree as ET
 
4
  from datetime import datetime
5
 
6
- NCBI_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
7
- EMAIL = "your_email@example.com" # NCBI requires an email in requests
 
8
 
9
-
10
def search_pubmed(query: str, max_results: int = 10, timeout: float = 10.0):
    """
    Search PubMed for a given query and return PubMed IDs (PMIDs).

    Args:
        query: Entrez search term. A blank query returns [] without
            issuing a request.
        max_results: Maximum number of PMIDs requested (sent as ``retmax``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``esearchresult.idlist`` list from the JSON reply, or [] when
        that key is absent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if not query or not query.strip():
        return []

    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        # requests percent-encodes "+" as %2B, so the URL-style "pub+date"
        # would reach the server with a literal plus sign; send the
        # underscore form of the sort key instead.
        "sort": "pub_date",
        "email": EMAIL,
    }
    response = requests.get(
        f"{NCBI_BASE_URL}/esearch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()
    data = response.json()
    return data.get("esearchresult", {}).get("idlist", [])
26
 
27
 
28
def fetch_pubmed_details(pmids: list, timeout: float = 10.0):
    """
    Retrieve details (title, abstract, authors, etc.) for a list of PMIDs.

    Args:
        pmids: PubMed IDs to look up; items are converted with ``str`` so
            integer IDs are accepted too.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        One dict per ``PubmedArticle`` element with the keys ``pmid``,
        ``title``, ``abstract``, ``authors``, ``pub_date`` and ``url``.
        An empty ``pmids`` yields [] without issuing a request.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if not pmids:
        return []

    params = {
        "db": "pubmed",
        "id": ",".join(str(pmid) for pmid in pmids),
        "retmode": "xml",
        "email": EMAIL,
    }
    response = requests.get(
        f"{NCBI_BASE_URL}/efetch.fcgi", params=params, timeout=timeout
    )
    response.raise_for_status()

    # Parse the raw bytes so the XML declaration decides the encoding
    # instead of the charset guessed from the HTTP headers.
    root = ET.fromstring(response.content)
    articles = []

    for article in root.findall(".//PubmedArticle"):
        # itertext() keeps text that follows inline markup such as <i> or
        # <sub>; a plain .text stops at the first child element.
        title_elem = article.find(".//ArticleTitle")
        title = ""
        if title_elem is not None:
            title = "".join(title_elem.itertext()).strip()

        sections = (
            "".join(section.itertext()).strip()
            for section in article.findall(".//AbstractText")
        )
        abstract = " ".join(text for text in sections if text)
        pmid = article.findtext(".//PMID", default="")

        articles.append({
            "pmid": pmid,
            "title": title or "No title available",
            "abstract": abstract,
            "authors": extract_authors(article),
            "pub_date": extract_pub_date(article),
            "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
        })

    return articles
66
-
67
-
68
def extract_pub_date(article_element):
    """
    Extract publication date from a PubMed XML element.

    Looks up the first ``PubDate`` descendant and returns:

    * ``"YYYY-MM-DD"`` when Year/Month/Day form a valid date (a missing
      Month or Day defaults to 01; textual months such as "Jan" are
      accepted),
    * the raw year when the remaining parts cannot be interpreted,
    * the ``MedlineDate`` text when there is no ``Year`` child,
    * ``"Unknown"`` when no date information is present.
    """
    date_elem = article_element.find(".//PubDate")
    if date_elem is None:
        return "Unknown"

    year = (date_elem.findtext("Year") or "").strip()
    if not year:
        # Without a Year, fall back to the free-text MedlineDate child.
        medline_date = (date_elem.findtext("MedlineDate") or "").strip()
        return medline_date or "Unknown"

    month = (date_elem.findtext("Month") or "").strip() or "01"
    day = (date_elem.findtext("Day") or "").strip() or "01"

    if not month.isdigit():
        # Map English month names/abbreviations to their number without
        # relying on the process locale.
        month_names = ("jan", "feb", "mar", "apr", "may", "jun",
                       "jul", "aug", "sep", "oct", "nov", "dec")
        prefix = month[:3].lower()
        if prefix in month_names:
            month = str(month_names.index(prefix) + 1)

    try:
        return datetime(int(year), int(month), int(day)).strftime("%Y-%m-%d")
    except ValueError:
        # Non-numeric or out-of-range parts: report the year alone.
        return year
82
-
83
-
84
def extract_authors(article_element):
    """
    Extract authors from a PubMed XML element.

    Returns a list of display names, one per ``Author`` descendant, in
    document order. A personal name is rendered as "ForeName LastName";
    when only one of the two parts is present that part is used alone.
    Authors without a personal name fall back to their ``CollectiveName``
    text, and entries with no usable name are skipped.
    """
    authors = []
    for author in article_element.findall(".//Author"):
        last_name = (author.findtext("LastName") or "").strip()
        fore_name = (author.findtext("ForeName") or "").strip()
        display_name = f"{fore_name} {last_name}".strip()
        if not display_name:
            # Group authors carry a CollectiveName instead of name parts.
            display_name = (author.findtext("CollectiveName") or "").strip()
        if display_name:
            authors.append(display_name)
    return authors
95
-
96
-
97
def search_and_fetch(query: str, max_results: int = 5):
    """
    Run a PubMed search for ``query`` and return the detail records of
    the matching articles (at most ``max_results`` IDs are requested).
    """
    return fetch_pubmed_details(search_pubmed(query, max_results))
 
1
  # genesis/api_clients/pubmed_api.py
2
  import requests
3
  import xml.etree.ElementTree as ET
4
+ from typing import List, Dict, Optional
5
  from datetime import datetime
6
 
7
+ PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
8
+ PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
9
+ NCBI_API_KEY = None # Optional: Set in Hugging Face secrets for higher rate limits
10
 
11
def search_pubmed(
    query: str,
    max_results: int = 20,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    timeout: float = 10.0,
) -> List[str]:
    """
    Search PubMed for a given query and return a list of PubMed IDs.

    Args:
        query: Entrez search term. A blank query returns [] without
            issuing a request.
        max_results: Maximum number of IDs requested (sent as ``retmax``).
        start_date: Optional lower publication-date bound (YYYY/MM/DD).
        end_date: Optional upper publication-date bound (YYYY/MM/DD).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``esearchresult.idlist`` list from the JSON reply, or [] when
        that key is absent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if not query or not query.strip():
        return []

    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY

    if start_date or end_date:
        # mindate and maxdate are sent as a pair; when the caller gives
        # only one bound, pad the other with a wide-open value so the
        # filter is applied instead of being silently dropped.
        # NOTE(review): the padding values are assumed to be accepted by
        # ESearch as an open-ended range - confirm against the API docs.
        params["mindate"] = start_date or "1000/01/01"
        params["maxdate"] = end_date or "3000/12/31"
        params["datetype"] = "pdat"

    r = requests.get(PUBMED_SEARCH_URL, params=params, timeout=timeout)
    r.raise_for_status()
    data = r.json()
    return data.get("esearchresult", {}).get("idlist", [])
32
 
33
 
34
def fetch_pubmed_details(pmid_list: List[str], timeout: float = 10.0) -> List[Dict]:
    """
    Fetch detailed metadata for a list of PubMed IDs.

    Args:
        pmid_list: PubMed IDs to look up; items are converted with ``str``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        One dict per ``PubmedArticle`` element with the keys ``title``,
        ``abstract``, ``authors``, ``journal``, ``publication_date`` and
        ``pubmed_link``. An empty ``pmid_list`` yields [] without issuing
        a request.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if not pmid_list:
        return []

    params = {
        "db": "pubmed",
        "id": ",".join(str(pmid) for pmid in pmid_list),
        "retmode": "xml",
    }
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY

    r = requests.get(PUBMED_FETCH_URL, params=params, timeout=timeout)
    r.raise_for_status()

    # Parse the raw bytes so the XML declaration decides the encoding
    # instead of the charset guessed from the HTTP headers.
    root = ET.fromstring(r.content)
    return [
        _parse_pubmed_article(article)
        for article in root.findall(".//PubmedArticle")
    ]


def _element_text(element) -> str:
    """Return all text inside ``element``, including text after inline markup."""
    if element is None:
        return ""
    return "".join(element.itertext()).strip()


def _format_pub_date(pub_date) -> str:
    """Render a ``PubDate`` element as "Year[-Month[-Day]]" or "Unknown"."""
    if pub_date is None:
        return "Unknown"
    year = (pub_date.findtext("Year") or "").strip()
    if not year:
        # Without a Year, fall back to the free-text MedlineDate child.
        return (pub_date.findtext("MedlineDate") or "").strip() or "Unknown"
    parts = [year]
    month = (pub_date.findtext("Month") or "").strip()
    if month:
        parts.append(month)
        day = (pub_date.findtext("Day") or "").strip()
        if day:
            parts.append(day)
    # Joining only the parts that exist avoids strings such as "2023--".
    return "-".join(parts)


def _parse_pubmed_article(article) -> Dict:
    """Build the result dict for a single ``PubmedArticle`` element."""
    # Safe lookups replace the former blanket try/except, which dropped an
    # entire article whenever one optional element was missing.
    title = _element_text(article.find(".//ArticleTitle")) or "No title"

    sections = (_element_text(t) for t in article.findall(".//AbstractText"))
    abstract = " ".join(s for s in sections if s) or "No abstract"

    authors = []
    for a in article.findall(".//Author"):
        last = a.findtext("LastName", "") or ""
        first = a.findtext("ForeName", "") or ""
        name = f"{first} {last}".strip()
        if name:
            authors.append(name)

    pmid = (article.findtext(".//PMID") or "").strip()
    return {
        "title": title,
        "abstract": abstract,
        "authors": authors,
        "journal": article.findtext(".//Journal/Title", "Unknown Journal"),
        "publication_date": _format_pub_date(article.find(".//PubDate")),
        # No PMID means no meaningful link; avoid emitting ".../None/".
        "pubmed_link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else "",
    }
87
+
88
+
89
def search_and_fetch_pubmed(
    query: str,
    max_results: int = 20,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
) -> List[Dict]:
    """
    Look up PubMed IDs for ``query`` (optionally bounded by ``start_date``
    and ``end_date``) and return the detail records for those IDs.
    """
    return fetch_pubmed_details(
        search_pubmed(query, max_results, start_date, end_date)
    )