mgbam committed on
Commit
a5bfe49
·
verified ·
1 Parent(s): d68c20d

Update genesis/api_clients/pubmed_api.py

Browse files
Files changed (1) hide show
  1. genesis/api_clients/pubmed_api.py +53 -57
genesis/api_clients/pubmed_api.py CHANGED
@@ -1,29 +1,28 @@
1
  # genesis/api_clients/pubmed_api.py
2
  import requests
3
- import xml.etree.ElementTree as ET
4
- from typing import List, Dict, Optional
5
- from datetime import datetime
6
 
 
7
  PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
8
  PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
9
- NCBI_API_KEY = None # Optional: Set in Hugging Face secrets for higher rate limits
10
 
11
- def search_pubmed(query: str, max_results: int = 20, start_date: Optional[str] = None, end_date: Optional[str] = None) -> List[str]:
 
 
 
12
  """
13
- Search PubMed for a given query and return a list of PubMed IDs.
14
- Optionally filter by start_date and end_date (YYYY/MM/DD format).
15
  """
16
  params = {
17
  "db": "pubmed",
18
  "term": query,
19
- "retmax": max_results,
20
  "retmode": "json",
21
- "api_key": NCBI_API_KEY
22
  }
23
- if start_date and end_date:
24
- params["mindate"] = start_date
25
- params["maxdate"] = end_date
26
- params["datetype"] = "pdat"
27
 
28
  r = requests.get(PUBMED_SEARCH_URL, params=params)
29
  r.raise_for_status()
@@ -31,64 +30,61 @@ def search_pubmed(query: str, max_results: int = 20, start_date: Optional[str] =
31
  return data.get("esearchresult", {}).get("idlist", [])
32
 
33
 
34
- def fetch_pubmed_details(pmid_list: List[str]) -> List[Dict]:
35
  """
36
- Fetch detailed metadata for a list of PubMed IDs.
37
- Returns title, abstract, authors, journal, and publication date.
38
  """
39
- if not pmid_list:
40
  return []
41
 
42
  params = {
43
  "db": "pubmed",
44
- "id": ",".join(pmid_list),
45
- "retmode": "xml",
46
- "api_key": NCBI_API_KEY
47
  }
 
 
 
48
  r = requests.get(PUBMED_FETCH_URL, params=params)
49
  r.raise_for_status()
 
50
 
51
- root = ET.fromstring(r.text)
52
- results = []
53
 
 
 
 
 
 
 
 
 
54
  for article in root.findall(".//PubmedArticle"):
55
- try:
56
- title = article.find(".//ArticleTitle").text or "No title"
57
- abstract = " ".join([t.text for t in article.findall(".//AbstractText") if t.text]) or "No abstract"
58
- authors = []
59
- for a in article.findall(".//Author"):
60
- last = a.findtext("LastName", "")
61
- first = a.findtext("ForeName", "")
62
- if last or first:
63
- authors.append(f"{first} {last}".strip())
64
-
65
- journal = article.findtext(".//Journal/Title", "Unknown Journal")
66
- pub_date = article.find(".//PubDate")
67
- if pub_date is not None:
68
- year = pub_date.findtext("Year", "")
69
- month = pub_date.findtext("Month", "")
70
- day = pub_date.findtext("Day", "")
71
- date_str = f"{year}-{month}-{day}" if year else "Unknown"
72
- else:
73
- date_str = "Unknown"
74
-
75
- results.append({
76
- "title": title,
77
- "abstract": abstract,
78
- "authors": authors,
79
- "journal": journal,
80
- "publication_date": date_str,
81
- "pubmed_link": f"https://pubmed.ncbi.nlm.nih.gov/{article.findtext('.//PMID')}/"
82
- })
83
- except Exception:
84
- continue
85
-
86
- return results
87
 
88
 
89
- def search_and_fetch_pubmed(query: str, max_results: int = 20, start_date: Optional[str] = None, end_date: Optional[str] = None) -> List[Dict]:
90
  """
91
- Search and fetch PubMed results in one call.
92
  """
93
- pmids = search_pubmed(query, max_results, start_date, end_date)
94
  return fetch_pubmed_details(pmids)
 
1
  # genesis/api_clients/pubmed_api.py
2
  import requests
3
+ import os
4
+ from typing import List, Dict
 
5
 
6
+ # NCBI E-utilities base
7
  PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
8
  PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
9
+ PUBMED_SUMMARY_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
10
 
11
+ # Optional: NCBI API key for higher rate limits
12
+ NCBI_API_KEY = os.getenv("NCBI_API_KEY")
13
+
14
+ def search_pubmed(query: str, max_results: int = 10) -> List[str]:
15
  """
16
+ Search PubMed for a query and return a list of PubMed IDs (PMIDs).
 
17
  """
18
  params = {
19
  "db": "pubmed",
20
  "term": query,
 
21
  "retmode": "json",
22
+ "retmax": max_results
23
  }
24
+ if NCBI_API_KEY:
25
+ params["api_key"] = NCBI_API_KEY
 
 
26
 
27
  r = requests.get(PUBMED_SEARCH_URL, params=params)
28
  r.raise_for_status()
 
30
  return data.get("esearchresult", {}).get("idlist", [])
31
 
32
 
33
def fetch_pubmed_details(pmids: List[str], timeout: float = 30.0) -> List[Dict]:
    """
    Fetch details (title, authors, journal, abstract) for a list of PMIDs.

    Args:
        pmids: PubMed IDs, e.g. the list returned by ``search_pubmed``.
            Blank or ``None`` entries are ignored.
        timeout: Seconds to wait for the NCBI efetch endpoint before the
            request is aborted.

    Returns:
        The list produced by ``parse_pubmed_xml`` for the efetch response.
        An empty list is returned, without any network call, when no usable
        PMID is supplied.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer within ``timeout``.
    """
    # Drop empty entries so we never send a request such as ``id=","``.
    ids = [str(pmid).strip() for pmid in (pmids or []) if pmid is not None]
    ids = [pmid for pmid in ids if pmid]
    if not ids:
        return []

    params = {
        "db": "pubmed",
        "id": ",".join(ids),
        "retmode": "xml",
    }
    # The API key is optional; only send it when one is configured.
    if NCBI_API_KEY:
        params["api_key"] = NCBI_API_KEY

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection.
    r = requests.get(PUBMED_FETCH_URL, params=params, timeout=timeout)
    r.raise_for_status()
    return parse_pubmed_xml(r.text)
51
 
 
 
52
 
53
def parse_pubmed_xml(xml_text: str) -> List[Dict]:
    """
    Parse PubMed XML into structured data.

    Args:
        xml_text: XML document containing ``PubmedArticle`` elements.

    Returns:
        One dict per ``PubmedArticle`` with the keys ``pmid``, ``title``,
        ``abstract``, ``journal`` (strings, empty when the element is
        missing) and ``authors`` (list of "ForeName LastName" strings).
        An empty or whitespace-only ``xml_text`` yields an empty list.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
    """
    import xml.etree.ElementTree as ET

    def _full_text(element) -> str:
        # ``element.text`` stops at the first child tag, so inline markup
        # such as <i> or <sub> would truncate the value; ``itertext`` walks
        # the text of the element and all of its descendants.
        if element is None:
            return ""
        return "".join(element.itertext()).strip()

    if not xml_text or not xml_text.strip():
        return []

    root = ET.fromstring(xml_text)
    articles = []

    for article in root.findall(".//PubmedArticle"):
        title = _full_text(article.find(".//ArticleTitle"))

        # Skip empty abstract sections so the joined text has no stray spaces.
        sections = (_full_text(node) for node in article.findall(".//AbstractText"))
        abstract_text = " ".join(section for section in sections if section)

        journal = article.findtext(".//Journal/Title", default="")

        authors = []
        for author in article.findall(".//Author"):
            last = author.findtext("LastName", "")
            fore = author.findtext("ForeName", "")
            if last or fore:
                # strip() removes the dangling space when one part is missing.
                authors.append(f"{fore} {last}".strip())

        pmid = article.findtext(".//PMID", default="")

        articles.append({
            "pmid": pmid,
            "title": title,
            "abstract": abstract_text,
            "journal": journal,
            "authors": authors,
        })

    return articles
 
 
 
 
 
 
 
 
 
 
 
83
 
84
 
85
def pubmed_summary(query: str, max_results: int = 10) -> List[Dict]:
    """
    Run a PubMed search and return the details of the matching articles.

    Convenience wrapper: ``query`` and ``max_results`` are passed to
    ``search_pubmed`` and the resulting PMIDs are handed straight to
    ``fetch_pubmed_details``, whose result is returned unchanged.
    """
    return fetch_pubmed_details(search_pubmed(query, max_results))