mgbam committed on
Commit
587c291
·
verified ·
1 Parent(s): 81d9b6a

Update genesis/api_clients/pubmed_api.py

Browse files
Files changed (1) hide show
  1. genesis/api_clients/pubmed_api.py +72 -76
genesis/api_clients/pubmed_api.py CHANGED
@@ -1,92 +1,88 @@
1
  # genesis/api_clients/pubmed_api.py
 
2
  import os
3
  import requests
4
- import xml.etree.ElementTree as ET
5
- from typing import List, Dict
6
 
7
- NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
8
- PUBMED_API_KEY = os.getenv("PUBMED_API_KEY") # Optional, set in Hugging Face / .env
9
 
10
- # -------------------------
11
- # Core Helpers
12
- # -------------------------
13
def pubmed_search(query: str, max_results: int = 10) -> List[str]:
    """
    Search PubMed for a query and return a list of PubMed IDs.

    Args:
        query: Search term, sent to ESearch as the ``term`` parameter.
        max_results: Maximum number of IDs to request (sent as ``retmax``).

    Returns:
        The text of every ``<Id>`` element in the ESearch response, in
        document order. An empty list when ``query`` is empty or blank.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status (via ``raise_for_status``).
        xml.etree.ElementTree.ParseError: If the response is not
            well-formed XML.
    """
    # A blank term cannot select any article, so skip the network round trip.
    if not query or not query.strip():
        return []

    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
    }
    # The key is optional; only send it when one is actually configured.
    if PUBMED_API_KEY:
        params["api_key"] = PUBMED_API_KEY

    # Without a timeout, requests waits forever on an unresponsive server.
    r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=10)
    r.raise_for_status()

    # Parse the raw bytes so the XML parser honours the document's own
    # encoding declaration instead of requests' charset guess.
    root = ET.fromstring(r.content)
    return [id_elem.text for id_elem in root.findall(".//Id") if id_elem.text]
27
 
28
def fetch_details(pubmed_ids: List[str]) -> List[Dict]:
    """
    Fetch detailed metadata for PubMed articles.

    Args:
        pubmed_ids: PubMed IDs to look up. Each item is converted with
            ``str`` before being joined into the ``id`` parameter.

    Returns:
        One dict per ``<PubmedArticle>`` element in the EFetch response,
        with keys:

        - ``"pmid"``: text of the first ``PMID`` element, or ``""``.
        - ``"title"``: full text of ``ArticleTitle``, or ``""``.
        - ``"abstract"``: text of all ``Abstract/AbstractText`` sections
          joined with single spaces, or ``""``.
        - ``"authors"``: list of ``"LastName ForeName"`` strings.
        - ``"url"``: ``https://pubmed.ncbi.nlm.nih.gov/<pmid>/``, or ``""``
          when no PMID was found.

        An empty list when ``pubmed_ids`` is empty.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status (via ``raise_for_status``).
        xml.etree.ElementTree.ParseError: If the response is not
            well-formed XML.
    """
    if not pubmed_ids:
        return []

    def _text(elem) -> str:
        # ``elem.text`` stops at the first child tag (e.g. <i>, <sub>) and is
        # None when the element starts with one; itertext() collects it all.
        if elem is None:
            return ""
        return "".join(elem.itertext()).strip()

    params = {
        "db": "pubmed",
        "id": ",".join(str(pmid) for pmid in pubmed_ids),
        "retmode": "xml",
    }
    # The key is optional; only send it when one is actually configured.
    if PUBMED_API_KEY:
        params["api_key"] = PUBMED_API_KEY

    # Without a timeout, requests waits forever on an unresponsive server.
    r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=10)
    r.raise_for_status()

    # Parse the raw bytes so the XML parser honours the document's own
    # encoding declaration instead of requests' charset guess.
    root = ET.fromstring(r.content)

    articles = []
    for article in root.findall(".//PubmedArticle"):
        pmid = _text(article.find(".//PMID"))

        authors = []
        for a in article.findall(".//Author"):
            last = _text(a.find("LastName"))
            fore = _text(a.find("ForeName"))
            # Skip authors lacking either name part rather than emit "None".
            if last and fore:
                authors.append(f"{last} {fore}")

        # An abstract may be split into several AbstractText sections;
        # keep all of them, not just the first.
        sections = (_text(s) for s in article.findall(".//Abstract/AbstractText"))
        abstract = " ".join(s for s in sections if s)

        articles.append({
            "pmid": pmid,
            "title": _text(article.find(".//ArticleTitle")),
            "abstract": abstract,
            "authors": authors,
            "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else "",
        })
    return articles
64
-
65
- # -------------------------
66
- # High-Level Search + Fetch
67
- # -------------------------
68
def search_pubmed(query: str, max_results: int = 10) -> List[Dict]:
    """
    Run a PubMed search and return structured records for the hits.

    Looks up matching IDs with ``pubmed_search(query, max_results)`` and
    passes them straight to ``fetch_details``; the return value is whatever
    ``fetch_details`` produces for those IDs.
    """
    return fetch_details(pubmed_search(query, max_results))
74
 
75
- # -------------------------
76
- # Cross-Domain Integration
77
- # -------------------------
78
def literature_to_entities(query: str) -> Dict:
    """
    Collect PubMed articles alongside ChEMBL, NCBI and BioPortal lookups.

    The same ``query`` string is passed unchanged to each client. The
    result dict has the keys ``"query"``, ``"articles"``,
    ``"related_drugs"``, ``"related_genes"`` and ``"ontology_mappings"``.
    The shape of the last three values is whatever the respective client
    function returns (not visible from this module).
    """
    # Imported lazily, inside the function, to avoid import cycles.
    from genesis.api_clients import chembl_api, ncbi_api, bioportal_api

    # Lookups run in the same order as the keys appear in the result.
    articles = search_pubmed(query)
    related_drugs = chembl_api.search_molecule(query)
    related_genes = ncbi_api.search_gene(query)
    ontology_mappings = bioportal_api.search_and_map(query)

    return {
        "query": query,
        "articles": articles,
        "related_drugs": related_drugs,
        "related_genes": related_genes,
        "ontology_mappings": ontology_mappings,
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # genesis/api_clients/pubmed_api.py
2
+
3
  import os
4
  import requests
5
+ import html
6
+ from xml.etree import ElementTree as ET
7
 
8
+ PUBMED_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
 
9
 
10
def _element_text(elem) -> str:
    """Return all text inside *elem*, nested markup included; "" if absent."""
    if elem is None:
        return ""
    # ``elem.text`` stops at the first child tag (e.g. <i>, <sub>) and is None
    # when the element starts with one; itertext() collects the whole content.
    return "".join(elem.itertext()).strip()


def search_pubmed_literature(query: str, max_results: int = 10):
    """
    Search PubMed literature using NCBI E-utilities API.

    Performs an ESearch request to obtain matching IDs, then an EFetch
    request to retrieve details for those IDs. The optional
    ``PUBMED_API_KEY`` environment variable is sent with both requests
    when it is set.

    Args:
        query (str): Search query (e.g. "synthetic biology cancer therapy").
        max_results (int): Maximum number of results to return.

    Returns:
        list[dict]: One dict per article with keys:

        - ``'title'``: article title, or ``"No title"`` when none is found.
        - ``'authors'``: ``"ForeName LastName"`` entries joined by ``", "``,
          or ``"N/A"`` when no complete author name is found.
        - ``'link'``: ``https://pubmed.ncbi.nlm.nih.gov/<pmid>/``, or ``""``
          when no PMID is found.

        An empty list is returned for a missing or blank query, a
        non-positive ``max_results``, a search with no hits, or when a
        request or XML-parse error occurs (the error is printed, not raised).
    """
    # Nothing can be returned for these inputs, so skip the network entirely.
    if not query or not query.strip() or max_results <= 0:
        return []

    api_key = os.getenv("PUBMED_API_KEY")  # optional
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "xml",
    }
    if api_key:
        params["api_key"] = api_key

    try:
        # Step 1: Search for IDs
        search_url = f"{PUBMED_BASE}/esearch.fcgi"
        search_res = requests.get(search_url, params=params, timeout=10)
        search_res.raise_for_status()
        # Parse the raw bytes so the XML parser honours the document's own
        # encoding declaration instead of requests' charset guess.
        root = ET.fromstring(search_res.content)
        ids = [id_elem.text for id_elem in root.findall(".//Id") if id_elem.text]

        if not ids:
            return []

        # Step 2: Fetch article details
        fetch_url = f"{PUBMED_BASE}/efetch.fcgi"
        fetch_params = {
            "db": "pubmed",
            "id": ",".join(ids),
            "retmode": "xml",
        }
        if api_key:
            fetch_params["api_key"] = api_key

        fetch_res = requests.get(fetch_url, params=fetch_params, timeout=10)
        fetch_res.raise_for_status()
        fetch_root = ET.fromstring(fetch_res.content)

        results = []
        for article in fetch_root.findall(".//PubmedArticle"):
            # Fall back to the placeholder for a missing *or* empty title.
            title = html.unescape(_element_text(article.find(".//ArticleTitle")))
            if not title:
                title = "No title"

            authors = []
            for author in article.findall(".//Author"):
                last = _element_text(author.find("LastName"))
                fore = _element_text(author.find("ForeName"))
                # Skip authors lacking either name part rather than emit "None".
                if last and fore:
                    authors.append(f"{fore} {last}")

            pmid = _element_text(article.find(".//PMID"))
            link = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else ""

            results.append({
                "title": title,
                "authors": ", ".join(authors) if authors else "N/A",
                "link": link,
            })

        return results

    except requests.exceptions.RequestException as e:
        print(f"[PubMed API Error] {e}")
        return []
    except ET.ParseError as e:
        print(f"[PubMed Parse Error] {e}")
        return []