mgbam committed
Commit a0d8a91 · verified · 1 Parent(s): e998c1c

Update api_clients/pubmed_client.py

Files changed (1)
  1. api_clients/pubmed_client.py +105 -0
api_clients/pubmed_client.py CHANGED
@@ -0,0 +1,105 @@
+ # api_clients/pubmed_client.py
+ """
+ Client for the PubMed API via NCBI's Entrez E-utilities.
+ This module performs a two-step search: it first finds relevant article
+ IDs (PMIDs), then fetches their structured summaries. It prioritizes
+ review articles, which provide high-quality, synthesized information to
+ the main orchestrator.
+ """
+ import aiohttp
+ from .config import PUBMED_BASE_URL, REQUEST_HEADERS
+
+ async def search_pubmed(session: aiohttp.ClientSession, query: str, max_results: int = 5) -> list[dict]:
+     """
+     Searches PubMed and returns a list of article summaries.
+
+     This function implements a two-step search strategy:
+     1. It searches for article IDs (PMIDs) matching the query.
+     2. It filters for "review" articles, which are well suited to
+        summarization, falling back to all article types if no reviews
+        are found.
+     3. It then fetches concise summaries for the found PMIDs.
+
+     Args:
+         session (aiohttp.ClientSession): The active HTTP session.
+         query (str): The search term, typically a combination of concepts
+             (e.g., "Migraine AND Aura").
+         max_results (int): The maximum number of article summaries to return.
+
+     Returns:
+         list[dict]: A list of dictionaries, each containing summary data
+             for an article. Returns an empty list if no results are found
+             or an error occurs.
+     """
+     if not query:
+         return []
+
+     # --- Step 1: ESearch - Find relevant article PMIDs ---
+     # Construct a focused query to get the most relevant results:
+     # - `AND review[Publication Type]`: narrows results to high-value review articles.
+     # - `sort=relevance`: ensures the best matches appear first.
+     search_term = f"({query}) AND review[Publication Type]"
+
+     esearch_params = {
+         'db': 'pubmed',
+         'term': search_term,
+         'retmode': 'json',
+         'retmax': max_results,
+         'sort': 'relevance'
+     }
+     esearch_url = f"{PUBMED_BASE_URL}/esearch.fcgi"
+
+     pmids = []
+     try:
+         async with session.get(esearch_url, params=esearch_params, headers=REQUEST_HEADERS, timeout=10) as resp:
+             resp.raise_for_status()
+             data = await resp.json()
+             pmids = data.get('esearchresult', {}).get('idlist', [])
+
+         if not pmids:
+             # If no review articles are found, try a broader search as a fallback.
+             print(f"No review articles found for '{query}'. Broadening search...")
+             esearch_params['term'] = query  # Remove the review filter
+             async with session.get(esearch_url, params=esearch_params, headers=REQUEST_HEADERS, timeout=10) as fallback_resp:
+                 fallback_resp.raise_for_status()
+                 fallback_data = await fallback_resp.json()
+                 pmids = fallback_data.get('esearchresult', {}).get('idlist', [])
+
+         if not pmids:
+             print(f"No PubMed results found for query: {query}")
+             return []
+
+         # --- Step 2: ESummary - Fetch summaries for the found PMIDs ---
+         esummary_params = {
+             'db': 'pubmed',
+             'id': ",".join(pmids),  # E-utilities accept a comma-separated list of IDs
+             'retmode': 'json'
+         }
+         esummary_url = f"{PUBMED_BASE_URL}/esummary.fcgi"
+
+         async with session.get(esummary_url, params=esummary_params, headers=REQUEST_HEADERS, timeout=15) as resp:
+             resp.raise_for_status()
+             summary_data = await resp.json()
+
+         # The response is a dict with a 'result' key, which contains another
+         # dict keyed by PMID. Parse this into a clean list.
+         results = summary_data.get('result', {})
+
+         # Iterate over the original PMID list to preserve relevance order
+         # and skip any IDs missing from the summary payload.
+         parsed_articles = []
+         for pmid in pmids:
+             if pmid in results:
+                 article = results[pmid]
+                 parsed_articles.append({
+                     'uid': article.get('uid', pmid),
+                     'title': article.get('title', 'Title Not Available'),
+                     'pubdate': article.get('pubdate', 'N/A'),
+                     'authors': [author.get('name', '') for author in article.get('authors', [])],
+                     'journal': article.get('source', 'N/A'),
+                     'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
+                 })
+         return parsed_articles
+
+     except aiohttp.ClientError as e:
+         print(f"An error occurred while fetching from PubMed: {e}")
+         return []
+     except Exception as e:
+         print(f"A general error occurred in the pubmed_client: {e}")
+         return []
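
For reference, a minimal usage sketch (hypothetical; not part of this commit) showing how an orchestrator might drive this client. It assumes `api_clients/config.py` defines the `PUBMED_BASE_URL` and `REQUEST_HEADERS` values imported above.

# usage_sketch.py (hypothetical driver, not part of this commit)
import asyncio

import aiohttp

from api_clients.pubmed_client import search_pubmed


async def main() -> None:
    # A single shared session is reused for both E-utilities calls.
    async with aiohttp.ClientSession() as session:
        articles = await search_pubmed(session, "Migraine AND Aura", max_results=3)
        for article in articles:
            print(f"{article['pubdate']} - {article['title']}")
            print(f"  {article['url']}")


if __name__ == "__main__":
    asyncio.run(main())

Note that NCBI's E-utilities are rate limited (roughly three requests per second without an API key), so callers issuing many queries in a loop may want to add throttling.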