# api_clients/pubmed_client.py
"""
Client for the PubMed API via NCBI's Entrez E-utilities.

This module performs a two-step search: it first finds relevant article IDs
(PMIDs) via ESearch, then fetches their structured summaries via ESummary.
Review articles are prioritized because they provide synthesized, high-level
coverage of a topic for the main orchestrator.
"""
import asyncio

import aiohttp
from .config import PUBMED_BASE_URL, REQUEST_HEADERS
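
# NOTE: `.config` is not shown in this file. It is assumed to provide values
# along these lines (the header contents are illustrative; NCBI asks that
# clients identify themselves, e.g. via a User-Agent or `tool`/`email` params):
#
#   PUBMED_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
#   REQUEST_HEADERS = {"User-Agent": "my-app/1.0 (contact: you@example.com)"}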

async def search_pubmed(session: aiohttp.ClientSession, query: str, max_results: int = 5) -> list[dict]:
    """
    Searches PubMed and returns a list of article summaries.

    This function's search strategy:
    1.  It searches for article IDs (PMIDs) matching the query.
    2.  It filters for "review" articles, which are well suited to summarization,
        and falls back to an unfiltered search if none are found.
    3.  It then fetches concise summaries for the found PMIDs.

    Args:
        session (aiohttp.ClientSession): The active HTTP session.
        query (str): The search term, typically a combination of concepts (e.g., "Migraine AND Aura").
        max_results (int): The maximum number of article summaries to return.

    Returns:
        list[dict]: A list of dictionaries, each containing summary data for an article.
                    Returns an empty list if no results are found or an error occurs.
    """
    if not query:
        return []

    # --- Step 1: ESearch - Find relevant article PMIDs ---
    # The query is scoped to favor the most relevant results:
    # - `AND review[Publication Type]`: narrows results to review articles,
    #   which tend to be high-value for summarization.
    # - `sort=relevance` (set below): surfaces the best matches first.
    search_term = f"({query}) AND review[Publication Type]"
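    # For example, query="Migraine AND Aura" yields:
    #   (Migraine AND Aura) AND review[Publication Type]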
    
    esearch_params = {
        'db': 'pubmed',
        'term': search_term,
        'retmode': 'json',
        'retmax': max_results,
        'sort': 'relevance'
    }
    esearch_url = f"{PUBMED_BASE_URL}/esearch.fcgi"
    
    pmids = []
    try:
        async with session.get(esearch_url, params=esearch_params, headers=REQUEST_HEADERS, timeout=10) as resp:
            resp.raise_for_status()
            data = await resp.json()
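            # Abridged ESearch payload: {"esearchresult": {"idlist": ["12345", ...], "count": "..."}}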
            pmids = data.get('esearchresult', {}).get('idlist', [])
            
            if not pmids:
                # If no review articles are found, try a broader search as a fallback
                print(f"No review articles found for '{query}'. Broadening search...")
                esearch_params['term'] = query # Remove the review filter
                async with session.get(esearch_url, params=esearch_params, headers=REQUEST_HEADERS, timeout=10) as fallback_resp:
                    fallback_resp.raise_for_status()
                    fallback_data = await fallback_resp.json()
                    pmids = fallback_data.get('esearchresult', {}).get('idlist', [])
            
            if not pmids:
                print(f"No PubMed results found for query: {query}")
                return []

        # --- Step 2: ESummary - Fetch summaries for the found PMIDs ---
        esummary_params = {
            'db': 'pubmed',
            'id': ",".join(pmids), # E-utilities can take a comma-separated list of IDs
            'retmode': 'json'
        }
        esummary_url = f"{PUBMED_BASE_URL}/esummary.fcgi"
        
        async with session.get(esummary_url, params=esummary_params, headers=REQUEST_HEADERS, timeout=15) as resp:
            resp.raise_for_status()
            summary_data = await resp.json()
            
            # The result is a dict with a 'result' key, which contains another dict
            # where keys are the PMIDs. We'll parse this into a clean list.
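            # Abridged shape of the payload:
            #   {"result": {"uids": ["12345", ...],
            #               "12345": {"title": ..., "pubdate": ..., "authors": [...], "source": ...}}}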
            results = summary_data.get('result', {})
            
            # Parse in the original PMID (relevance) order, tolerating missing entries or fields.
            parsed_articles = []
            for pmid in pmids:
                if pmid in results:
                    article = results[pmid]
                    parsed_articles.append({
                        'uid': article.get('uid', pmid),
                        'title': article.get('title', 'Title Not Available'),
                        'pubdate': article.get('pubdate', 'N/A'),
                        'authors': [author.get('name', '') for author in article.get('authors', [])],
                        'journal': article.get('source', 'N/A'),
                        'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
                    })
            return parsed_articles

    except asyncio.TimeoutError:
        print(f"A request to PubMed timed out for query: {query}")
        return []
    except aiohttp.ClientError as e:
        print(f"An error occurred while fetching from PubMed: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred in the pubmed_client: {e}")
        return []