File size: 3,623 Bytes
9bee6a4
117cd77
9bee6a4
7227edd
9bee6a4
117cd77
7227edd
9bee6a4
7227edd
ef76eaa
7227edd
ef76eaa
7227edd
 
 
117cd77
ef76eaa
117cd77
ef76eaa
7227edd
 
ef76eaa
7227edd
 
 
 
 
 
 
 
 
 
117cd77
ef76eaa
7227edd
117cd77
7227edd
117cd77
7227edd
117cd77
7227edd
 
117cd77
7227edd
117cd77
7227edd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117cd77
 
7227edd
117cd77
7227edd
117cd77
7227edd
 
 
117cd77
7227edd
 
 
 
 
 
 
 
 
 
117cd77
 
7227edd
117cd77
7227edd
117cd77
7227edd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# genesis/api_clients/ncbi_api.py
import os
import requests
from xml.etree import ElementTree as ET

# Optional NCBI E-utilities API key, read once from the environment when the
# module is imported; None when the NCBI_API_KEY variable is not set.
NCBI_API_KEY = os.environ.get("NCBI_API_KEY")

# Base URL to which the endpoint names used below (esearch.fcgi, efetch.fcgi)
# are appended.
NCBI_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

def _add_api_key(params: dict):
    """
    Attach the configured NCBI API key to a request-parameter dict.

    When the module-level NCBI_API_KEY is truthy it is stored under the
    "api_key" entry of ``params``; otherwise ``params`` is left untouched.
    The dict is modified in place and that same object is returned, so the
    call can be used inline around a dict literal.
    """
    if not NCBI_API_KEY:
        # No key configured: send the request without one.
        return params

    params["api_key"] = NCBI_API_KEY
    return params


def search_pubmed(query: str, max_results: int = 10, *, timeout: float = 30.0):
    """
    Search PubMed via NCBI Entrez (esearch) for a query string.

    Args:
        query: Search term, sent unchanged as the ``term`` parameter.
        max_results: Maximum number of IDs to request (``retmax``).
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The list found under ``esearchresult.idlist`` in the JSON response
        (the PubMed IDs), or an empty list when that field is absent.

    Raises:
        requests.HTTPError: if the response has an HTTP error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    params = _add_api_key({
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
    })
    r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    data = r.json()
    return data.get("esearchresult", {}).get("idlist", [])


def _element_text(element):
    """Return all text inside ``element``, including text of nested tags."""
    if element is None:
        return ""
    # itertext() also yields the text inside and after inline markup such as
    # <i>/<sub>/<sup>; ``element.text`` alone stops at the first child tag.
    return "".join(element.itertext())


def _author_name(author):
    """Build a display name for one <Author> element."""
    name = f"{author.findtext('ForeName', '')} {author.findtext('LastName', '')}".strip()
    if not name:
        # Group authors carry a <CollectiveName> instead of personal names.
        name = (author.findtext("CollectiveName", "") or "").strip()
    return name


def _parse_pubmed_article(article):
    """Convert one <PubmedArticle> element into a plain dict."""
    return {
        "pmid": article.findtext(".//PMID", default=""),
        "title": _element_text(article.find(".//ArticleTitle")),
        "abstract": " ".join(
            _element_text(section) for section in article.findall(".//AbstractText")
        ),
        "authors": [_author_name(a) for a in article.findall(".//Author")],
    }


def fetch_pubmed_details(pmids: list, *, timeout: float = 30.0):
    """
    Fetch abstracts, authors, and metadata for given PMIDs.

    Args:
        pmids: PubMed IDs to look up. Items may be strings or integers; a
            single bare string is treated as one ID rather than being split
            into characters.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list with one dict per ``<PubmedArticle>`` element in the efetch
        XML response, each with the keys ``"pmid"``, ``"title"``,
        ``"abstract"`` (all ``<AbstractText>`` sections joined by a space)
        and ``"authors"`` (list of names). Returns an empty list, without
        making a request, when ``pmids`` is empty or None.

    Raises:
        requests.HTTPError: if the response has an HTTP error status.
        requests.Timeout: if the server does not answer within ``timeout``.
        xml.etree.ElementTree.ParseError: if the response is not valid XML.
    """
    if not pmids:
        return []

    if isinstance(pmids, (str, int)):
        # Joining a bare string would send "1,2,3" for "123".
        pmids = [pmids]

    params = _add_api_key({
        "db": "pubmed",
        # str() so integer IDs do not make join() raise TypeError.
        "id": ",".join(str(pmid) for pmid in pmids),
        "retmode": "xml",
    })
    r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()

    # Parse the raw bytes so the XML parser applies the encoding declared in
    # the document instead of the charset requests guessed for ``r.text``.
    root = ET.fromstring(r.content)
    return [
        _parse_pubmed_article(article)
        for article in root.findall(".//PubmedArticle")
    ]


def fetch_gene_info(gene_id: str, *, timeout: float = 30.0):
    """
    Fetch gene metadata from NCBI Gene database.

    Args:
        gene_id: Identifier sent as the ``id`` parameter of an efetch
            request against ``db=gene``.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text. The request asks for ``retmode=xml`` and
        the result is returned unparsed.

    Raises:
        requests.HTTPError: if the response has an HTTP error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    params = _add_api_key({
        "db": "gene",
        "id": gene_id,
        "retmode": "xml",
    })
    r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    return r.text  # XML - downstream parsing can be done in pipeline


def search_gene_by_symbol(
    symbol: str,
    organism: str = None,
    max_results: int = 5,
    *,
    timeout: float = 30.0,
):
    """
    Search for a gene by symbol, optionally filtered by organism.

    Args:
        symbol: Text used as the search term.
        organism: When given (and non-empty), the term is extended with
            `` AND {organism}[Organism]``.
        max_results: Maximum number of IDs to request (``retmax``).
            Defaults to 5, the value that was previously hard-coded.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The list found under ``esearchresult.idlist`` in the JSON response,
        or an empty list when that field is absent.

    Raises:
        requests.HTTPError: if the response has an HTTP error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    term = symbol
    if organism:
        term += f" AND {organism}[Organism]"

    params = _add_api_key({
        "db": "gene",
        "term": term,
        "retmax": max_results,
        "retmode": "json",
    })
    r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    data = r.json()
    return data.get("esearchresult", {}).get("idlist", [])


def fetch_protein_info(protein_id: str, *, timeout: float = 30.0):
    """
    Fetch protein metadata from NCBI Protein database.

    Args:
        protein_id: Identifier sent as the ``id`` parameter of an efetch
            request against ``db=protein``.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text. The request asks for ``retmode=xml`` and
        the result is returned unparsed.

    Raises:
        requests.HTTPError: if the response has an HTTP error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    params = _add_api_key({
        "db": "protein",
        "id": protein_id,
        "retmode": "xml",
    })
    r = requests.get(f"{NCBI_BASE}/efetch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    return r.text


def search_protein_by_name(
    name: str,
    organism: str = None,
    max_results: int = 5,
    *,
    timeout: float = 30.0,
):
    """
    Search for proteins by name, optionally filtered by organism.

    Args:
        name: Text used as the search term.
        organism: When given (and non-empty), the term is extended with
            `` AND {organism}[Organism]``.
        max_results: Maximum number of IDs to request (``retmax``).
            Defaults to 5, the value that was previously hard-coded.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The list found under ``esearchresult.idlist`` in the JSON response,
        or an empty list when that field is absent.

    Raises:
        requests.HTTPError: if the response has an HTTP error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    term = name
    if organism:
        term += f" AND {organism}[Organism]"

    params = _add_api_key({
        "db": "protein",
        "term": term,
        "retmax": max_results,
        "retmode": "json",
    })
    r = requests.get(f"{NCBI_BASE}/esearch.fcgi", params=params, timeout=timeout)
    r.raise_for_status()
    data = r.json()
    return data.get("esearchresult", {}).get("idlist", [])