Spaces:
Running
Running
File size: 5,749 Bytes
372531f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import os
import xml.etree.ElementTree as ET
import requests
class PubMedCentralSearch:
    """
    PubMed Central API Retriever.

    Searches PubMed Central through the NCBI E-utilities ``esearch`` endpoint,
    downloads each hit through ``efetch`` and turns the returned XML into
    short text snippets (title, abstract and the beginning of the body).
    """

    ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    # Seconds to wait for the server before giving up. Without a timeout a
    # stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, query):
        """
        Initializes the PubMedCentralSearch object.

        Args:
            query: The search query.

        Raises:
            Exception: If the NCBI_API_KEY environment variable is not set.
        """
        self.query = query
        self.api_key = self._retrieve_api_key()

    def _retrieve_api_key(self):
        """
        Retrieves the NCBI API key from environment variables.

        Returns:
            The API key.

        Raises:
            Exception: If the API key is not found.
        """
        try:
            return os.environ["NCBI_API_KEY"]
        except KeyError as err:
            raise Exception(
                "NCBI API key not found. Please set the NCBI_API_KEY environment variable. "
                "You can obtain your key from https://www.ncbi.nlm.nih.gov/account/"
            ) from err

    def _get(self, url, params):
        """
        Performs a GET request and rejects any non-200 response.

        Args:
            url: Endpoint to call.
            params: Query-string parameters.

        Returns:
            The ``requests`` response object.

        Raises:
            Exception: If the response status code is not 200.
        """
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise Exception(
                f"Failed to retrieve data: {response.status_code} - {response.text}"
            )
        return response

    def search(self, max_results=10):
        """
        Searches the query using the PubMed Central API.

        Every returned ID is fetched individually; articles without an
        ``<article>`` element or without body content are skipped.

        Args:
            max_results: The maximum number of results to return.

        Returns:
            A list of dicts with the keys ``href`` (article URL) and ``body``
            (title, abstract and the first 500 characters of the body text).

        Raises:
            Exception: If a request returns a non-200 status code.
        """
        params = {
            "db": "pmc",
            "term": f"{self.query} AND free fulltext[filter]",
            "retmax": max_results,
            "usehistory": "y",
            "api_key": self.api_key,
            "retmode": "json",
            "sort": "relevance",
        }
        results = self._get(self.ESEARCH_URL, params).json()
        ids = results["esearchresult"]["idlist"]

        search_response = []
        for article_id in ids:
            # Parse the document once and reuse the tree for both the
            # body check and the text extraction.
            article = self._find_article(self.fetch([article_id]))
            if article is None or not self._has_body(article):
                continue
            article_data = self._extract_article(article)
            search_response.append(
                {
                    "href": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{article_id}/",
                    "body": f"{article_data['title']}\n\n{article_data['abstract']}\n\n{article_data['body'][:500]}...",
                }
            )
            if len(search_response) >= max_results:
                break
        return search_response

    def fetch(self, ids):
        """
        Fetches the full text content for given article IDs.

        Args:
            ids: List of article IDs (strings or integers). A single string
                is treated as one ID.

        Returns:
            XML content of the articles.

        Raises:
            Exception: If the request returns a non-200 status code.
        """
        if isinstance(ids, str):
            # Joining a bare string would split it into single characters.
            ids = [ids]
        params = {
            "db": "pmc",
            "id": ",".join(str(article_id) for article_id in ids),
            "retmode": "xml",
            "api_key": self.api_key,
        }
        return self._get(self.EFETCH_URL, params).text

    def has_body_content(self, xml_content):
        """
        Checks if the XML content has a body section.

        Args:
            xml_content: XML content of the article.

        Returns:
            True if the first ``<article>`` has a ``<body>`` element or, failing
            that, at least one non-empty paragraph inside a ``<sec>``. False
            otherwise, including when the XML cannot be parsed.
        """
        article = self._find_article(xml_content)
        if article is None:
            return False
        return self._has_body(article)

    def parse_xml(self, xml_content):
        """
        Parses the XML content to extract title, abstract, and body.

        Args:
            xml_content: XML content of the article.

        Returns:
            Dictionary containing title, abstract, and body text, or None when
            the XML cannot be parsed or contains no ``<article>`` element.
        """
        article = self._find_article(xml_content)
        if article is None:
            return None
        return self._extract_article(article)

    @staticmethod
    def _find_article(xml_content):
        """Returns the first ``<article>`` child of the root, or None."""
        try:
            root = ET.fromstring(xml_content)
        except ET.ParseError:
            # One unparsable document should not abort a whole search.
            return None
        return root.find("article")

    @staticmethod
    def _flatten_text(element):
        """Returns all text below ``element`` with whitespace collapsed."""
        # itertext() also yields text inside and after inline markup such as
        # <italic> or <xref>; element.text alone stops at the first child.
        return " ".join("".join(element.itertext()).split())

    def _paragraph_texts(self, containers):
        """Returns the non-empty ``<p>`` texts below ``containers``, each once."""
        seen = set()
        texts = []
        for container in containers:
            for paragraph in container.iter("p"):
                if id(paragraph) in seen:
                    continue
                # Mark the paragraph and any <p> nested inside it: nested
                # sections (or nested paragraphs) would otherwise repeat text.
                seen.update(id(inner) for inner in paragraph.iter("p"))
                text = self._flatten_text(paragraph)
                if text:
                    texts.append(text)
        return texts

    def _has_body(self, article):
        """Returns True if ``article`` has a ``<body>`` or paragraph text in a ``<sec>``."""
        if article.find(".//body") is not None:
            return True
        return bool(self._paragraph_texts(article.iter("sec")))

    def _extract_article(self, article):
        """Builds the title/abstract/body dictionary for one ``<article>``."""
        title_elem = article.find(".//title-group/article-title")
        title = self._flatten_text(title_elem) if title_elem is not None else ""

        abstract_elem = article.find(".//abstract")
        abstract_text = (
            "".join(abstract_elem.itertext()).strip()
            if abstract_elem is not None
            else ""
        )

        body_elem = article.find(".//body")
        # Without a <body>, fall back to paragraphs found in any <sec>.
        containers = [body_elem] if body_elem is not None else article.iter("sec")
        paragraphs = self._paragraph_texts(containers)

        return {
            "title": title,
            "abstract": abstract_text,
            "body": "\n".join(paragraphs),
        }
|