from xml.etree import ElementTree

import httpx
from tenacity import retry, stop_after_attempt, wait_random_exponential


@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
def search_semantic_scholar(
    query: str, top_k: int = 20, min_citation_count: int = 20
) -> list[dict]:
    """Search Semantic Scholar for medicine/biology papers matching the query."""
    resp = httpx.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params={
            "query": query,
            "limit": top_k,
            "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount,influentialCitationCount",
            "fieldsOfStudy": "Medicine,Biology",
            "minCitationCount": min_citation_count,
        },
        timeout=10.0,
    )
    resp.raise_for_status()
    return resp.json().get("data", [])


@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
def get_pubmed_abstracts(pmids: list[str]) -> dict[str, str]:
    """Fetch abstracts from PubMed via E-utilities, keyed by PMID.

    XML parsing adapted from the `pymed` library's approach.
    """
    resp = httpx.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        # efetch expects a comma-delimited list of UIDs in a single `id` parameter.
        params={"db": "pubmed", "id": ",".join(map(str, pmids)), "retmode": "xml"},
        timeout=10.0,
    )
    resp.raise_for_status()
    root = ElementTree.fromstring(resp.text)

    abstracts = {}
    for article in root.iter("PubmedArticle"):
        abstract = ""
        pmid = article.findtext(
            ".//PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']", default=""
        )
        # Preserve structured-abstract section labels (e.g. BACKGROUND, METHODS).
        for text in article.findall(".//AbstractText"):
            if label := text.attrib.get("Label"):
                abstract += f"## {label}\n\n"
            abstract += f"{text.text or ''}\n\n"
        abstracts[pmid] = abstract.strip()

    return abstracts


def format_publication(publication: dict) -> str:
    """Render a single Semantic Scholar record as an XML-style block."""
    title = publication["title"]
    summary = (publication.get("tldr") or {}).get("text", "")
    abstract = publication.get("abstract") or ""
    venue = publication["venue"]
    year = publication["year"]
    citations = publication["citationCount"]
    influential_citations = publication["influentialCitationCount"]
    doi = publication["externalIds"].get("DOI")
    # Prefer a DOI link when available; otherwise fall back to the Semantic Scholar URL.
    url = f"https://doi.org/{doi}" if doi else publication["url"]
    return (
        f'<publication title="{title}">\n<url>{url}</url>\n'
        f"<summary>{summary}</summary>\n<abstract>{abstract}</abstract>\n"
        f"<venue>{venue}</venue>\n<year>{year}</year>\n"
        f"<citationCount>{citations}</citationCount>\n"
        f"<influentialCitationCount>{influential_citations}</influentialCitationCount>\n"
        "</publication>"
    )


def search_medical_literature(query: str) -> str:
    """Get medical literature related to the query.

    Args:
        query: keywords, a topic, or a concept to search
            for medical literature.

    Returns:
        A formatted list of papers and their details, including title,
        abstract, publication venue, citation counts, etc.
    """
    publications = search_semantic_scholar(query=query, top_k=20)
    pmids = [
        publication["externalIds"]["PubMed"]
        for publication in publications
        if publication["externalIds"].get("PubMed")
    ]
    # Semantic Scholar abstracts are often missing; backfill them from PubMed.
    pubmed_abstracts = get_pubmed_abstracts(pmids) if pmids else {}

    outputs = []
    for publication in publications:
        if pubmed_abstract := pubmed_abstracts.get(
            publication["externalIds"].get("PubMed")
        ):
            publication["abstract"] = pubmed_abstract

        outputs.append(format_publication(publication))

    if not outputs:
        return "No literature found"
    # Join outside the f-string so this also runs on Python versions before 3.12,
    # which disallow backslashes inside f-string expressions.
    joined = "\n".join(outputs)
    return f"<publications>\n{joined}\n</publications>"
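

# Minimal usage sketch (not part of the original module): assumes network access to
# Semantic Scholar and NCBI E-utilities, and that `httpx` and `tenacity` are installed.
# The query string below is only an illustrative placeholder.
if __name__ == "__main__":
    print(search_medical_literature("statin therapy cardiovascular outcomes"))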