elna / app /tools /literature.py
David Chu
refactor: use pydantic settings
00d1644 unverified
raw
history blame
3.62 kB
from xml.etree import ElementTree
import httpx
from tenacity import retry, stop_after_attempt, wait_random_exponential
@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
def search_semantic_scholar(
    query: str, top_k: int = 20, min_citation_count: int = 20
) -> list[dict]:
    """Query the Semantic Scholar paper-search endpoint.

    The search is restricted to the "Medicine" and "Biology" fields of study.
    Any exception raised inside the call (including a non-success HTTP
    status) causes a retry, for up to five attempts with a randomized
    exponential wait between them.

    Args:
        query: Search text, sent as the ``query`` parameter.
        top_k: Sent as the ``limit`` parameter.
        min_citation_count: Sent as the ``minCitationCount`` parameter.

    Returns:
        The ``data`` entry of the JSON response, or an empty list when the
        response carries no ``data`` key.
    """
    search_params = {
        "query": query,
        "limit": top_k,
        "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount,influentialCitationCount",
        "fieldsOfStudy": "Medicine,Biology",
        "minCitationCount": min_citation_count,
    }
    response = httpx.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params=search_params,
        timeout=10.0,
    )
    # Surface 4xx/5xx as exceptions so the retry decorator can act on them.
    response.raise_for_status()
    payload = response.json()
    return payload.get("data", [])
@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
def get_pubmed_abstracts(pmids: list[int]) -> dict[str, str]:
    """Fetch abstracts for the given PubMed ids via NCBI E-utilities efetch.

    Referenced `pymed` library for parsing the xml.

    Args:
        pmids: PubMed ids to look up. Each id is converted with ``str``
            before being sent, so numeric strings work as well.

    Returns:
        Mapping from PubMed id (as it appears in the response XML) to the
        abstract text. Abstract sections that carry a ``Label`` attribute are
        rendered under a ``## <label>`` heading. An empty mapping is returned,
        without any request, when ``pmids`` is empty.
    """
    if not pmids:
        # efetch needs at least one id; skip the round trip (and the retries
        # a failing request would trigger) when there is nothing to look up.
        return {}

    resp = httpx.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params={
            "db": "pubmed",
            # Send the ids in the comma-delimited form the E-utilities
            # documentation specifies instead of relying on list encoding.
            "id": ",".join(str(pmid) for pmid in pmids),
            "retmode": "xml",
        },
        # Same explicit timeout as the Semantic Scholar request in this module.
        timeout=10.0,
    )
    resp.raise_for_status()

    root = ElementTree.fromstring(resp.text)
    abstracts: dict[str, str] = {}
    for article in root.iter("PubmedArticle"):
        pmid = article.findtext(
            ".//PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']", default=""
        )
        parts = []
        for section in article.findall(".//AbstractText"):
            if label := section.attrib.get("Label"):
                parts.append(f"## {label}\n\n")
            # `.text` stops at the first child element, which truncates
            # abstracts containing inline markup (<i>, <sub>, <sup>, ...);
            # itertext() walks the element's whole text content instead.
            section_text = "".join(section.itertext())
            parts.append(f"{section_text}\n\n")
        abstracts[pmid] = "".join(parts).strip()
    return abstracts
def format_publication(publication: dict) -> str:
    """Render one publication record as an XML-like text block.

    Args:
        publication: Record with the keys ``title``, ``tldr``, ``abstract``,
            ``venue``, ``year``, ``citationCount``,
            ``influentialCitationCount``, ``externalIds`` and ``url``.
            ``tldr``, ``abstract`` and ``externalIds`` may be ``None``.

    Returns:
        A ``<publication>`` element as text. The URL is the DOI link when a
        DOI is present in ``externalIds`` and the record's own ``url``
        otherwise. A missing summary or abstract is rendered as an empty
        element. Remaining fields are interpolated as-is, without escaping.

    Raises:
        KeyError: If one of the expected keys is absent from ``publication``.
    """
    title = publication["title"]
    # `or ""` keeps a null value from being interpolated as the text "None".
    summary = (publication["tldr"] or {}).get("text") or ""
    abstract = publication["abstract"] or ""
    venue = publication["venue"]
    year = publication["year"]
    citations = publication["citationCount"]
    influential_citations = publication["influentialCitationCount"]
    # Tolerate a null id block the same way a null `tldr` is tolerated.
    external_ids = publication["externalIds"] or {}
    doi = external_ids.get("DOI")
    url = f"https://doi.org/{doi}" if doi else publication["url"]
    return (
        f"<publication title={title}>\n<url>{url}</url>\n"
        f"<summary>{summary}</summary>\n<abstract>{abstract}</abstract>\n"
        f"<venue>{venue}</venue>\n<year>{year}</year>\n"
        f"<citationCount>{citations}</citationCount>\n<influentialCitationCount>{influential_citations}</influentialCitationCount>\n"
        "</publication>"
    )
def search_medical_literature(query: str) -> str:
    """Get medical literature related to the query.

    Args:
        query: keywords, a topic, or a concept to search
            for medical literature.

    Returns:
        A list of papers and their details, including title,
        abstract, publication venue, citation numbers, etc.
    """
    # NOTE(review): docstring wording kept as-is -- it reads like a tool
    # description that may be consumed at runtime; confirm before rewording.
    publications = search_semantic_scholar(query=query, top_k=20)

    # `or {}` tolerates a record whose id block is null.
    pmids = [
        pmid
        for publication in publications
        if (pmid := (publication["externalIds"] or {}).get("PubMed"))
    ]
    # Only query PubMed when there is something to look up. Otherwise an
    # id-less request would be sent (and retried on failure), which could
    # keep the "No literature found" result below from ever being returned.
    pubmed_abstracts = get_pubmed_abstracts(pmids) if pmids else {}

    outputs = []
    for publication in publications:
        pmid = (publication["externalIds"] or {}).get("PubMed")
        # Prefer the PubMed abstract over the search result's when available.
        if pubmed_abstract := pubmed_abstracts.get(pmid):
            publication["abstract"] = pubmed_abstract
        outputs.append(format_publication(publication))

    if not outputs:
        return "No literature found"

    # Joined outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    joined = "\n".join(outputs)
    return f"<publications>\n{joined}\n</publications>"