File size: 3,621 Bytes
f2c42a8 70c3a64 f2c42a8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
from xml.etree import ElementTree
import httpx
from tenacity import retry, stop_after_attempt, wait_random_exponential
@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
def search_semantic_scholar(
    query: str, top_k: int = 20, min_citation_count: int = 20
) -> list[dict]:
    """Search the Semantic Scholar Graph API for papers matching ``query``.

    The request is limited to the "Medicine" and "Biology" fields of study
    and to papers with at least ``min_citation_count`` citations. The call
    is wrapped in a tenacity retry: up to 5 attempts with a randomized
    exponential wait (multiplier 0.5, capped at 10) between them.

    Args:
        query: Free-text search query sent as the ``query`` parameter.
        top_k: Value sent as the ``limit`` parameter (maximum results).
        min_citation_count: Value sent as the ``minCitationCount`` parameter.

    Returns:
        The list stored under the ``"data"`` key of the JSON response, or
        an empty list when that key is absent.
    """
    endpoint = "https://api.semanticscholar.org/graph/v1/paper/search"
    search_params = {
        "query": query,
        "limit": top_k,
        "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount,influentialCitationCount",
        "fieldsOfStudy": "Medicine,Biology",
        "minCitationCount": min_citation_count,
    }
    response = httpx.get(endpoint, params=search_params, timeout=10.0)
    # A non-2xx status raises here, which in turn triggers the retry decorator.
    response.raise_for_status()
    payload = response.json()
    return payload.get("data", [])
@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
def get_pubmed_abstracts(pmids: list[int]) -> dict[str, str]:
    """Fetch abstracts for the given PubMed IDs via NCBI EFetch.

    Referenced `pymed` library for parsing the xml.

    Each ``AbstractText`` section of an article is appended to that
    article's abstract; sections carrying a ``Label`` attribute are
    preceded by a ``## <label>`` heading. The call is wrapped in a tenacity
    retry: up to 5 attempts with a randomized exponential wait between them.

    Args:
        pmids: PubMed IDs to look up. Each item is converted with ``str()``
            before being sent, so string IDs work as well as integers.

    Returns:
        A mapping from PubMed ID (as the string found in the XML) to the
        assembled abstract text. Empty when ``pmids`` is empty.
    """
    if not pmids:
        # Nothing to look up: avoid sending EFetch a request without any IDs.
        return {}

    resp = httpx.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params={
            "db": "pubmed",
            # EFetch documents `id` as a single UID or a comma-delimited list.
            "id": ",".join(str(pmid) for pmid in pmids),
            "retmode": "xml",
        },
        # Same explicit timeout as the Semantic Scholar request in this file.
        timeout=10.0,
    )
    resp.raise_for_status()
    root = ElementTree.fromstring(resp.text)

    abstracts = {}
    for article in root.iter("PubmedArticle"):
        pmid = article.findtext(
            ".//PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']", default=""
        )
        parts = []
        for section in article.findall(".//AbstractText"):
            if label := section.attrib.get("Label"):
                parts.append(f"## {label}\n\n")
            # itertext() also collects text inside and after inline child
            # elements (e.g. <i>, <sub>); `.text` alone would stop at the
            # first child and truncate the section.
            section_text = "".join(section.itertext())
            parts.append(f"{section_text}\n\n")
        abstracts[pmid] = "".join(parts).strip()
    return abstracts
def format_publication(publication: dict) -> str:
    """Render one publication record as a ``<publication>`` text block.

    Fields whose value is ``None`` (``tldr``, ``abstract``, ``venue``,
    ``year``, ``externalIds``) are rendered as empty text rather than the
    literal string ``"None"``.

    Args:
        publication: A dict with the keys ``title``, ``tldr``, ``abstract``,
            ``venue``, ``year``, ``citationCount``,
            ``influentialCitationCount``, ``externalIds`` and ``url``.

    Returns:
        A multi-line string containing the title, URL, summary, abstract,
        venue, year and citation counts. The URL is ``https://doi.org/<DOI>``
        when ``externalIds`` holds a DOI, otherwise the record's ``url``.

    Raises:
        KeyError: If one of the expected keys is missing from ``publication``.
    """
    title = publication["title"]
    # `tldr` may be null, and so may its `text`; fall back to empty text.
    summary = (publication["tldr"] or {}).get("text") or ""
    abstract = publication["abstract"] or ""
    venue = publication["venue"] or ""
    year = publication["year"]
    if year is None:
        year = ""
    citations = publication["citationCount"]
    influential_citations = publication["influentialCitationCount"]
    # Guard against a null `externalIds` the same way `tldr` is guarded.
    doi = (publication["externalIds"] or {}).get("DOI")
    url = f"https://doi.org/{doi}" if doi else publication["url"]
    return (
        f"<publication title={title}>\n<url>{url}</url>\n"
        f"<summary>{summary}</summary>\n<abstract>{abstract}</abstract>\n"
        f"<venue>{venue}</venue>\n<year>{year}</year>\n"
        f"<citationCount>{citations}</citationCount>\n<influentialCitationCount>{influential_citations}</influentialCitationCount>\n"
        "</publication>"
    )
def search_medical_literature(query: str) -> str:
    """Get medical literature related to the query.

    Searches Semantic Scholar, then replaces a paper's abstract with the
    one fetched from PubMed whenever the paper has a PubMed ID and a
    non-empty PubMed abstract was returned for it.

    Args:
        query: keywords, a topic, or a concept to search
            for medical literature.

    Returns:
        A list of papers and their details, including title,
        abstract, publication venue, citation numbers, etc., formatted as
        a single ``<publications>`` text block, or the string
        "No literature found" when the search returns nothing.
    """
    publications = search_semantic_scholar(query=query, top_k=20)

    # PubMed IDs of the hits that have one; `externalIds` may be null.
    pmids = [
        pmid
        for publication in publications
        if (pmid := (publication["externalIds"] or {}).get("PubMed"))
    ]
    # Skip the PubMed round-trip entirely when there is nothing to look up.
    pubmed_abstracts = get_pubmed_abstracts(pmids) if pmids else {}

    outputs = []
    for publication in publications:
        pmid = (publication["externalIds"] or {}).get("PubMed")
        if pubmed_abstract := pubmed_abstracts.get(pmid):
            publication["abstract"] = pubmed_abstract
        outputs.append(format_publication(publication))

    if not outputs:
        return "No literature found"
    # Join outside the f-string: a backslash inside a replacement field is a
    # SyntaxError on Python versions before 3.12.
    joined = "\n".join(outputs)
    return f"<publications>\n{joined}\n</publications>"
|