elna / app /tools /literature.py
Pau Rué
feat: include publication type and mesh terms to article metadata
3865f47
from xml.etree import ElementTree
import httpx
from pydantic_ai import ModelRetry
from tenacity import retry, stop_after_attempt, wait_random_exponential
from app.config import settings
from app.tools.utils import generate_id
@retry(
    stop=stop_after_attempt(3),
    wait=wait_random_exponential(multiplier=0.5, max=10),
    reraise=True,
)
def _fetch_semantic_scholar(
    query: str, top_k: int, min_citation_count: int
) -> list[dict]:
    """Run one Semantic Scholar paper search and return the raw ``data`` list.

    Only the HTTP round-trip lives under the retry decorator, so transient
    transport/HTTP failures are retried while an empty result is not.
    """
    headers = {}
    # The API key is optional; without it the request is sent unauthenticated.
    if api_key := settings.semantic_scholar_api_key:
        headers["x-api-key"] = api_key
    resp = httpx.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params={
            "query": query,
            "limit": top_k,
            "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount,influentialCitationCount",
            "fieldsOfStudy": "Medicine,Biology",
            "minCitationCount": min_citation_count,
        },
        headers=headers,
        timeout=10.0,
    )
    resp.raise_for_status()
    return resp.json().get("data", [])


def search_semantic_scholar(
    query: str, top_k: int = 20, min_citation_count: int = 5
) -> list[dict]:
    """Search Semantic Scholar for Medicine/Biology papers matching ``query``.

    Args:
        query: Free-text search query.
        top_k: Maximum number of papers to request (sent as ``limit``).
        min_citation_count: Minimum citation count filter
            (sent as ``minCitationCount``).

    Returns:
        The non-empty ``data`` list from the API response, one dict per paper
        with the requested fields.

    Raises:
        ModelRetry: If the search returns no papers. Raised immediately: the
            same query would return the same empty result, so it is not
            retried with backoff.
        Exception: The last error from the HTTP call (e.g. an ``httpx``
            transport or status error) once all three attempts have failed.
    """
    data = _fetch_semantic_scholar(query, top_k, min_citation_count)
    if not data:
        raise ModelRetry(f"No results for '{query}'. Try different keywords.")
    return data
@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
def get_pubmed_metadata(pmids: list[int]) -> dict[str, dict]:
    """Fetch abstracts, publication types and MeSH terms from PubMed (EFetch).

    Args:
        pmids: PubMed IDs to look up. They are passed straight through as the
            ``id`` query parameter.

    Returns:
        A dict keyed by the PubMed ID *as a string* (as it appears in the
        XML). Each value has:
          - ``abstract``: Markdown text; labelled abstract sections become
            ``## <Label>`` headings.
          - ``publication_types``: list of publication type names.
          - ``mesh_terms``: flat list of MeSH descriptor and qualifier names.
        An empty dict is returned when ``pmids`` is empty.

    Raises:
        Exception: Whatever tenacity raises once all five attempts have
            failed (``reraise=True`` is not set on this decorator).
    """
    # Nothing to look up: avoid sending an EFetch request without IDs, which
    # would only fail and then be retried.
    if not pmids:
        return {}

    resp = httpx.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params={"db": "pubmed", "id": pmids, "retmode": "xml"},
    )
    resp.raise_for_status()
    root = ElementTree.fromstring(resp.text)

    results = {}
    for article in root.iter("PubmedArticle"):
        pmid = article.findtext(
            ".//PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']", default=""
        )

        # The abstract is sometimes divided into multiple
        # sections. Concatenate into one Markdown text.
        abstract_parts = []
        for section in article.findall(".//AbstractText"):
            if label := section.attrib.get("Label"):
                abstract_parts.append(f"## {label}\n\n")
            # itertext() also collects text inside and after inline child
            # elements (<i>, <sub>, <sup>, ...); ``.text`` alone stops at the
            # first child element and would truncate the section.
            abstract_parts.append(f"{''.join(section.itertext())}\n\n")
        abstract = "".join(abstract_parts)

        # Extract publication types
        # https://www.nlm.nih.gov/mesh/pubtypes.html
        publication_types = [
            pt.text
            for pt in article.findall(".//PublicationTypeList/PublicationType")
            if pt.text
        ]

        # Extract MeSH terms (DescriptorName and QualifierName)
        mesh_terms = []
        for mesh_heading in article.findall(
            ".//MedlineCitation/MeshHeadingList/MeshHeading"
        ):
            descriptor = mesh_heading.findtext("DescriptorName")
            if descriptor:
                mesh_terms.append(descriptor.strip())
            for qualifier in mesh_heading.findall("QualifierName"):
                if qualifier.text:
                    mesh_terms.append(qualifier.text.strip())

        results[pmid] = {
            "abstract": abstract.strip(),
            "publication_types": publication_types,
            "mesh_terms": mesh_terms,
        }
    return results
def format_publication(publication: dict) -> dict:
    """Reshape a Semantic Scholar paper record into the tool's output format.

    The dict is modified in place and also returned. ``tldr``,
    ``externalIds``, ``citationCount`` and ``influentialCitationCount`` are
    removed and replaced by ``summary``, ``doi``, ``citations`` and
    ``influential_citations``. When a DOI is present, ``url`` is replaced by
    the doi.org link. ``id`` is ``sch-`` followed by ``generate_id`` of the
    final ``url``.

    Args:
        publication: A paper record containing the keys listed above plus
            ``url``.

    Returns:
        The same ``publication`` dict, after modification.
    """
    # ``tldr`` may be null in the record; treat that as "no summary".
    tldr_block = publication.pop("tldr") or {}
    doi = publication.pop("externalIds").get("DOI")

    publication["summary"] = tldr_block.get("text", "")
    publication["citations"] = publication.pop("citationCount")
    publication["influential_citations"] = publication.pop(
        "influentialCitationCount"
    )
    publication["doi"] = doi

    # Prefer the DOI link over the URL already present in the record.
    if doi:
        publication["url"] = f"https://doi.org/{doi}"

    final_url = publication["url"]
    publication["id"] = f"sch-{generate_id(final_url)}"
    return publication
def search_medical_literature(query: str) -> list[dict]:
    """Search medical literature and prioritize high-quality evidence sources.

    CRITICAL: This tool returns literature that varies significantly in evidence quality.
    You MUST prioritize publications with consolidated evidence based on the following criteria:

    **Type of evidence**
    - Gold Standard Evidence
        - Systematic Review
        - Meta-Analysis
        - Randomized Controlled Trial (RCT)
    - High-Quality Clinical Evidence
        - Controlled Clinical Trial
        - Clinical Trial, Phase III
    - Specialized High-Quality Studies
        - Pragmatic Clinical Trial
        - Clinical Trial, Phase II
        - Equivalence Trial
    - Authoritative References
        - Practice Guideline
        - Pharmacopoeia
        - Consensus Development Conference (NIH or not)
    - Other Important Clinical Evidence
        - Clinical Study
        - Observational Study
        - Validation Study
        - Comparative Study
        - Case Reports
        - Multicenter Study
        - Evaluation Study

    **Credibility of the publisher**:
    - Top general medicine journals
        - The Lancet (or any of its specialty journals)
        - New England Journal of Medicine (NEJM, or any of its specialty journals)
        - Nature Medicine (or any of its medical specialty journals)
        - Journal of the American Medical Association (JAMA, or any of its specialty journals)
        - BMJ
    - Top specialized medicine journals
        - Journal of Clinical Oncology
        - European Heart Journal
        - Circulation
        - Journal of the American College of Cardiology
        - Cancer Cell
        - Annals of Oncology
        - Gastroenterology
        - International Journal of Epidemiology
        - Blood
        - Molecular Psychiatry
        - Journal of the National Cancer Institute
        - Gut
        - Cancer Discovery
        - Clinical Cancer Research
        - Science Translational Medicine
        - Immunity
        - Brain
        - Yearbook of Paediatric Endocrinology
        - Journal of Allergy and Clinical Immunology
        - Annals of Internal Medicine
        - Journal of Clinical Investigation
        - Alzheimer's and Dementia
        - Journal of Hepatology
        - Clinical Infectious Diseases
        - Hepatology
        - Neurology
        - PLoS Medicine
        - Annals of the Rheumatic Diseases
        - Leukemia
        - European Urology
        - Biological Psychiatry
        - Cell Metabolism
        - American Journal of Psychiatry
        - American Journal of Respiratory and Critical Care Medicine
        - European Journal of Heart Failure
        - Journal for ImmunoTherapy of Cancer
        - European Respiratory Journal
        - American Journal of Epidemiology
        - Annals of Neurology
        - Kidney International
        - Diabetes Care
        - Acta Neuropathologica
        - Cancer
        - JCI insight
        - Frontiers in Immunology
        - European Journal of Cancer
        - Journal of Thoracic Oncology
        - Journal of the National Comprehensive Cancer Network : JNCCN
        - Genetics in Medicine
        - Science Immunology
        - Blood advances
        - Journal of the American Heart Association
        - Hypertension
        - Intensive Care Medicine
        - BMC Medicine
        - Circulation Research
        - Arthritis & Rheumatology
        - Diabetologia
        - Journal of the American Society of Nephrology (JASN)
        - Journal of Clinical Endocrinology and Metabolism
        - Genome Medicine
        - Journal of Experimental Medicine
        - American Heart Journal
        - Clinical Gastroenterology and Hepatology
        - Nutrients
        - Diabetes
        - British Journal of Cancer
        - Obstetrical and Gynecological Survey
        - Annals of Surgery
        - Haematologica

    **Reputation of the authors**
    Prioritize publications from professional societies:
    - World Health Organization (WHO)
    - World Medical Association (WMA)
    - Centers for Disease Control and Prevention (CDC)
    - National Institutes of Health (NIH)
    - U.S. Preventive Services Task Force (USPSTF)
    - American College of Physicians (ACP)
    - National Medical Association (NMA)
    - American College of Cardiology (ACC)
    - American Heart Association (AHA)
    - American Society of Clinical Oncology (ASCO)
    - National Comprehensive Cancer Network (NCCN)
    - Infectious Diseases Society of America (IDSA)
    - American Academy of Pediatrics (AAP)
    - American College of Obstetricians and Gynecologists (ACOG)
    - American Psychiatric Association (APA)
    - American College of Surgeons (ACS)
    - American College of Emergency Physicians (ACEP)
    - American Academy of Neurology (AAN)
    - Endocrine Society
    - National Institute for Health and Care Excellence (NICE)
    - European Medical Association (EMA)
    - European Union of Medical Specialists (UEMS)
    - European Medicines Agency (EMA)
    - European Society of Cardiology (ESC)
    - European Respiratory Society (ERS)
    - European Society of Anaesthesiology and Intensive Care (ESAIC)
    - European Academy of Neurology (EAN)
    - European Society for Medical Oncology (ESMO)
    - European Association for the Study of the Liver (EASL)
    - European Society of Clinical Microbiology and Infectious Diseases (ESCMID)
    - European Association of Urology (EAU)
    - European Society of Endocrinology (ESE)
    - European Paediatric Association (EPA/UNEPSA)
    - European Society of Human Reproduction and Embryology (ESHRE)
    - European Federation of Internal Medicine (EFIM)
    - European Stroke Organisation (ESO)
    - European Psychiatric Association (EPA)
    - European Society of Radiology (ESR)
    - European Hematology Association (EHA)
    - European Society for Emergency Medicine (EUSEM)

    EVIDENCE PRIORITIZATION (when analyzing results):
    To the extent possible, the answer should be grounded in top-tier evidence provided by reputable authors and medical societies
    and published in reputable journals.

    SEARCH OPTIMIZATION GUIDELINES:
    1. **Medical Term Extraction**: Focus on core medical concepts, conditions,
       procedures, and medications from the clinical query
    2. **Broad Conceptual Scope**: Use 2-4 core medical terms. Avoid overly
       specific modifiers like "criteria," "indicators," "guidelines,"
       "recommendations," "treatment," or "management"
    3. **Medical Terminology**: Convert colloquial terms to precise medical
       terminology for better literature retrieval
    4. **Search Strategy**: Construct queries that will capture both guidelines
       AND research studies to ensure comprehensive evidence coverage

    SEARCH EXAMPLES:
    - Query: "ACE inhibitor side effects diabetes"
      (captures both guidelines and studies on ACE inhibitors in diabetic patients)
    - Query: "anticoagulation perioperative management elderly"
      (broad enough to find guidelines and RCTs on perioperative anticoagulation)

    Args:
        query: Medical keywords, topic, or concept for literature search.
            Should focus on clinical concepts rather than specific modifiers.

    Returns:
        List of publications with varying evidence quality. Each contains:
        - title, abstract, venue, year, citation counts
        - id (for citation), doi, url
        - summary (TLDR when available)
        - publication_types, mesh_terms (only for publications found on PubMed)

        IMPORTANT: Examine citation counts, venue, and content to identify
        high-quality sources (guidelines, large RCTs) for response prioritization.
    """
    # NOTE(review): the docstring above reads like a tool description that is
    # shown to a model; treat wording changes to it as behaviour changes.
    publications = search_semantic_scholar(query=query, top_k=20)

    pmids = [
        publication["externalIds"]["PubMed"]
        for publication in publications
        if publication["externalIds"].get("PubMed")
    ]
    # Skip the PubMed round-trip entirely when no result carries a PubMed ID.
    pubmed_metadata = get_pubmed_metadata(pmids) if pmids else {}

    outputs = []
    for publication in publications:
        if metadata := pubmed_metadata.get(publication["externalIds"].get("PubMed")):
            # Abstracts on PubMed are more complete than the
            # ones returned from Semantic Scholar. Keep the Semantic Scholar
            # abstract when PubMed has none, rather than blanking it out.
            if pubmed_abstract := metadata.get("abstract"):
                publication["abstract"] = pubmed_abstract
            publication["publication_types"] = metadata.get("publication_types")
            publication["mesh_terms"] = metadata.get("mesh_terms")
        outputs.append(format_publication(publication))
    return outputs