David Chu committed on
Commit
f2c42a8
·
unverified ·
1 Parent(s): bd5557a

feat: add abstract from pubmed

Browse files
main.py CHANGED
@@ -7,7 +7,7 @@ import streamlit as st
7
  from google import genai
8
  from google.genai import types
9
 
10
- from tools import dailymed, semantic_scholar
11
 
12
  SYSTEM_INSTRUCTION = Path("system_instruction.txt").read_text()
13
 
@@ -17,7 +17,7 @@ def respond(client: genai.Client, query: str) -> str:
17
  tools=[
18
  dailymed.find_drug_set_ids,
19
  dailymed.find_drug_instruction,
20
- semantic_scholar.search_journal_articles,
21
  ],
22
  system_instruction=SYSTEM_INSTRUCTION,
23
  )
 
7
  from google import genai
8
  from google.genai import types
9
 
10
+ from tools import dailymed, literature
11
 
12
  SYSTEM_INSTRUCTION = Path("system_instruction.txt").read_text()
13
 
 
17
  tools=[
18
  dailymed.find_drug_set_ids,
19
  dailymed.find_drug_instruction,
20
+ literature.search_medical_literature,
21
  ],
22
  system_instruction=SYSTEM_INSTRUCTION,
23
  )
pyproject.toml CHANGED
@@ -10,6 +10,7 @@ dependencies = [
10
  "pydantic>=2.11.4",
11
  "python-fasthtml>=0.12.15",
12
  "streamlit>=1.45.0",
 
13
  ]
14
 
15
  [tool.ruff.lint]
 
10
  "pydantic>=2.11.4",
11
  "python-fasthtml>=0.12.15",
12
  "streamlit>=1.45.0",
13
+ "tenacity>=9.1.2",
14
  ]
15
 
16
  [tool.ruff.lint]
system_instruction.txt CHANGED
@@ -1,12 +1,12 @@
1
- You are a medical research expert. Provide a concise answer to the query below, using no more than 250 words.
2
 
3
- Base every claim or statement strictly on the sources returned from the tool calls. For each claim, include a citation referencing the source's ID (do not include the citation in the `text` field). A claim may be supported by one or multiple sources, but only cite sources that directly support the claim. Do not add unnecessary citations.
4
 
5
- You may use markdown formatting, such as **, to highlight key parts of the text. Do not return the response in a markdown code block.
6
 
7
- If none of the sources contain relevant information to answer the query, politely inform the user that an answer cannot be provided, and do not use any citations.
8
 
9
- If the query is not related to medicine, politely decline to answer.
10
 
11
  Produce JSON matching this specification:
12
 
 
1
+ You are a medical research expert. Your audience is medical professionals.
2
 
3
+ Provide a concise answer to medical-related queries, using no more than 250 words. Emphasize readability so the reader can grasp the key points quickly. You may use markdown formatting, such as **, to highlight key parts of the text.
4
 
5
+ If the query is not related to medicine, politely decline to answer.
6
 
7
+ Base every claim or statement strictly on the sources returned from the tool calls. For each claim, include a citation referencing the source's ID (do not include the citation in the `text` field). A claim may be supported by one or multiple sources, but only cite sources that directly support the claim. Do not add unnecessary citations.
8
 
9
+ If none of the sources contain relevant information to answer the query, politely inform the user that an answer cannot be provided, and do not use any citations.
10
 
11
  Produce JSON matching this specification:
12
 
tools/literature.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time
from xml.etree import ElementTree

import httpx
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)
6
+
7
+
8
@retry(
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=0.5, max=10),
    # Only retry transient HTTP failures (timeouts, transport errors,
    # error statuses) — not JSON/programming errors, which a retry
    # cannot fix. `reraise=True` surfaces the original exception
    # instead of tenacity's RetryError wrapper.
    retry=retry_if_exception_type(httpx.HTTPError),
    reraise=True,
)
def search_semantic_scholar(
    query: str, top_k: int = 20, min_citation_count: int = 20
) -> list[dict]:
    """Search the Semantic Scholar Graph API for papers matching *query*.

    Args:
        query: Free-text search query.
        top_k: Maximum number of papers to request.
        min_citation_count: Papers with fewer citations are filtered
            out server-side.

    Returns:
        The raw paper records from the response's ``data`` field
        (empty list when nothing matched).

    Raises:
        httpx.HTTPError: If the request still fails after 5 attempts.
    """
    resp = httpx.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params={
            "query": query,
            "limit": top_k,
            "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount,influentialCitationCount",
            "fieldsOfStudy": "Medicine,Biology",
            "minCitationCount": min_citation_count,
        },
        timeout=10.0,
    )
    resp.raise_for_status()
    return resp.json().get("data", [])
25
+
26
+
27
@retry(
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=0.5, max=10),
    # Retry only transient HTTP failures; re-raise the original error.
    retry=retry_if_exception_type(httpx.HTTPError),
    reraise=True,
)
def get_pubmed_abstracts(pmids: list[int]) -> dict[str, str]:
    """Fetch abstracts for the given PubMed IDs via NCBI E-utilities.

    XML parsing approach referenced from the `pymed` library.

    Args:
        pmids: PubMed IDs to look up.

    Returns:
        Mapping of PMID (as a string) to its abstract text, with any
        section labels rendered as markdown ``##`` headings. Empty
        dict when ``pmids`` is empty.

    Raises:
        httpx.HTTPError: If the request still fails after 5 attempts.
    """
    if not pmids:
        # efetch errors on an empty id list; skip the round trip.
        return {}
    resp = httpx.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        # efetch documents `id` as a single comma-separated list, not
        # repeated query parameters.
        params={"db": "pubmed", "id": ",".join(map(str, pmids)), "retmode": "xml"},
        timeout=10.0,
    )
    resp.raise_for_status()
    root = ElementTree.fromstring(resp.text)

    abstracts: dict[str, str] = {}
    for article in root.iter("PubmedArticle"):
        pmid = article.findtext(
            ".//PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']", default=""
        )
        sections = []
        for text in article.findall(".//AbstractText"):
            # Structured abstracts carry a Label per section
            # (e.g. BACKGROUND, METHODS); render it as a heading.
            if label := text.attrib.get("Label"):
                sections.append(f"## {label}\n\n")
            sections.append(f"{text.text or ''}\n\n")
        abstracts[pmid] = "".join(sections).strip()

    return abstracts
52
+
53
+
54
def format_publication(publication: dict) -> str:
    """Render one Semantic Scholar paper record as an XML-ish text block.

    Args:
        publication: A paper record from the Semantic Scholar Graph
            API containing title/tldr/abstract/externalIds/url/venue/
            year/citation fields.

    Returns:
        A ``<publication>`` element string with url, summary,
        abstract, venue, year, and citation counts as child tags.
    """
    title = publication["title"]
    # The API returns `"tldr": null` for papers without a generated
    # summary — `.get("tldr", {})` would hand None to `.get("text")`.
    summary = (publication.get("tldr") or {}).get("text") or ""
    # A null abstract would otherwise render as the literal "None".
    abstract = publication["abstract"] or ""
    venue = publication["venue"]
    year = publication["year"]
    citations = publication["citationCount"]
    influential_citations = publication["influentialCitationCount"]
    doi = publication["externalIds"].get("DOI")
    # Prefer a stable DOI link; fall back to the Semantic Scholar URL.
    url = f"https://doi.org/{doi}" if doi else publication["url"]
    return (
        f"<publication title={title}>\n<url>{url}</url>\n"
        f"<summary>{summary}</summary>\n<abstract>{abstract}</abstract>\n"
        f"<venue>{venue}</venue>\n<year>{year}</year>\n"
        f"<citationCount>{citations}</citationCount>\n<influentialCitationCount>{influential_citations}</influentialCitationCount>\n"
        "</publication>"
    )
71
+
72
+
73
def search_medical_literature(query: str) -> str:
    """Get medical literature related to the query.

    Searches Semantic Scholar, then swaps in the richer (sectioned)
    PubMed abstract for any paper that has a PubMed ID.

    Args:
        query: keywords, a topic, or a concept to search
            for medical literature.

    Returns:
        A list of papers and their details, including title,
        abstract, publication venue, citation numbers, etc.,
        wrapped in <publications> tags, or "No literature found"
        when the search returns nothing.
    """
    publications = search_semantic_scholar(query=query, top_k=20)
    # NOTE(review): `externalIds` is assumed to always be a dict when
    # requested in `fields` — guard with `or {}` if the API can null it.
    pmids = [
        publication["externalIds"]["PubMed"]
        for publication in publications
        if publication["externalIds"].get("PubMed")
    ]
    # Avoid a pointless PubMed round trip when no paper has a PMID.
    pubmed_abstracts = get_pubmed_abstracts(pmids) if pmids else {}

    outputs = []
    for publication in publications:
        pmid = publication["externalIds"].get("PubMed")
        # Prefer the labelled PubMed abstract when one was fetched.
        if pubmed_abstract := pubmed_abstracts.get(pmid):
            publication["abstract"] = pubmed_abstract
        outputs.append(format_publication(publication))

    if not outputs:
        return "No literature found"
    # Joined outside the f-string: a backslash inside an f-string
    # expression requires Python 3.12+.
    joined = "\n".join(outputs)
    return f"<publications>\n{joined}\n</publications>"
tools/semantic_scholar.py DELETED
@@ -1,53 +0,0 @@
1
- import time
2
-
3
- import httpx
4
-
5
-
6
- def search_journal_articles(query: str) -> str | list[dict]:
7
- """Get abstracts and summaries of related medical journal aritcles.
8
-
9
- Args:
10
- query: Search query for medical articles.
11
-
12
- Returns:
13
- A list of papers and thier title, summary, published
14
- vanue and year, and the number of citations.
15
- """
16
- max_retries = 5
17
- for attempt in range(max_retries):
18
- try:
19
- resp = httpx.get(
20
- "https://api.semanticscholar.org/graph/v1/paper/search",
21
- params={
22
- "query": query,
23
- "limit": 20,
24
- "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount",
25
- "fieldsOfStudy": "Medicine,Biology",
26
- "minCitationCount": 20,
27
- },
28
- timeout=10.0,
29
- )
30
- resp.raise_for_status()
31
- break
32
- except (httpx.HTTPStatusError, httpx.TimeoutException) as err:
33
- if attempt < max_retries - 1:
34
- time.sleep(1)
35
- else:
36
- raise err
37
- else:
38
- return "No related articles found."
39
-
40
- results = resp.json()
41
- articles = []
42
- for article in results.get("data", []):
43
- article = {
44
- "url": f"https://doi.org/{article['externalIds'].get('DOI')}",
45
- "title": article["title"],
46
- "summary": article["tldr"]["text"] if article["tldr"] else "",
47
- "abstract": article["abstract"],
48
- "venue": article["venue"],
49
- "year": article["year"],
50
- "citations": article["citationCount"],
51
- }
52
- articles.append(article)
53
- return articles
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
uv.lock CHANGED
@@ -197,6 +197,7 @@ dependencies = [
197
  { name = "pydantic" },
198
  { name = "python-fasthtml" },
199
  { name = "streamlit" },
 
200
  ]
201
 
202
  [package.metadata]
@@ -206,6 +207,7 @@ requires-dist = [
206
  { name = "pydantic", specifier = ">=2.11.4" },
207
  { name = "python-fasthtml", specifier = ">=0.12.15" },
208
  { name = "streamlit", specifier = ">=1.45.0" },
 
209
  ]
210
 
211
  [[package]]
 
197
  { name = "pydantic" },
198
  { name = "python-fasthtml" },
199
  { name = "streamlit" },
200
+ { name = "tenacity" },
201
  ]
202
 
203
  [package.metadata]
 
207
  { name = "pydantic", specifier = ">=2.11.4" },
208
  { name = "python-fasthtml", specifier = ">=0.12.15" },
209
  { name = "streamlit", specifier = ">=1.45.0" },
210
+ { name = "tenacity", specifier = ">=9.1.2" },
211
  ]
212
 
213
  [[package]]