David Chu
committed on
feat: add abstract from pubmed

- main.py +2 -2
- pyproject.toml +1 -0
- system_instruction.txt +5 -5
- tools/literature.py +105 -0
- tools/semantic_scholar.py +0 -53
- uv.lock +2 -0
main.py
CHANGED
@@ -7,7 +7,7 @@ import streamlit as st
 from google import genai
 from google.genai import types
 
-from tools import dailymed, semantic_scholar
+from tools import dailymed, literature
 
 SYSTEM_INSTRUCTION = Path("system_instruction.txt").read_text()
 
@@ -17,7 +17,7 @@ def respond(client: genai.Client, query: str) -> str:
         tools=[
             dailymed.find_drug_set_ids,
             dailymed.find_drug_instruction,
-            semantic_scholar.search_journal_articles,
+            literature.search_medical_literature,
         ],
         system_instruction=SYSTEM_INSTRUCTION,
     )
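The diff only shows the tool list inside respond, not the surrounding call. For context, google-genai's automatic function calling accepts plain Python callables in tools: the SDK derives a declaration from each function's signature and docstring, executes whatever calls the model requests, and feeds the results back before producing the final answer. A minimal sketch of how respond is likely wired up — the model name and the exact config construction are assumptions, not shown in this commit:

from pathlib import Path

from google import genai
from google.genai import types

from tools import dailymed, literature

SYSTEM_INSTRUCTION = Path("system_instruction.txt").read_text()


def respond(client: genai.Client, query: str) -> str:
    # Passing callables in `tools` enables automatic function calling.
    response = client.models.generate_content(
        model="gemini-2.0-flash",  # assumed model, not shown in the diff
        contents=query,
        config=types.GenerateContentConfig(
            tools=[
                dailymed.find_drug_set_ids,
                dailymed.find_drug_instruction,
                literature.search_medical_literature,
            ],
            system_instruction=SYSTEM_INSTRUCTION,
        ),
    )
    return response.text

Because the declaration is built from the function's docstring, the detailed docstring on search_medical_literature in tools/literature.py below is exactly what the model sees when deciding whether to call it.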
pyproject.toml
CHANGED
@@ -10,6 +10,7 @@ dependencies = [
     "pydantic>=2.11.4",
     "python-fasthtml>=0.12.15",
     "streamlit>=1.45.0",
+    "tenacity>=9.1.2",
 ]
 
 [tool.ruff.lint]
system_instruction.txt
CHANGED
@@ -1,12 +1,12 @@
-You are a medical research expert.
+You are a medical research expert. Your audience is medical professionals.
 
-…
+Provide a concise answer to medical-related queries in no more than 250 words. Emphasize readability so the reader can grasp the key points quickly. You may use markdown formatting, such as **, to highlight key parts of the text.
 
-…
+If the query is not related to medicine, politely decline to answer.
 
-…
+Base every claim or statement strictly on the sources returned from the tool calls. For each claim, include a citation referencing the source's ID (do not include the citation in the `text` field). A claim may be supported by one or multiple sources, but cite only sources that directly support the claim. Do not add unnecessary citations.
 
-If the …
+If none of the sources contain relevant information to answer the query, politely inform the user that an answer cannot be provided, and do not use any citations.
 
 Produce JSON matching this specification:
 
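The JSON specification itself is truncated in this view. Judging from the instruction's mention of a `text` field and per-claim source IDs, the expected shape is plausibly something like the following pydantic sketch — the class and field names here are hypothetical, not taken from the repo:

from pydantic import BaseModel


class Claim(BaseModel):
    # Hypothetical shape inferred from the instruction text above.
    text: str  # the claim itself, with no inline citation markers
    source_ids: list[str]  # IDs of the sources that directly support it


class Answer(BaseModel):
    claims: list[Claim]

Keeping citations out of the `text` field lets the UI render them separately (e.g. as numbered footnotes) instead of parsing them back out of prose.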
tools/literature.py
ADDED
@@ -0,0 +1,105 @@
+import time
+from xml.etree import ElementTree
+
+import httpx
+from tenacity import retry, stop_after_attempt, wait_random_exponential
+
+
+@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
+def search_semantic_scholar(
+    query: str, top_k: int = 20, min_citation_count: int = 20
+) -> list[dict]:
+    resp = httpx.get(
+        "https://api.semanticscholar.org/graph/v1/paper/search",
+        params={
+            "query": query,
+            "limit": top_k,
+            "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount,influentialCitationCount",
+            "fieldsOfStudy": "Medicine,Biology",
+            "minCitationCount": min_citation_count,
+        },
+        timeout=10.0,
+    )
+    resp.raise_for_status()
+    return resp.json().get("data", [])
+
+
+@retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
+def get_pubmed_abstracts(pmids: list[str]) -> dict[str, str]:
+    """
+    XML parsing adapted from the `pymed` library.
+    """
+    resp = httpx.get(
+        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
+        params={"db": "pubmed", "id": pmids, "retmode": "xml"},
+    )
+    resp.raise_for_status()
+    root = ElementTree.fromstring(resp.text)
+
+    abstracts = {}
+    for article in root.iter("PubmedArticle"):
+        abstract = ""
+        pmid = article.findtext(
+            ".//PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']", default=""
+        )
+        for text in article.findall(".//AbstractText"):
+            if label := text.attrib.get("Label"):
+                abstract += f"## {label}\n\n"
+            abstract += f"{text.text or ''}\n\n"
+        abstracts[pmid] = abstract.strip()
+
+    return abstracts
+
+
+def format_publication(publication: dict) -> str:
+    title = publication["title"]
+    summary = (publication.get("tldr") or {}).get("text", "")
+    abstract = publication["abstract"] or ""
+    venue = publication["venue"]
+    year = publication["year"]
+    citations = publication["citationCount"]
+    influential_citations = publication["influentialCitationCount"]
+    doi = publication["externalIds"].get("DOI")
+    url = f"https://doi.org/{doi}" if doi else publication["url"]
+    return (
+        f"<publication title={title}>\n<url>{url}</url>\n"
+        f"<summary>{summary}</summary>\n<abstract>{abstract}</abstract>\n"
+        f"<venue>{venue}</venue>\n<year>{year}</year>\n"
+        f"<citationCount>{citations}</citationCount>\n<influentialCitationCount>{influential_citations}</influentialCitationCount>\n"
+        "</publication>"
+    )
+
+
+def search_medical_literature(query: str) -> str:
+    """Get medical literature related to the query.
+
+    Args:
+        query: keywords, a topic, or a concept to search
+            for medical literature.
+
+    Returns:
+        A list of papers and their details, including title,
+        abstract, publication venue, citation numbers, etc.
+    """
+    publications = search_semantic_scholar(query=query, top_k=20)
+    pmids = [
+        publication["externalIds"]["PubMed"]
+        for publication in publications
+        if publication["externalIds"].get("PubMed")
+    ]
+    pubmed_abstracts = get_pubmed_abstracts(pmids)
+
+    outputs = []
+    for publication in publications:
+        if pubmed_abstract := pubmed_abstracts.get(
+            publication["externalIds"].get("PubMed")
+        ):
+            publication["abstract"] = pubmed_abstract
+
+        outputs.append(format_publication(publication))
+
+    return (
+        f"<publications>\n{'\n'.join(outputs)}\n</publications>"
+        if outputs
+        else "No literature found"
+    )
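To make the parsing in get_pubmed_abstracts concrete: efetch returns a PubmedArticleSet, each PubmedArticle carries its PMID under PubmedData/ArticleIdList, and structured abstracts arrive as several AbstractText elements with Label attributes, which the loop turns into markdown-style headed sections. A self-contained sketch against a trimmed, hand-written record (the XML below is illustrative, not a real PubMed entry):

from xml.etree import ElementTree

SAMPLE_XML = """
<PubmedArticleSet>
  <PubmedArticle>
    <MedlineCitation>
      <Article>
        <Abstract>
          <AbstractText Label="BACKGROUND">Statins lower LDL cholesterol.</AbstractText>
          <AbstractText Label="CONCLUSIONS">Benefits outweighed harms.</AbstractText>
        </Abstract>
      </Article>
    </MedlineCitation>
    <PubmedData>
      <ArticleIdList>
        <ArticleId IdType="pubmed">12345678</ArticleId>
      </ArticleIdList>
    </PubmedData>
  </PubmedArticle>
</PubmedArticleSet>
"""

root = ElementTree.fromstring(SAMPLE_XML)
for article in root.iter("PubmedArticle"):
    # Same lookups as get_pubmed_abstracts above.
    pmid = article.findtext(
        ".//PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']", default=""
    )
    abstract = ""
    for text in article.findall(".//AbstractText"):
        if label := text.attrib.get("Label"):
            abstract += f"## {label}\n\n"
        abstract += f"{text.text or ''}\n\n"
    print(pmid)               # -> 12345678
    print(abstract.strip())   # -> "## BACKGROUND" / "## CONCLUSIONS" sections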
tools/semantic_scholar.py
DELETED
@@ -1,53 +0,0 @@
-import time
-
-import httpx
-
-
-def search_journal_articles(query: str) -> str | list[dict]:
-    """Get abstracts and summaries of related medical journal articles.
-
-    Args:
-        query: Search query for medical articles.
-
-    Returns:
-        A list of papers and their title, summary, published
-        venue and year, and the number of citations.
-    """
-    max_retries = 5
-    for attempt in range(max_retries):
-        try:
-            resp = httpx.get(
-                "https://api.semanticscholar.org/graph/v1/paper/search",
-                params={
-                    "query": query,
-                    "limit": 20,
-                    "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount",
-                    "fieldsOfStudy": "Medicine,Biology",
-                    "minCitationCount": 20,
-                },
-                timeout=10.0,
-            )
-            resp.raise_for_status()
-            break
-        except (httpx.HTTPStatusError, httpx.TimeoutException) as err:
-            if attempt < max_retries - 1:
-                time.sleep(1)
-            else:
-                raise err
-    else:
-        return "No related articles found."
-
-    results = resp.json()
-    articles = []
-    for article in results.get("data", []):
-        article = {
-            "url": f"https://doi.org/{article['externalIds'].get('DOI')}",
-            "title": article["title"],
-            "summary": article["tldr"]["text"] if article["tldr"] else "",
-            "abstract": article["abstract"],
-            "venue": article["venue"],
-            "year": article["year"],
-            "citations": article["citationCount"],
-        }
-        articles.append(article)
-    return articles
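One behavioral difference worth noting: the deleted loop retried only on httpx.HTTPStatusError and httpx.TimeoutException, with a fixed one-second sleep, while the @retry decorator in tools/literature.py retries on any exception, with jittered exponential backoff. If the narrower retry condition is wanted, tenacity can express it too; a sketch of that policy, not part of this commit:

import httpx
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)


# Sketch only: reproduces the deleted loop's selective retry behavior
# with tenacity instead of a manual for/try/except.
@retry(
    retry=retry_if_exception_type((httpx.HTTPStatusError, httpx.TimeoutException)),
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=0.5, max=10),
    reraise=True,  # surface the original exception after the final attempt
)
def fetch(url: str) -> httpx.Response:
    resp = httpx.get(url, timeout=10.0)
    resp.raise_for_status()
    return resp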
uv.lock
CHANGED
@@ -197,6 +197,7 @@ dependencies = [
     { name = "pydantic" },
     { name = "python-fasthtml" },
     { name = "streamlit" },
+    { name = "tenacity" },
 ]
 
 [package.metadata]
@@ -206,6 +207,7 @@ requires-dist = [
     { name = "pydantic", specifier = ">=2.11.4" },
     { name = "python-fasthtml", specifier = ">=0.12.15" },
     { name = "streamlit", specifier = ">=1.45.0" },
+    { name = "tenacity", specifier = ">=9.1.2" },
 ]
 
 [[package]]