David Chu
committed on
feat: tool to search dailymed
Browse files- main.py +65 -152
- tools/__init__.py +0 -0
- tools/dailymed.py +44 -0
- tools/pubmed.py +24 -0
- tools/semantic_scholar.py +53 -0
main.py
CHANGED
@@ -1,151 +1,90 @@
|
|
|
|
1 |
import os
|
2 |
-
import
|
3 |
|
4 |
-
import httpx
|
5 |
import streamlit as st
|
6 |
from google import genai
|
7 |
-
from
|
8 |
|
|
|
9 |
|
10 |
-
|
11 |
-
id: str
|
12 |
-
title: str
|
13 |
-
summary: str | None
|
14 |
-
abstract: str | None
|
15 |
-
venue: str
|
16 |
-
year: int
|
17 |
-
citations: int | None
|
18 |
-
|
19 |
-
|
20 |
-
class Statement(BaseModel):
|
21 |
-
text: str
|
22 |
-
source_ids: list[str] | None
|
23 |
-
|
24 |
-
|
25 |
-
def improve_prompt(client: genai.Client, prompt: str) -> str:
|
26 |
-
response = client.models.generate_content(
|
27 |
-
model="gemini-2.0-flash-lite",
|
28 |
-
contents=f"Rewrite the following medical question to maximize clarity and specificity for optimal search results.\n\n<query>{prompt}</query>\n\nReturn only the improved query.",
|
29 |
-
)
|
30 |
-
return response.text or ""
|
31 |
-
|
32 |
-
|
33 |
-
def format_sources(articles: list[Article]) -> str:
|
34 |
-
sources = []
|
35 |
-
for article in articles:
|
36 |
-
source = f"<source id={article.id}><title>{article.title}</title>"
|
37 |
-
if article.abstract:
|
38 |
-
source += f"<abstract>{article.abstract}</abstract>"
|
39 |
-
if article.summary:
|
40 |
-
source += f"<summary>{article.summary}</summary>"
|
41 |
-
source += "</source>"
|
42 |
-
sources.append(source)
|
43 |
-
return "\n".join(sources)
|
44 |
-
|
45 |
-
|
46 |
-
ANSWER_INSTRUCTION = """\
|
47 |
You are a medical research expert.
|
48 |
|
49 |
-
Please answer the user's query clearly and concisely, using no more than
|
50 |
|
51 |
Base every claim or statement strictly on the provided sources. For each claim, include a citation referencing the source's ID (do not include the citation in the `text` field). A claim may be supported by one or multiple sources, but only cite sources that directly support the claim. Do not add unnecessary citations.
|
52 |
|
|
|
|
|
53 |
If none of the sources contain relevant information to answer the query, politely inform the user that an answer cannot be provided, and do not use any citations.
|
54 |
|
55 |
If the query is not related to medicine, politely decline to answer.
|
56 |
|
57 |
<query>{query}</query>
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
62 |
"""
|
63 |
|
64 |
|
65 |
-
def
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
68 |
response = client.models.generate_content(
|
69 |
model="gemini-2.5-flash-preview-04-17",
|
70 |
-
contents=
|
71 |
-
|
72 |
-
),
|
73 |
-
config={
|
74 |
-
"response_mime_type": "application/json",
|
75 |
-
"response_schema": list[Statement],
|
76 |
-
},
|
77 |
)
|
78 |
-
return response.
|
79 |
-
|
80 |
-
|
81 |
-
def
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
summary=article["tldr"]["text"] if article["tldr"] else "",
|
115 |
-
abstract=article["abstract"],
|
116 |
-
venue=article["venue"],
|
117 |
-
year=article["year"],
|
118 |
-
citations=article["citationCount"],
|
119 |
-
)
|
120 |
-
articles.append(article)
|
121 |
-
return articles
|
122 |
-
|
123 |
-
|
124 |
-
def pubmed(query: str, top_k: int = 10, db: str = "pubmed"):
|
125 |
-
resp = httpx.get(
|
126 |
-
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
|
127 |
-
params={
|
128 |
-
"term": query,
|
129 |
-
"db": db,
|
130 |
-
"retmax": top_k,
|
131 |
-
"retmode": "json",
|
132 |
-
},
|
133 |
-
)
|
134 |
-
id_list = resp.json()["esearchresult"]["idlist"]
|
135 |
-
|
136 |
-
resp = httpx.get(
|
137 |
-
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
|
138 |
-
params={
|
139 |
-
"db": db,
|
140 |
-
"id": ",".join(id_list),
|
141 |
-
"retmode": "json",
|
142 |
-
},
|
143 |
-
)
|
144 |
-
return resp.json()
|
145 |
|
146 |
|
147 |
def main():
|
148 |
-
semantic_scholar_client = httpx.Client(transport=httpx.HTTPTransport(retries=3))
|
149 |
gemini_client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
|
150 |
|
151 |
st.title("Ask ~~Jeeves~~ Elna")
|
@@ -155,36 +94,10 @@ def main():
|
|
155 |
response = st.empty()
|
156 |
|
157 |
if submit:
|
158 |
-
with st.spinner("
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
with st.spinner("Thinking...", show_time=True):
|
163 |
-
paper_map = {paper.id: paper for paper in papers}
|
164 |
-
sentences = []
|
165 |
-
citations = {}
|
166 |
-
statements = generate_answer(gemini_client, query, papers)
|
167 |
-
for statement in statements:
|
168 |
-
sentence = statement.text
|
169 |
-
for source_id in statement.source_ids or []:
|
170 |
-
if not (citation_id := citations.get(source_id)):
|
171 |
-
citation_id = len(citations) + 1
|
172 |
-
citations[source_id] = citation_id
|
173 |
-
sentence += f"[^{citation_id}] "
|
174 |
-
sentences.append(sentence.strip())
|
175 |
-
answer = " ".join(sentences)
|
176 |
-
footnotes = ""
|
177 |
-
if citations:
|
178 |
-
footnotes = "\n".join(
|
179 |
-
f"[^{v}]: :grey-badge[:material/attribution: {paper_map[k].citations}] [{paper_map[k].title}](https://doi.org/{paper_map[k].id}). _{paper_map[k].venue}_ {paper_map[k].year}."
|
180 |
-
for k, v in citations.items()
|
181 |
-
)
|
182 |
-
|
183 |
-
result = f"{answer}\n\n{footnotes}"
|
184 |
-
else:
|
185 |
-
result = "No relevant papers found."
|
186 |
-
|
187 |
-
response.markdown(result)
|
188 |
|
189 |
|
190 |
if __name__ == "__main__":
|
|
|
1 |
+
import json
|
2 |
import os
|
3 |
+
import re
|
4 |
|
|
|
5 |
import streamlit as st
|
6 |
from google import genai
|
7 |
+
from google.genai import types
|
8 |
|
9 |
+
from tools import dailymed, semantic_scholar
|
10 |
|
11 |
+
INSTRUCTION = """\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
You are a medical research expert.
|
13 |
|
14 |
+
Please answer the user's query clearly and concisely, using no more than 250 words.
|
15 |
|
16 |
Base every claim or statement strictly on the provided sources. For each claim, include a citation referencing the source's ID (do not include the citation in the `text` field). A claim may be supported by one or multiple sources, but only cite sources that directly support the claim. Do not add unnecessary citations.
|
17 |
|
18 |
+
You can use markdown format, such as **, to highlight the key part of the answer. But do not return the response in a markdown code block.
|
19 |
+
|
20 |
If none of the sources contain relevant information to answer the query, politely inform the user that an answer cannot be provided, and do not use any citations.
|
21 |
|
22 |
If the query is not related to medicine, politely decline to answer.
|
23 |
|
24 |
<query>{query}</query>
|
25 |
|
26 |
+
Produce JSON matching this specification:
|
27 |
+
|
28 |
+
Source = {{ "title": string, "url": str }}
|
29 |
+
Statement = {{ "text": string, "sources": array<Source> }}
|
30 |
+
Return: array<Statement>
|
31 |
+
|
32 |
+
Do not return the response in a markdown code block.
|
33 |
"""
|
34 |
|
35 |
|
36 |
+
def respond(client: genai.Client, query: str) -> str:
    """Ask the Gemini model to answer a query, with the lookup tools attached.

    Args:
        client: Gemini client used to issue the request.
        query: The user's question; substituted into ``INSTRUCTION``.

    Returns:
        The text of the model's response, or an empty string when the
        response carries no text.
    """
    # Functions made available to the model as tools for this request.
    available_tools = [
        dailymed.find_drug_set_ids,
        dailymed.find_drug_instruction,
        semantic_scholar.search_journal_articles,
    ]
    result = client.models.generate_content(
        model="gemini-2.5-flash-preview-04-17",
        contents=INSTRUCTION.format(query=query),
        config=types.GenerateContentConfig(tools=available_tools),
    )
    return result.text or ""
|
50 |
+
|
51 |
+
|
52 |
+
def format_output(response: str) -> tuple[str, str]:
    """Turn the model's JSON reply into markdown text plus footnotes.

    The reply is expected to be a JSON array of statements, each shaped like
    ``{"text": str, "sources": [{"title": str, "url": str}, ...]}``. A
    surrounding markdown code fence is stripped before parsing.

    Args:
        response: Raw text returned by the model.

    Returns:
        A tuple ``(answer, footnotes)``. ``answer`` is every statement text
        followed by its ``[^n]`` footnote markers and a single space.
        ``footnotes`` has one ``[^n]: [title](url)`` line per distinct source,
        numbered in order of first appearance. When the reply is not valid
        JSON, or is valid JSON of an unexpected shape, the (unfenced) reply is
        returned unchanged together with an empty footnote string.
    """
    response = response.strip()

    if response.startswith("```"):
        # Extract content inside the first markdown code block (``` or ```json)
        match = re.match(r"^```(?:json)?\s*([\s\S]*?)\s*```", response)
        if match:
            response = match.group(1).strip()

    try:
        statements = json.loads(response)
    except json.JSONDecodeError as err:
        print(err)
        return response, ""

    # Valid JSON that is not an array (an object, a bare string, a number)
    # cannot be walked as a list of statements; show the raw reply instead.
    if not isinstance(statements, list):
        print(f"Expected a JSON array of statements, got {type(statements).__name__}")
        return response, ""

    parts: list[str] = []
    citations: dict[str, int] = {}
    try:
        for statement in statements:
            parts.append(statement["text"])
            # "sources" may be missing or explicitly null.
            for source in statement.get("sources") or []:
                source_str = f"[{source['title']}]({source['url']})"
                # Reuse the number of a source that was already cited.
                citation_id = citations.setdefault(source_str, len(citations) + 1)
                parts.append(f"[^{citation_id}]")
            parts.append(" ")
        answer = "".join(parts)
    except (KeyError, TypeError, AttributeError) as err:
        # Missing keys or wrongly typed items: fall back to the raw reply
        # rather than crashing the page.
        print(err)
        return response, ""

    footnotes = "\n".join(
        f"[^{number}]: {citation}" for citation, number in citations.items()
    )
    return answer, footnotes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
|
87 |
def main():
|
|
|
88 |
gemini_client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
|
89 |
|
90 |
st.title("Ask ~~Jeeves~~ Elna")
|
|
|
94 |
response = st.empty()
|
95 |
|
96 |
if submit:
|
97 |
+
with st.spinner("Thinking...", show_time=True):
|
98 |
+
output = respond(gemini_client, query)
|
99 |
+
answer, footnotes = format_output(output)
|
100 |
+
response.markdown(f"{answer}\n\n{footnotes}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
|
103 |
if __name__ == "__main__":
|
tools/__init__.py
ADDED
File without changes
|
tools/dailymed.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import httpx
|
2 |
+
|
3 |
+
|
4 |
+
def find_drug_set_ids(name: str) -> list[dict]:
    """Get the Set IDs of drugs by a name.

    The Set ID can be used to look up a drug's instruction.

    Args:
        name: Generic or brand name of a drug.

    Returns:
        A list of drug names with their Set ID and DailyMed URL.

    Raises:
        httpx.HTTPStatusError: If DailyMed answers with a 4xx/5xx status.
    """
    resp = httpx.get(
        "https://dailymed.nlm.nih.gov/dailymed/services/v2/spls.json",
        params={"drug_name": name},
    )
    # Surface HTTP failures explicitly instead of failing later on a missing
    # key or an unparsable body.
    resp.raise_for_status()
    return [
        {
            "name": row["title"],
            "set_id": row["setid"],
            "url": f"https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid={row['setid']}",
        }
        for row in resp.json().get("data", [])
    ]
|
27 |
+
|
28 |
+
|
29 |
+
def find_drug_instruction(set_id: str) -> str:
    """Get the instruction of a drug from the FDA database.

    The instruction includes dosage, contraindications, adverse
    reactions, drug interactions, etc.

    Args:
        set_id: Set ID of the drug to look up.

    Returns:
        Full package instruction in XML format.

    Raises:
        httpx.HTTPStatusError: If DailyMed answers with a 4xx/5xx status.
    """
    resp = httpx.get(
        f"https://dailymed.nlm.nih.gov/dailymed/services/v2/spls/{set_id}.xml"
    )
    # Without this check an error page would be returned as if it were the
    # drug's instruction.
    resp.raise_for_status()
    return resp.text
|
tools/pubmed.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import httpx
|
2 |
+
|
3 |
+
|
4 |
+
def search(query: str, top_k: int = 10, db: str = "pubmed"):
    """Search an NCBI Entrez database and fetch summaries of the hits.

    Two requests are made: ``esearch`` to obtain the matching record IDs,
    then ``esummary`` to fetch the summaries for those IDs.

    Args:
        query: Search term sent to ``esearch``.
        top_k: Maximum number of IDs to request (``retmax``).
        db: Entrez database name used for both requests.

    Returns:
        The parsed JSON body of the ``esummary`` response.

    Raises:
        httpx.HTTPStatusError: If either request answers with a 4xx/5xx status.
    """
    resp = httpx.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={
            "term": query,
            "db": db,
            "retmax": top_k,
            "retmode": "json",
        },
    )
    # Fail here with a clear HTTP error rather than on the key lookup below.
    resp.raise_for_status()
    id_list = resp.json()["esearchresult"]["idlist"]

    # NOTE(review): when the search has no hits, "id" is sent as an empty
    # string - confirm how esummary answers that and whether callers rely on it.
    resp = httpx.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
        params={
            "db": db,
            "id": ",".join(id_list),
            "retmode": "json",
        },
    )
    resp.raise_for_status()
    return resp.json()
|
tools/semantic_scholar.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
+
import httpx
|
4 |
+
|
5 |
+
|
6 |
+
def search_journal_articles(query: str) -> str | list[dict]:
    """Get abstracts and summaries of related medical journal articles.

    Args:
        query: Search query for medical articles.

    Returns:
        A list of papers and their URL, title, summary, abstract, published
        venue and year, and the number of citations.

    Raises:
        httpx.HTTPStatusError: If the last retry still gets a 4xx/5xx status.
        httpx.TimeoutException: If the last retry still times out.
    """
    max_retries = 5
    for attempt in range(max_retries):
        try:
            resp = httpx.get(
                "https://api.semanticscholar.org/graph/v1/paper/search",
                params={
                    "query": query,
                    "limit": 20,
                    "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount",
                    "fieldsOfStudy": "Medicine,Biology",
                    "minCitationCount": 20,
                },
                timeout=10.0,
            )
            resp.raise_for_status()
            break
        except (httpx.HTTPStatusError, httpx.TimeoutException):
            # Out of attempts: re-raise the last failure with its traceback.
            if attempt == max_retries - 1:
                raise
            time.sleep(1)
    else:
        # Only reached if the loop never ran (max_retries of 0); keeps `resp`
        # from being used unbound below.
        return "No related articles found."

    articles = []
    for paper in resp.json().get("data", []):
        # externalIds may be null, and not every paper has a DOI.
        doi = (paper.get("externalIds") or {}).get("DOI")
        tldr = paper.get("tldr")
        articles.append(
            {
                # Fall back to the paper's own `url` field instead of
                # producing "https://doi.org/None".
                "url": f"https://doi.org/{doi}" if doi else paper.get("url"),
                "title": paper.get("title"),
                "summary": tldr["text"] if tldr else "",
                "abstract": paper.get("abstract"),
                "venue": paper.get("venue"),
                "year": paper.get("year"),
                "citations": paper.get("citationCount"),
            }
        )
    return articles
|