David Chu committed on
Commit
5bb1986
·
unverified ·
1 Parent(s): a874450

feat: tool to search dailymed

Browse files
Files changed (5) hide show
  1. main.py +65 -152
  2. tools/__init__.py +0 -0
  3. tools/dailymed.py +44 -0
  4. tools/pubmed.py +24 -0
  5. tools/semantic_scholar.py +53 -0
main.py CHANGED
@@ -1,151 +1,90 @@
 
1
  import os
2
- import time
3
 
4
- import httpx
5
  import streamlit as st
6
  from google import genai
7
- from pydantic import BaseModel
8
 
 
9
 
10
- class Article(BaseModel):
11
- id: str
12
- title: str
13
- summary: str | None
14
- abstract: str | None
15
- venue: str
16
- year: int
17
- citations: int | None
18
-
19
-
20
- class Statement(BaseModel):
21
- text: str
22
- source_ids: list[str] | None
23
-
24
-
25
- def improve_prompt(client: genai.Client, prompt: str) -> str:
26
- response = client.models.generate_content(
27
- model="gemini-2.0-flash-lite",
28
- contents=f"Rewrite the following medical question to maximize clarity and specificity for optimal search results.\n\n<query>{prompt}</query>\n\nReturn only the improved query.",
29
- )
30
- return response.text or ""
31
-
32
-
33
- def format_sources(articles: list[Article]) -> str:
34
- sources = []
35
- for article in articles:
36
- source = f"<source id={article.id}><title>{article.title}</title>"
37
- if article.abstract:
38
- source += f"<abstract>{article.abstract}</abstract>"
39
- if article.summary:
40
- source += f"<summary>{article.summary}</summary>"
41
- source += "</source>"
42
- sources.append(source)
43
- return "\n".join(sources)
44
-
45
-
46
- ANSWER_INSTRUCTION = """\
47
  You are a medical research expert.
48
 
49
- Please answer the user's query clearly and concisely, using no more than 100 words.
50
 
51
  Base every claim or statement strictly on the provided sources. For each claim, include a citation referencing the source's ID (do not include the citation in the `text` field). A claim may be supported by one or multiple sources, but only cite sources that directly support the claim. Do not add unnecessary citations.
52
 
 
 
53
  If none of the sources contain relevant information to answer the query, politely inform the user that an answer cannot be provided, and do not use any citations.
54
 
55
  If the query is not related to medicine, politely decline to answer.
56
 
57
  <query>{query}</query>
58
 
59
- <sources>
60
- {sources}
61
- </sources>
 
 
 
 
62
  """
63
 
64
 
65
- def generate_answer(
66
- client: genai.Client, query: str, articles: list[Article]
67
- ) -> list[Statement]:
 
 
 
 
 
68
  response = client.models.generate_content(
69
  model="gemini-2.5-flash-preview-04-17",
70
- contents=ANSWER_INSTRUCTION.format(
71
- query=query, sources=format_sources(articles)
72
- ),
73
- config={
74
- "response_mime_type": "application/json",
75
- "response_schema": list[Statement],
76
- },
77
  )
78
- return response.parsed
79
-
80
-
81
- def semantic_scholar(
82
- client: httpx.Client, query: str, top_k: int = 10
83
- ) -> list[Article]:
84
- max_retries = 5
85
- for attempt in range(max_retries):
86
- try:
87
- resp = client.get(
88
- "https://api.semanticscholar.org/graph/v1/paper/search",
89
- params={
90
- "query": query,
91
- "limit": top_k,
92
- "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount",
93
- "fieldsOfStudy": "Medicine,Biology",
94
- "minCitationCount": 20,
95
- },
96
- timeout=10.0,
97
- )
98
- resp.raise_for_status()
99
- break
100
- except (httpx.HTTPStatusError, httpx.TimeoutException) as err:
101
- if attempt < max_retries - 1:
102
- time.sleep(1)
103
- else:
104
- raise err
105
- else:
106
- raise RuntimeError()
107
-
108
- results = resp.json()
109
- articles = []
110
- for i, article in enumerate(results.get("data", []), 1):
111
- article = Article(
112
- id=article["externalIds"].get("DOI", str(i)),
113
- title=article["title"],
114
- summary=article["tldr"]["text"] if article["tldr"] else "",
115
- abstract=article["abstract"],
116
- venue=article["venue"],
117
- year=article["year"],
118
- citations=article["citationCount"],
119
- )
120
- articles.append(article)
121
- return articles
122
-
123
-
124
- def pubmed(query: str, top_k: int = 10, db: str = "pubmed"):
125
- resp = httpx.get(
126
- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
127
- params={
128
- "term": query,
129
- "db": db,
130
- "retmax": top_k,
131
- "retmode": "json",
132
- },
133
- )
134
- id_list = resp.json()["esearchresult"]["idlist"]
135
-
136
- resp = httpx.get(
137
- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
138
- params={
139
- "db": db,
140
- "id": ",".join(id_list),
141
- "retmode": "json",
142
- },
143
- )
144
- return resp.json()
145
 
146
 
147
  def main():
148
- semantic_scholar_client = httpx.Client(transport=httpx.HTTPTransport(retries=3))
149
  gemini_client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
150
 
151
  st.title("Ask ~~Jeeves~~ Elna")
@@ -155,36 +94,10 @@ def main():
155
  response = st.empty()
156
 
157
  if submit:
158
- with st.spinner("Finding papers...", show_time=True):
159
- papers = semantic_scholar(semantic_scholar_client, query, top_k=30)
160
-
161
- if papers:
162
- with st.spinner("Thinking...", show_time=True):
163
- paper_map = {paper.id: paper for paper in papers}
164
- sentences = []
165
- citations = {}
166
- statements = generate_answer(gemini_client, query, papers)
167
- for statement in statements:
168
- sentence = statement.text
169
- for source_id in statement.source_ids or []:
170
- if not (citation_id := citations.get(source_id)):
171
- citation_id = len(citations) + 1
172
- citations[source_id] = citation_id
173
- sentence += f"[^{citation_id}] "
174
- sentences.append(sentence.strip())
175
- answer = " ".join(sentences)
176
- footnotes = ""
177
- if citations:
178
- footnotes = "\n".join(
179
- f"[^{v}]: :grey-badge[:material/attribution: {paper_map[k].citations}] [{paper_map[k].title}](https://doi.org/{paper_map[k].id}). _{paper_map[k].venue}_ {paper_map[k].year}."
180
- for k, v in citations.items()
181
- )
182
-
183
- result = f"{answer}\n\n{footnotes}"
184
- else:
185
- result = "No relevant papers found."
186
-
187
- response.markdown(result)
188
 
189
 
190
  if __name__ == "__main__":
 
1
+ import json
2
  import os
3
+ import re
4
 
 
5
  import streamlit as st
6
  from google import genai
7
+ from google.genai import types
8
 
9
+ from tools import dailymed, semantic_scholar
10
 
11
# Prompt template sent to the model. `{query}` is the only placeholder filled
# by `str.format`; the doubled braces in the JSON specification are escapes
# that render as literal `{` / `}`.
INSTRUCTION = """\
You are a medical research expert.

Please answer the user's query clearly and concisely, using no more than 250 words.

Base every claim or statement strictly on the provided sources. For each claim, include a citation referencing the source's ID (do not include the citation in the `text` field). A claim may be supported by one or multiple sources, but only cite sources that directly support the claim. Do not add unnecessary citations.

You can use markdown format, such as **, to highlight the key part of the answer. But do not return the response in a markdown code block.

If none of the sources contain relevant information to answer the query, politely inform the user that an answer cannot be provided, and do not use any citations.

If the query is not related to medicine, politely decline to answer.

<query>{query}</query>

Produce JSON matching this specification:

Source = {{ "title": string, "url": string }}
Statement = {{ "text": string, "sources": array<Source> }}
Return: array<Statement>

Do not return the response in a markdown code block.
"""
34
 
35
 
36
def respond(client: genai.Client, query: str) -> str:
    """Ask Gemini to answer ``query`` with the search tools available to it.

    Args:
        client: Gemini client used to issue the request.
        query: The user's question; substituted into ``INSTRUCTION``.

    Returns:
        The text of the model's response, or an empty string when the
        response carries no text.
    """
    # Plain Python callables handed to the model as tools.
    search_tools = [
        dailymed.find_drug_set_ids,
        dailymed.find_drug_instruction,
        semantic_scholar.search_journal_articles,
    ]
    prompt = INSTRUCTION.format(query=query)
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-04-17",
        contents=prompt,
        config=types.GenerateContentConfig(tools=search_tools),
    )
    return response.text or ""
50
+
51
+
52
def format_output(response: str) -> tuple[str, str]:
    """Turn the model's JSON reply into markdown text plus footnotes.

    The reply is expected to be a JSON array of statements, each shaped like
    ``{"text": str, "sources": [{"title": str, "url": str}, ...]}``. Every
    distinct source gets a footnote number in order of first appearance, and
    each statement is suffixed with the ``[^n]`` markers of its sources.

    Args:
        response: Raw text returned by the model, optionally wrapped in a
            markdown code fence (``` or ```json).

    Returns:
        A ``(answer, footnotes)`` tuple. If the reply is not valid JSON or
        does not have the expected shape, the (unfenced) reply is returned
        unchanged as the answer with empty footnotes.
    """
    response = response.strip()

    if response.startswith("```"):
        # Extract content inside the first markdown code block (``` or ```json)
        match = re.match(r"^```(?:json)?\s*([\s\S]*?)\s*```", response)
        if match:
            response = match.group(1).strip()

    try:
        statements = json.loads(response)
    except json.JSONDecodeError as err:
        print(err)
        return response, ""

    # Valid JSON is not necessarily an array (the model may return a bare
    # object or string); fall back to the raw text instead of crashing.
    if not isinstance(statements, list):
        print(f"Expected a JSON array, got {type(statements).__name__}")
        return response, ""

    citations: dict[str, int] = {}
    try:
        sentences = []
        for statement in statements:
            sentence = statement["text"]
            # `sources` may be missing or null; both mean "no citations".
            for source in statement.get("sources") or []:
                source_str = f"[{source['title']}]({source['url']})"
                # Reuse the number of an already-cited source.
                citation_id = citations.setdefault(source_str, len(citations) + 1)
                sentence += f"[^{citation_id}]"
            sentences.append(sentence)
        answer = " ".join(sentences)
    except (KeyError, TypeError, AttributeError) as err:
        # Malformed statement/source entries: show the raw reply instead.
        print(err)
        return response, ""

    footnotes = "\n".join(
        f"[^{number}]: {citation}" for citation, number in citations.items()
    )
    return answer, footnotes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
 
87
  def main():
 
88
  gemini_client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
89
 
90
  st.title("Ask ~~Jeeves~~ Elna")
 
94
  response = st.empty()
95
 
96
  if submit:
97
+ with st.spinner("Thinking...", show_time=True):
98
+ output = respond(gemini_client, query)
99
+ answer, footnotes = format_output(output)
100
+ response.markdown(f"{answer}\n\n{footnotes}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
 
103
  if __name__ == "__main__":
tools/__init__.py ADDED
File without changes
tools/dailymed.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+
3
+
4
def find_drug_set_ids(name: str) -> list[dict]:
    """Get the Set IDs of drugs by a name.

    The Set ID can be used to look up a drug's instruction.

    Args:
        name: Generic or brand name of a drug.

    Returns:
        A list of drug names and their Set ID. The list is empty when
        no drug matches the name.

    Raises:
        httpx.HTTPStatusError: If DailyMed answers with an error status.
    """
    resp = httpx.get(
        "https://dailymed.nlm.nih.gov/dailymed/services/v2/spls.json",
        params={"drug_name": name},
    )
    # Fail loudly on 4xx/5xx rather than trying to parse an error body.
    resp.raise_for_status()
    return [
        {
            "name": row["title"],
            "set_id": row["setid"],
            "url": f"https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid={row['setid']}",
        }
        # A missing or null "data" field is treated as "no matches".
        for row in resp.json().get("data") or []
    ]
27
+
28
+
29
def find_drug_instruction(set_id: str) -> str:
    """Get the instruction of a drug from the FDA database.

    The instruction includes dosage, contraindications, adverse
    reactions, drug interactions, etc.

    Args:
        set_id: Set ID of the drug to look up.

    Returns:
        Full package instruction in XML format.

    Raises:
        httpx.HTTPStatusError: If DailyMed answers with an error status,
            e.g. when the Set ID does not exist.
    """
    from urllib.parse import quote

    # The Set ID is supplied by the caller (here, the model), so escape it
    # before placing it in the URL path.
    resp = httpx.get(
        f"https://dailymed.nlm.nih.gov/dailymed/services/v2/spls/{quote(set_id, safe='')}.xml"
    )
    # Without this, an error page would be returned as if it were the label.
    resp.raise_for_status()
    return resp.text
tools/pubmed.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+
3
+
4
def search(query: str, top_k: int = 10, db: str = "pubmed") -> dict:
    """Search an NCBI Entrez database and fetch summaries of the hits.

    Runs an ``esearch`` request to get the matching record IDs, then an
    ``esummary`` request for those IDs.

    Args:
        query: Search term passed to ``esearch``.
        top_k: Maximum number of records to retrieve (``retmax``).
        db: Entrez database name.

    Returns:
        The decoded JSON body of the ``esummary`` response, or an empty
        dict when the search matches no records.

    Raises:
        httpx.HTTPStatusError: If either request returns an error status.
    """
    resp = httpx.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={
            "term": query,
            "db": db,
            "retmax": top_k,
            "retmode": "json",
        },
    )
    resp.raise_for_status()
    id_list = resp.json()["esearchresult"]["idlist"]

    # Nothing matched: skip the summary request instead of sending an
    # empty `id` parameter.
    if not id_list:
        return {}

    resp = httpx.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
        params={
            "db": db,
            "id": ",".join(id_list),
            "retmode": "json",
        },
    )
    resp.raise_for_status()
    return resp.json()
tools/semantic_scholar.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ import httpx
4
+
5
+
6
def search_journal_articles(query: str) -> str | list[dict]:
    """Get abstracts and summaries of related medical journal articles.

    Args:
        query: Search query for medical articles.

    Returns:
        A list of papers and their title, summary, abstract, published
        venue and year, and the number of citations. If no article
        matches, a short message saying so is returned instead.

    Raises:
        httpx.HTTPStatusError: If the last retry still gets an error status.
        httpx.TimeoutException: If the last retry still times out.
    """
    max_retries = 5
    for attempt in range(max_retries):
        try:
            resp = httpx.get(
                "https://api.semanticscholar.org/graph/v1/paper/search",
                params={
                    "query": query,
                    "limit": 20,
                    "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount",
                    "fieldsOfStudy": "Medicine,Biology",
                    "minCitationCount": 20,
                },
                timeout=10.0,
            )
            resp.raise_for_status()
            break
        except (httpx.HTTPStatusError, httpx.TimeoutException):
            # Give up only after the final attempt; otherwise wait and retry.
            if attempt == max_retries - 1:
                raise
            time.sleep(1)

    articles = []
    for paper in resp.json().get("data") or []:
        doi = (paper.get("externalIds") or {}).get("DOI")
        tldr = paper.get("tldr")
        articles.append(
            {
                # Prefer the DOI link; fall back to the Semantic Scholar page
                # so a paper without a DOI never gets ".../None" as its URL.
                "url": f"https://doi.org/{doi}" if doi else paper.get("url"),
                "title": paper["title"],
                "summary": tldr["text"] if tldr else "",
                "abstract": paper["abstract"],
                "venue": paper["venue"],
                "year": paper["year"],
                "citations": paper["citationCount"],
            }
        )

    # Previously this message sat in an unreachable `for ... else` branch.
    return articles or "No related articles found."