David Chu committed on
Commit
f2c42a8
·
unverified ·
1 Parent(s): bd5557a

feat: add abstract from pubmed

Browse files
main.py CHANGED
@@ -7,7 +7,7 @@ import streamlit as st
7
  from google import genai
8
  from google.genai import types
9
 
10
- from tools import dailymed, semantic_scholar
11
 
12
  SYSTEM_INSTRUCTION = Path("system_instruction.txt").read_text()
13
 
@@ -17,7 +17,7 @@ def respond(client: genai.Client, query: str) -> str:
17
  tools=[
18
  dailymed.find_drug_set_ids,
19
  dailymed.find_drug_instruction,
20
- semantic_scholar.search_journal_articles,
21
  ],
22
  system_instruction=SYSTEM_INSTRUCTION,
23
  )
 
7
  from google import genai
8
  from google.genai import types
9
 
10
+ from tools import dailymed, literature
11
 
12
  SYSTEM_INSTRUCTION = Path("system_instruction.txt").read_text()
13
 
 
17
  tools=[
18
  dailymed.find_drug_set_ids,
19
  dailymed.find_drug_instruction,
20
+ literature.search_medical_literature,
21
  ],
22
  system_instruction=SYSTEM_INSTRUCTION,
23
  )
pyproject.toml CHANGED
@@ -10,6 +10,7 @@ dependencies = [
10
  "pydantic>=2.11.4",
11
  "python-fasthtml>=0.12.15",
12
  "streamlit>=1.45.0",
 
13
  ]
14
 
15
  [tool.ruff.lint]
 
10
  "pydantic>=2.11.4",
11
  "python-fasthtml>=0.12.15",
12
  "streamlit>=1.45.0",
13
+ "tenacity>=9.1.2",
14
  ]
15
 
16
  [tool.ruff.lint]
system_instruction.txt CHANGED
@@ -1,12 +1,12 @@
1
- You are a medical research expert. Provide a concise answer to the query below, using no more than 250 words.
2
 
3
- Base every claim or statement strictly on the sources returned from the tool calls. For each claim, include a citation referencing the source's ID (do not include the citation in the `text` field). A claim may be supported by one or multiple sources, but only cite sources that directly support the claim. Do not add unnecessary citations.
4
 
5
- You may use markdown formatting, such as **, to highlight key parts of the text. Do not return the response in a markdown code block.
6
 
7
- If none of the sources contain relevant information to answer the query, politely inform the user that an answer cannot be provided, and do not use any citations.
8
 
9
- If the query is not related to medicine, politely decline to answer.
10
 
11
  Produce JSON matching this specification:
12
 
 
1
+ You are a medical research expert. Your audience is medical professionals.
2
 
3
+ Provide a concise answer to medical-related queries, using no more than 250 words. Emphasize readability so the reader can grasp the key points quickly. You may use markdown formatting, such as **, to highlight key parts of the text.
4
 
5
+ If the query is not related to medicine, politely decline to answer.
6
 
7
+ Base every claim or statement strictly on the sources returned from the tool calls. For each claim, include a citation referencing the source's ID (do not include the citation in the `text` field). A claim may be supported by one or multiple sources, but only cite sources that directly support the claim. Do not add unnecessary citations.
8
 
9
+ If none of the sources contain relevant information to answer the query, politely inform the user that an answer cannot be provided, and do not use any citations.
10
 
11
  Produce JSON matching this specification:
12
 
tools/literature.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time
from xml.etree import ElementTree

import httpx
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)
6
+
7
+
8
@retry(
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=0.5, max=10),
    # Only retry transient HTTP failures (timeouts, transport errors,
    # error statuses) — not JSON/programming errors, which a retry
    # cannot fix. `reraise=True` surfaces the original exception
    # instead of tenacity's RetryError wrapper.
    retry=retry_if_exception_type(httpx.HTTPError),
    reraise=True,
)
def search_semantic_scholar(
    query: str, top_k: int = 20, min_citation_count: int = 20
) -> list[dict]:
    """Search the Semantic Scholar Graph API for papers matching *query*.

    Args:
        query: Free-text search query.
        top_k: Maximum number of papers to request.
        min_citation_count: Papers with fewer citations are filtered
            out server-side.

    Returns:
        The raw paper records from the response's ``data`` field
        (empty list when nothing matched).

    Raises:
        httpx.HTTPError: If the request still fails after 5 attempts.
    """
    resp = httpx.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params={
            "query": query,
            "limit": top_k,
            "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount,influentialCitationCount",
            "fieldsOfStudy": "Medicine,Biology",
            "minCitationCount": min_citation_count,
        },
        timeout=10.0,
    )
    resp.raise_for_status()
    return resp.json().get("data", [])
25
+
26
+
27
@retry(
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=0.5, max=10),
    # Retry only transient HTTP failures; re-raise the original error.
    retry=retry_if_exception_type(httpx.HTTPError),
    reraise=True,
)
def get_pubmed_abstracts(pmids: list[int]) -> dict[str, str]:
    """Fetch abstracts for the given PubMed IDs via NCBI E-utilities.

    XML parsing approach referenced from the `pymed` library.

    Args:
        pmids: PubMed IDs to look up.

    Returns:
        Mapping of PMID (as a string) to its abstract text, with any
        section labels rendered as markdown ``##`` headings. Empty
        dict when ``pmids`` is empty.

    Raises:
        httpx.HTTPError: If the request still fails after 5 attempts.
    """
    if not pmids:
        # efetch errors on an empty id list; skip the round trip.
        return {}
    resp = httpx.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        # efetch documents `id` as a single comma-separated list, not
        # repeated query parameters.
        params={"db": "pubmed", "id": ",".join(map(str, pmids)), "retmode": "xml"},
        timeout=10.0,
    )
    resp.raise_for_status()
    root = ElementTree.fromstring(resp.text)

    abstracts: dict[str, str] = {}
    for article in root.iter("PubmedArticle"):
        pmid = article.findtext(
            ".//PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']", default=""
        )
        sections = []
        for text in article.findall(".//AbstractText"):
            # Structured abstracts carry a Label per section
            # (e.g. BACKGROUND, METHODS); render it as a heading.
            if label := text.attrib.get("Label"):
                sections.append(f"## {label}\n\n")
            sections.append(f"{text.text or ''}\n\n")
        abstracts[pmid] = "".join(sections).strip()

    return abstracts
52
+
53
+
54
def format_publication(publication: dict) -> str:
    """Render one Semantic Scholar paper record as an XML-ish text block.

    Args:
        publication: A paper record from the Semantic Scholar Graph
            API containing title/tldr/abstract/externalIds/url/venue/
            year/citation fields.

    Returns:
        A ``<publication>`` element string with url, summary,
        abstract, venue, year, and citation counts as child tags.
    """
    title = publication["title"]
    # The API returns `"tldr": null` for papers without a generated
    # summary — `.get("tldr", {})` would hand None to `.get("text")`.
    summary = (publication.get("tldr") or {}).get("text") or ""
    # A null abstract would otherwise render as the literal "None".
    abstract = publication["abstract"] or ""
    venue = publication["venue"]
    year = publication["year"]
    citations = publication["citationCount"]
    influential_citations = publication["influentialCitationCount"]
    doi = publication["externalIds"].get("DOI")
    # Prefer a stable DOI link; fall back to the Semantic Scholar URL.
    url = f"https://doi.org/{doi}" if doi else publication["url"]
    return (
        f"<publication title={title}>\n<url>{url}</url>\n"
        f"<summary>{summary}</summary>\n<abstract>{abstract}</abstract>\n"
        f"<venue>{venue}</venue>\n<year>{year}</year>\n"
        f"<citationCount>{citations}</citationCount>\n<influentialCitationCount>{influential_citations}</influentialCitationCount>\n"
        "</publication>"
    )
71
+
72
+
73
def search_medical_literature(query: str) -> str:
    """Get medical literature related to the query.

    Searches Semantic Scholar, then swaps in the richer (sectioned)
    PubMed abstract for any paper that has a PubMed ID.

    Args:
        query: keywords, a topic, or a concept to search
            for medical literature.

    Returns:
        A list of papers and their details, including title,
        abstract, publication venue, citation numbers, etc.,
        wrapped in <publications> tags, or "No literature found"
        when the search returns nothing.
    """
    publications = search_semantic_scholar(query=query, top_k=20)
    # NOTE(review): `externalIds` is assumed to always be a dict when
    # requested in `fields` — guard with `or {}` if the API can null it.
    pmids = [
        publication["externalIds"]["PubMed"]
        for publication in publications
        if publication["externalIds"].get("PubMed")
    ]
    # Avoid a pointless PubMed round trip when no paper has a PMID.
    pubmed_abstracts = get_pubmed_abstracts(pmids) if pmids else {}

    outputs = []
    for publication in publications:
        pmid = publication["externalIds"].get("PubMed")
        # Prefer the labelled PubMed abstract when one was fetched.
        if pubmed_abstract := pubmed_abstracts.get(pmid):
            publication["abstract"] = pubmed_abstract
        outputs.append(format_publication(publication))

    if not outputs:
        return "No literature found"
    # Joined outside the f-string: a backslash inside an f-string
    # expression requires Python 3.12+.
    joined = "\n".join(outputs)
    return f"<publications>\n{joined}\n</publications>"
tools/semantic_scholar.py DELETED
@@ -1,53 +0,0 @@
1
- import time
2
-
3
- import httpx
4
-
5
-
6
- def search_journal_articles(query: str) -> str | list[dict]:
7
- """Get abstracts and summaries of related medical journal aritcles.
8
-
9
- Args:
10
- query: Search query for medical articles.
11
-
12
- Returns:
13
- A list of papers and thier title, summary, published
14
- vanue and year, and the number of citations.
15
- """
16
- max_retries = 5
17
- for attempt in range(max_retries):
18
- try:
19
- resp = httpx.get(
20
- "https://api.semanticscholar.org/graph/v1/paper/search",
21
- params={
22
- "query": query,
23
- "limit": 20,
24
- "fields": "title,tldr,abstract,externalIds,url,venue,year,citationCount",
25
- "fieldsOfStudy": "Medicine,Biology",
26
- "minCitationCount": 20,
27
- },
28
- timeout=10.0,
29
- )
30
- resp.raise_for_status()
31
- break
32
- except (httpx.HTTPStatusError, httpx.TimeoutException) as err:
33
- if attempt < max_retries - 1:
34
- time.sleep(1)
35
- else:
36
- raise err
37
- else:
38
- return "No related articles found."
39
-
40
- results = resp.json()
41
- articles = []
42
- for article in results.get("data", []):
43
- article = {
44
- "url": f"https://doi.org/{article['externalIds'].get('DOI')}",
45
- "title": article["title"],
46
- "summary": article["tldr"]["text"] if article["tldr"] else "",
47
- "abstract": article["abstract"],
48
- "venue": article["venue"],
49
- "year": article["year"],
50
- "citations": article["citationCount"],
51
- }
52
- articles.append(article)
53
- return articles
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
uv.lock CHANGED
@@ -197,6 +197,7 @@ dependencies = [
197
  { name = "pydantic" },
198
  { name = "python-fasthtml" },
199
  { name = "streamlit" },
 
200
  ]
201
 
202
  [package.metadata]
@@ -206,6 +207,7 @@ requires-dist = [
206
  { name = "pydantic", specifier = ">=2.11.4" },
207
  { name = "python-fasthtml", specifier = ">=0.12.15" },
208
  { name = "streamlit", specifier = ">=1.45.0" },
 
209
  ]
210
 
211
  [[package]]
 
197
  { name = "pydantic" },
198
  { name = "python-fasthtml" },
199
  { name = "streamlit" },
200
+ { name = "tenacity" },
201
  ]
202
 
203
  [package.metadata]
 
207
  { name = "pydantic", specifier = ">=2.11.4" },
208
  { name = "python-fasthtml", specifier = ">=0.12.15" },
209
  { name = "streamlit", specifier = ">=1.45.0" },
210
+ { name = "tenacity", specifier = ">=9.1.2" },
211
  ]
212
 
213
  [[package]]