David Chu committed on
Commit
0f4b0ea
·
unverified ·
1 Parent(s): 4782a9d

feat: reference citations by a short id

Browse files

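Citing sources by title and URL significantly increases output length,
especially when the same source is reused across table rows, which extends generation time.
So I switched to giving each reference a short ID that is resolved back to the full source after generation.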

app/agent.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import re
2
  from pathlib import Path
3
 
@@ -23,8 +24,7 @@ RESPONSE_FORMAT = """\
23
 
24
  Return in JSON matching this specification:
25
 
26
- Source = { "title": string, "url": str }
27
- Statement = { "text": string, "sources": array<Source> }
28
  Return: array<Statement>
29
 
30
  Do not return the response in a markdown code block.
@@ -37,7 +37,7 @@ SOURCE_TOOL_NAMES = {
37
 
38
 
39
  def hydrate_sources(
40
- statements: models.Statements, calling_history: list[types.Content]
41
  ) -> models.Statements:
42
  sources = {}
43
  for call in calling_history:
@@ -48,16 +48,21 @@ def hydrate_sources(
48
  and func.response
49
  ):
50
  for source in func.response.get("result", []):
51
- sources[source["url"]] = source
52
 
53
- for statement in statements.statements:
54
- if statement.sources:
55
- statement.sources = [
56
- models.Source.model_validate(sources[source.url])
57
- for source in statement.sources
58
- ]
 
 
 
 
 
59
 
60
- return statements
61
 
62
 
63
  def validate_response(response: types.GenerateContentResponse) -> models.Statements:
@@ -76,17 +81,15 @@ def validate_response(response: types.GenerateContentResponse) -> models.Stateme
76
  text = match.group(1).strip()
77
 
78
  try:
79
- statements = models.Statements.model_validate_json(f'{{"statements":{text}}}')
 
 
80
  statements.thoughts = thoughts
81
- except ValidationError:
82
  statements = models.Statements(
83
  statements=[models.Statement(text=text)],
84
  thoughts=thoughts,
85
  )
86
-
87
- statements = hydrate_sources(
88
- statements, response.automatic_function_calling_history or []
89
- )
90
  return statements
91
 
92
 
 
1
+ import json
2
  import re
3
  from pathlib import Path
4
 
 
24
 
25
  Return in JSON matching this specification:
26
 
27
+ Statement = { "text": string, "sources": array<string> } // the `sources` array contains the ID of the sources
 
28
  Return: array<Statement>
29
 
30
  Do not return the response in a markdown code block.
 
37
 
38
 
39
  def hydrate_sources(
40
+ statements: list[dict], calling_history: list[types.Content]
41
  ) -> models.Statements:
42
  sources = {}
43
  for call in calling_history:
 
48
  and func.response
49
  ):
50
  for source in func.response.get("result", []):
51
+ sources[source["id"]] = source
52
 
53
+ for statement in statements:
54
+ if statement.get("sources"):
55
+ statement_sources = []
56
+ for source_id in statement["sources"]:
57
+ try:
58
+ statement_sources.append(sources[source_id])
59
+ except KeyError:
60
+ print("citaion hullucination")
61
+ print(source_id)
62
+ print(sources)
63
+ statement["sources"] = statement_sources
64
 
65
+ return models.Statements.model_validate({"statements": statements})
66
 
67
 
68
  def validate_response(response: types.GenerateContentResponse) -> models.Statements:
 
81
  text = match.group(1).strip()
82
 
83
  try:
84
+ statements = hydrate_sources(
85
+ json.loads(text), response.automatic_function_calling_history or []
86
+ )
87
  statements.thoughts = thoughts
88
+ except (json.decoder.JSONDecodeError, ValidationError):
89
  statements = models.Statements(
90
  statements=[models.Statement(text=text)],
91
  thoughts=thoughts,
92
  )
 
 
 
 
93
  return statements
94
 
95
 
app/system_instruction.txt CHANGED
@@ -2,9 +2,9 @@ You are a medical research expert providing evidence-based guidance to healthcar
2
 
3
  ## Response Guidelines
4
 
5
- 1. **Conciseness**: Provide focused answers to medical queries using no more than 250 words, prioritizing clinical relevance and actionability
6
  2. **Evidence-based content**: Base all recommendations on current medical literature, clearly distinguishing between established evidence and emerging findings
7
- 3. **Structured presentation**: Use markdown tables to compare treatments, dosages, diagnostic criteria, or clinical findings when multiple options exist
8
  4. **Enhanced readability**:
9
  - Use **bold formatting** for key clinical points, drug names, and critical recommendations
10
  - Use *italics* for emphasis on important considerations or contraindications
@@ -55,13 +55,14 @@ If none of the sources contain relevant information to answer the query, politel
55
 
56
  Produce JSON matching this specification:
57
 
58
- Source = { "title": string, "url": str }
59
- Statement = { "text": string, "sources": array<Source> }
60
  Return: array<Statement>
61
 
62
  Do not return the response in a markdown code block.
63
 
64
- ## Good Response Examples
 
 
65
 
66
  * Query: Management of bleeding from a duodenal ulcer when endoscopic treatment fails
67
  Response:
@@ -86,30 +87,12 @@ Response:
86
  },
87
  {
88
  "text": "\n| **TAE** | 15–40% (↑ vs surgery) | \\~8% | \\~9 days | \\~15% | Minimally invasive, operator- and technique-dependent. Preferred in high-risk patients. | ",
89
- "sources": [
90
- {
91
- "title": "Bleeding Duodenal Ulcer: Strategies in High-Risk Ulcers.",
92
- "url": "https://doi.org/10.1159/000513689"
93
- },
94
- {
95
- "title": "Management of bleeding peptic duodenal ulcer refractory to endoscopic treatment: surgery or transcatheter arterial embolization as first-line therapy? A retrospectivesingle-center study and systematic review.",
96
- "url": "https://doi.org/10.1007/s00068-020-01356-7"
97
- }
98
- ]
99
  },
100
  { "text": " |" },
101
  {
102
  "text": "\n| **Surgery** | Lower (RR 0.55 vs TAE) | \\~32.2% | \\~18 days | \\~14–15% | Lower rebleeding but higher morbidity. Longer recovery. Requires surgical expertise. | ",
103
- "sources": [
104
- {
105
- "title": "Bleeding Duodenal Ulcer: Strategies in High-Risk Ulcers.",
106
- "url": "https://doi.org/10.1159/000513689"
107
- },
108
- {
109
- "title": "Management of bleeding peptic duodenal ulcer refractory to endoscopic treatment: surgery or transcatheter arterial embolization as first-line therapy? A retrospectivesingle-center study and systematic review.",
110
- "url": "https://doi.org/10.1007/s00068-020-01356-7"
111
- }
112
- ]
113
  },
114
  { "text": " |" },
115
  { "text": "\n\n**Clinical Decision Should Consider:**" },
@@ -135,98 +118,44 @@ Response:
135
  },
136
  {
137
  "text": "\n| **Surgical Technique** | One gastrojejunal anastomosis | Two anastomoses (gastrojejunal + jejunojejunal) | OAGB is technically simpler",
138
- "sources": [
139
- {
140
- "title": "Efficacy and Safety of One Anastomosis Gastric Bypass Versus Roux-en-Y Gastric Bypass for Obesity: a Meta-analysis and Systematic Review.",
141
- "url": "https://doi.org/10.1007/s11695-022-06401-5"
142
- }
143
- ]
144
  },
145
  { "text": " |" },
146
  {
147
  "text": "\n| **% Excess BMI Loss (5 yrs)** | \~75.6% | \~71.4% | Non-inferior (YOMEGA study)",
148
- "sources": [
149
- {
150
- "title": "Efficacy and safety of one anastomosis gastric bypass versus Roux-en-Y gastric bypass at 5 years (YOMEGA): a prospective, open-label, non-inferiority, randomised extension study.",
151
- "url": "https://doi.org/10.1016/S2213-8587(24)00035-4"
152
- }
153
- ]
154
  },
155
  { "text": " |" },
156
  {
157
  "text": "\n| **T2DM Remission** | Comparable | Comparable | Similar remission rates",
158
- "sources": [
159
- {
160
- "title": "Efficacy and safety of one anastomosis gastric bypass versus Roux-en-Y gastric bypass at 5 years (YOMEGA): a prospective, open-label, non-inferiority, randomised extension study.",
161
- "url": "https://doi.org/10.1016/S2213-8587(24)00035-4"
162
- },
163
- {
164
- "title": "Remission of Type 2 Diabetes Mellitus (T2DM) after Sleeve Gastrectomy (SG), One-Anastomosis Gastric Bypass (OAGB), and Roux-en-Y Gastric Bypass (RYGB): A Systematic Review.",
165
- "url": "https://doi.org/10.3390/medicina59050985"
166
- },
167
- {
168
- "title": "Efficacy and Safety of One Anastomosis Gastric Bypass Versus Roux-en-Y Gastric Bypass for Obesity: a Meta-analysis and Systematic Review.",
169
- "url": "https://doi.org/10.1007/s11695-022-06401-5"
170
- }
171
- ]
172
  },
173
  { "text": " |" },
174
  {
175
  "text": "\n| **GERD (clinical or de novo)** | Higher (41% clinical GERD; 6.3% de novo) | Lower (18% clinical GERD; \~0.5% de novo) | Significantly more GERD with OAGB",
176
- "sources": [
177
- {
178
- "title": "Efficacy and safety of one anastomosis gastric bypass versus Roux-en-Y gastric bypass at 5 years (YOMEGA): a prospective, open-label, non-inferiority, randomised extension study.",
179
- "url": "https://doi.org/10.1016/S2213-8587(24)00035-4"
180
- },
181
- {
182
- "title": "One-anastomosis gastric bypass (OAGB) versus Roux-en-Y gastric bypass (RYGB) as revisional procedures after failed laparoscopic sleeve gastrectomy (LSG): systematic review and meta-analysis of comparative studies.",
183
- "url": "https://doi.org/10.1007/s00423-023-03175-x"
184
- },
185
- {
186
- "title": "Efficacy and Safety of One Anastomosis Gastric Bypass Versus Roux-en-Y Gastric Bypass for Obesity: a Meta-analysis and Systematic Review.",
187
- "url": "https://doi.org/10.1007/s11695-022-06401-5"
188
- }
189
- ]
190
  },
191
  { "text": " |" },
192
  {
193
  "text": "\n| **Conversion/Revisional Rate** | \~8% converted from OAGB to RYGB | Not reported | Due to GERD symptoms",
194
  "sources": [
195
- {
196
- "title": "Efficacy and safety of one anastomosis gastric bypass versus Roux-en-Y gastric bypass at 5 years (YOMEGA): a prospective, open-label, non-inferiority, randomised extension study.",
197
- "url": "https://doi.org/10.1016/S2213-8587(24)00035-4"
198
- }
199
  ]
200
  },
201
  { "text": " |" },
202
  {
203
  "text": "\n| **Early Post-op Complications** | Fewer | More | Lower early complication rate in OAGB",
204
- "sources": [
205
- {
206
- "title": "Efficacy and Safety of One Anastomosis Gastric Bypass Versus Roux-en-Y Gastric Bypass for Obesity: a Meta-analysis and Systematic Review.",
207
- "url": "https://doi.org/10.1007/s11695-022-06401-5"
208
- }
209
- ]
210
  },
211
  { "text": " |" },
212
  {
213
  "text": "\n| **Operative Time** | Shorter | Longer | Statistically shorter in OAGB",
214
- "sources": [
215
- {
216
- "title": "Efficacy and Safety of One Anastomosis Gastric Bypass Versus Roux-en-Y Gastric Bypass for Obesity: a Meta-analysis and Systematic Review.",
217
- "url": "https://doi.org/10.1007/s11695-022-06401-5"
218
- }
219
- ]
220
  },
221
  { "text": " |" },
222
  {
223
  "text": "\n| **Learning Curve** | Easier | Steeper | Simpler procedure, useful for training",
224
- "sources": [
225
- {
226
- "title": "Efficacy and Safety of One Anastomosis Gastric Bypass Versus Roux-en-Y Gastric Bypass for Obesity: a Meta-analysis and Systematic Review.",
227
- "url": "https://doi.org/10.1007/s11695-022-06401-5"
228
- }
229
- ]
230
  },
231
  { "text": " |" }
232
  ]
 
2
 
3
  ## Response Guidelines
4
 
5
+ 1. **Conciseness**: Provide focused answers to medical queries in one paragraph, prioritizing clinical relevance and actionability
6
  2. **Evidence-based content**: Base all recommendations on current medical literature, clearly distinguishing between established evidence and emerging findings
7
+ 3. **Structured presentation**: Use Markdown tables to compare treatments, dosages, diagnostic criteria, or clinical findings when multiple options exist
8
  4. **Enhanced readability**:
9
  - Use **bold formatting** for key clinical points, drug names, and critical recommendations
10
  - Use *italics* for emphasis on important considerations or contraindications
 
55
 
56
  Produce JSON matching this specification:
57
 
58
+ Statement = { "text": string, "sources": array<string> } // the `sources` array contains the ID of the sources
 
59
  Return: array<Statement>
60
 
61
  Do not return the response in a markdown code block.
62
 
63
+ ## Examples
64
+
65
+ Below are a few examples showing what a good answer looks like:
66
 
67
  * Query: Management of bleeding from a duodenal ulcer when endoscopic treatment fails
68
  Response:
 
87
  },
88
  {
89
  "text": "\n| **TAE** | 15–40% (↑ vs surgery) | \\~8% | \\~9 days | \\~15% | Minimally invasive, operator- and technique-dependent. Preferred in high-risk patients. | ",
90
+ "sources": ["sch-9wn", "sch-4l1"]
 
 
 
 
 
 
 
 
 
91
  },
92
  { "text": " |" },
93
  {
94
  "text": "\n| **Surgery** | Lower (RR 0.55 vs TAE) | \\~32.2% | \\~18 days | \\~14–15% | Lower rebleeding but higher morbidity. Longer recovery. Requires surgical expertise. | ",
95
+ "sources": ["sch-9wn", "sch-4l1"]
 
 
 
 
 
 
 
 
 
96
  },
97
  { "text": " |" },
98
  { "text": "\n\n**Clinical Decision Should Consider:**" },
 
118
  },
119
  {
120
  "text": "\n| **Surgical Technique** | One gastrojejunal anastomosis | Two anastomoses (gastrojejunal + jejunojejunal) | OAGB is technically simpler",
121
+ "sources": ["sch-8rz"]
 
 
 
 
 
122
  },
123
  { "text": " |" },
124
  {
125
  "text": "\n| **% Excess BMI Loss (5 yrs)** | \~75.6% | \~71.4% | Non-inferior (YOMEGA study)",
126
+ "sources": ["sch-zi3"]
 
 
 
 
 
127
  },
128
  { "text": " |" },
129
  {
130
  "text": "\n| **T2DM Remission** | Comparable | Comparable | Similar remission rates",
131
+ "sources": ["sch-zi3", "sch-5vf", "sch-8rz"]
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  },
133
  { "text": " |" },
134
  {
135
  "text": "\n| **GERD (clinical or de novo)** | Higher (41% clinical GERD; 6.3% de novo) | Lower (18% clinical GERD; \~0.5% de novo) | Significantly more GERD with OAGB",
136
+ "sources": ["sch-zi3", "sch-cdf", "sch-8rz"]
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  },
138
  { "text": " |" },
139
  {
140
  "text": "\n| **Conversion/Revisional Rate** | \~8% converted from OAGB to RYGB | Not reported | Due to GERD symptoms",
141
  "sources": [
142
+ "sch-zi3"
 
 
 
143
  ]
144
  },
145
  { "text": " |" },
146
  {
147
  "text": "\n| **Early Post-op Complications** | Fewer | More | Lower early complication rate in OAGB",
148
+ "sources": ["sch-8rz"]
 
 
 
 
 
149
  },
150
  { "text": " |" },
151
  {
152
  "text": "\n| **Operative Time** | Shorter | Longer | Statistically shorter in OAGB",
153
+ "sources": ["sch-8rz"]
 
 
 
 
 
154
  },
155
  { "text": " |" },
156
  {
157
  "text": "\n| **Learning Curve** | Easier | Steeper | Simpler procedure, useful for training",
158
+ "sources": ["sch-8rz"]
 
 
 
 
 
159
  },
160
  { "text": " |" }
161
  ]
app/tools/dailymed.py CHANGED
@@ -1,5 +1,7 @@
1
  import httpx
2
 
 
 
3
 
4
  def find_drug_set_ids(name: str) -> list[dict]:
5
  """Get the Set IDs of drugs by a name.
@@ -23,6 +25,7 @@ def find_drug_set_ids(name: str) -> list[dict]:
23
  "venue": "DailyMed",
24
  "year": row["published_date"][-4:], # Original format: "May 05, 2025"
25
  "url": f"https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid={row['setid']}",
 
26
  }
27
  for row in resp.json()["data"]
28
  ]
 
1
  import httpx
2
 
3
+ from app.tools.utils import generate_id
4
+
5
 
6
  def find_drug_set_ids(name: str) -> list[dict]:
7
  """Get the Set IDs of drugs by a name.
 
25
  "venue": "DailyMed",
26
  "year": row["published_date"][-4:], # Original format: "May 05, 2025"
27
  "url": f"https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid={row['setid']}",
28
+ "id": f"med-{generate_id(row['setid'])}",
29
  }
30
  for row in resp.json()["data"]
31
  ]
app/tools/literature.py CHANGED
@@ -4,6 +4,7 @@ import httpx
4
  from tenacity import retry, stop_after_attempt, wait_random_exponential
5
 
6
  from app.config import settings
 
7
 
8
 
9
  @retry(
@@ -69,6 +70,7 @@ def format_publication(publication: dict) -> dict:
69
  publication["doi"] = doi
70
  if doi:
71
  publication["url"] = f"https://doi.org/{doi}"
 
72
  return publication
73
 
74
 
 
4
  from tenacity import retry, stop_after_attempt, wait_random_exponential
5
 
6
  from app.config import settings
7
+ from app.tools.utils import generate_id
8
 
9
 
10
  @retry(
 
70
  publication["doi"] = doi
71
  if doi:
72
  publication["url"] = f"https://doi.org/{doi}"
73
+ publication["id"] = f"sch-{generate_id(publication['url'])}"
74
  return publication
75
 
76
 
app/tools/utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import string
3
+
4
+
5
def generate_id(text: str, length: int = 3) -> str:
    """Derive a short, deterministic base-36 identifier from *text*.

    The identifier is made of the ``length`` least-significant base-36
    digits (alphabet ``a-z`` followed by ``0-9``) of the MD5 digest of
    *text*, most-significant digit first.  The same input always yields the
    same identifier, so it can be recomputed instead of stored.

    NOTE: the identifier space is small.  With the default ``length`` of 3
    there are only 36**3 = 46,656 possible values, so collisions between
    different inputs become likely once a few hundred items share a
    namespace.  Callers that key a mapping by this identifier should expect
    occasional collisions or pass a larger ``length``.

    Args:
        text: Any string that uniquely identifies the item (a URL, a set
            ID, ...).
        length: Number of characters to produce, between 1 and 24.  The
            upper bound is the number of full base-36 digits available in a
            128-bit MD5 digest.

    Returns:
        A string of exactly ``length`` characters drawn from ``a-z0-9``.

    Raises:
        ValueError: If ``length`` is outside the range 1..24.
    """
    if not 1 <= length <= 24:
        raise ValueError(f"length must be between 1 and 24, got {length}")

    alphabet = string.ascii_lowercase + string.digits
    base = len(alphabet)

    # MD5 is used purely as a fast, stable fingerprint here, not for
    # security; flagging it as such keeps it usable on FIPS-restricted builds.
    digest = hashlib.md5(text.encode(), usedforsecurity=False).hexdigest()
    remaining = int(digest, 16)

    # Peel off base-36 digits starting with the least significant one, then
    # reverse so the most significant of the kept digits comes first.
    digits = []
    for _ in range(length):
        remaining, index = divmod(remaining, base)
        digits.append(alphabet[index])

    return "".join(reversed(digits))