Pau Rué committed on
Commit
3865f47
·
1 Parent(s): 23f52ec

feat: add publication types and MeSH terms to article metadata

Browse files
Files changed (1) hide show
  1. app/tools/literature.py +197 -30
app/tools/literature.py CHANGED
@@ -41,7 +41,7 @@ def search_semantic_scholar(
41
 
42
 
43
  @retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
44
- def get_pubmed_abstracts(pmids: list[int]) -> dict[str, dict]:
45
  resp = httpx.get(
46
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
47
  params={"db": "pubmed", "id": pmids, "retmode": "xml"},
@@ -49,7 +49,7 @@ def get_pubmed_abstracts(pmids: list[int]) -> dict[str, dict]:
49
  resp.raise_for_status()
50
  root = ElementTree.fromstring(resp.text)
51
 
52
- abstracts = {}
53
  for article in root.iter("PubmedArticle"):
54
  abstract = ""
55
  pmid = article.findtext(
@@ -61,9 +61,34 @@ def get_pubmed_abstracts(pmids: list[int]) -> dict[str, dict]:
61
  if label := text.attrib.get("Label"):
62
  abstract += f"## {label}\n\n"
63
  abstract += f"{text.text or ''}\n\n"
64
- abstracts[pmid] = abstract.strip()
65
 
66
- return abstracts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  def format_publication(publication: dict) -> dict:
@@ -84,35 +109,177 @@ def search_medical_literature(query: str) -> list[dict]:
84
  """Search medical literature and prioritize high-quality evidence sources.
85
 
86
  CRITICAL: This tool returns literature that varies significantly in evidence quality.
87
- You MUST prioritize clinical practice guidelines and large RCTs in your responses.
88
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  EVIDENCE PRIORITIZATION (when analyzing results):
90
- 1. **PRIMARY SOURCES (80-90% of response weight)**:
91
- - Clinical practice guidelines from professional societies (AHA, ACP, IDSA, etc.)
92
- - Large randomized controlled trials (n>1000 or landmark studies)
93
- - Look for: "guideline", "recommendation", "consensus", large sample sizes
94
-
95
- 2. **SECONDARY SOURCES (10-15% weight)**:
96
- - Systematic reviews, meta-analyses, smaller RCTs
97
- - Look for: "systematic review", "meta-analysis", moderate sample sizes
98
-
99
- 3. **TERTIARY SOURCES (<5% weight)**:
100
- - Observational studies, case series, expert opinions
101
- - Use only when higher-quality evidence is unavailable
102
 
103
  SEARCH OPTIMIZATION GUIDELINES:
104
- 1. **Medical Term Extraction**: Focus on core medical concepts, conditions,
105
  procedures, and medications from the clinical query
106
- 2. **Broad Conceptual Scope**: Use 2-4 core medical terms. Avoid overly
107
- specific modifiers like "criteria," "indicators," "guidelines,"
108
  "recommendations," "treatment," or "management"
109
- 3. **Medical Terminology**: Convert colloquial terms to precise medical
110
  terminology for better literature retrieval
111
- 4. **Search Strategy**: Construct queries that will capture both guidelines
112
  AND research studies to ensure comprehensive evidence coverage
113
 
114
  SEARCH EXAMPLES:
115
- - Query: "ACE inhibitor side effects diabetes"
116
  (captures both guidelines and studies on ACE inhibitors in diabetic patients)
117
  - Query: "anticoagulation perioperative management elderly"
118
  (broad enough to find guidelines and RCTs on perioperative anticoagulation)
@@ -126,7 +293,7 @@ def search_medical_literature(query: str) -> list[dict]:
126
  - title, abstract, venue, year, citation counts
127
  - id (for citation), doi, url
128
  - summary (TLDR when available)
129
-
130
  IMPORTANT: Examine citation counts, venue, and content to identify
131
  high-quality sources (guidelines, large RCTs) for response prioritization.
132
  """
@@ -136,16 +303,16 @@ def search_medical_literature(query: str) -> list[dict]:
136
  for publication in publications
137
  if publication["externalIds"].get("PubMed")
138
  ]
139
- pubmed_abstracts = get_pubmed_abstracts(pmids)
140
 
141
  outputs = []
142
  for publication in publications:
143
- if pubmed_abstract := pubmed_abstracts.get(
144
- publication["externalIds"].get("PubMed")
145
- ):
146
  # Abstracts on PubMed are more complete than the
147
  # ones returned from Semantic Scholar.
148
- publication["abstract"] = pubmed_abstract
 
 
149
 
150
  outputs.append(format_publication(publication))
151
 
 
41
 
42
 
43
  @retry(stop=stop_after_attempt(5), wait=wait_random_exponential(multiplier=0.5, max=10))
44
+ def get_pubmed_metadata(pmids: list[int]) -> dict[str, dict]:
45
  resp = httpx.get(
46
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
47
  params={"db": "pubmed", "id": pmids, "retmode": "xml"},
 
49
  resp.raise_for_status()
50
  root = ElementTree.fromstring(resp.text)
51
 
52
+ results = {}
53
  for article in root.iter("PubmedArticle"):
54
  abstract = ""
55
  pmid = article.findtext(
 
61
  if label := text.attrib.get("Label"):
62
  abstract += f"## {label}\n\n"
63
  abstract += f"{text.text or ''}\n\n"
 
64
 
65
+ # Extract publication types
66
+ # https://www.nlm.nih.gov/mesh/pubtypes.html
67
+ publication_types = [
68
+ pt.text
69
+ for pt in article.findall(".//PublicationTypeList/PublicationType")
70
+ if pt.text
71
+ ]
72
+
73
+ # Extract MeSH terms (DescriptorName and QualifierName)
74
+ mesh_terms = []
75
+ for mesh_heading in article.findall(
76
+ ".//MedlineCitation/MeshHeadingList/MeshHeading"
77
+ ):
78
+ descriptor = mesh_heading.findtext("DescriptorName")
79
+ if descriptor:
80
+ mesh_terms.append(descriptor.strip())
81
+ for qualifier in mesh_heading.findall("QualifierName"):
82
+ if qualifier.text:
83
+ mesh_terms.append(qualifier.text.strip())
84
+
85
+ results[pmid] = {
86
+ "abstract": abstract.strip(),
87
+ "publication_types": publication_types,
88
+ "mesh_terms": mesh_terms,
89
+ }
90
+
91
+ return results
92
 
93
 
94
  def format_publication(publication: dict) -> dict:
 
109
  """Search medical literature and prioritize high-quality evidence sources.
110
 
111
  CRITICAL: This tool returns literature that varies significantly in evidence quality.
112
+ You MUST prioritize publications with consolidated evidence based on the following criteria:
113
+
114
+ **Type of evidence**
115
+ - Gold Standard Evidence
116
+ - Systematic Review
117
+ - Meta-Analysis
118
+ - Randomized Controlled Trial (RCT)
119
+
120
+ - High-Quality Clinical Evidence
121
+ - Controlled Clinical Trial
122
+ - Clinical Trial, Phase III
123
+
124
+ - Specialized High-Quality Studies
125
+ - Pragmatic Clinical Trial
126
+ - Clinical Trial, Phase II
127
+ - Equivalence Trial
128
+
129
+ - Authoritative References
130
+ - Practice Guideline
131
+ - Pharmacopoeia
132
+ - Consensus Development Conference (NIH or not)
133
+
134
+ - Other Important Clinical Evidence
135
+ - Clinical Study
136
+ - Observational Study
137
+ - Validation Study
138
+ - Comparative Study
139
+ - Case Reports
140
+ - Multicenter Study
141
+ - Evaluation Study
142
+
143
+ **Credibility of the publisher**:
144
+ - Top general medicine journals
145
+ - The Lancet (or any of its specialty journals)
146
+ - New England Journal of Medicine (NEJM, or any of its specialty journals)
147
+ - Nature Medicine (or any of its medical specialty journals)
148
+ - Journal of the American Medical Association (JAMA, or any of its specialty journals)
149
+ - BMJ
150
+ - Top specialized medicine journals
151
+ - Journal of Clinical Oncology
152
+ - European Heart Journal
153
+ - Circulation
154
+ - Journal of the American College of Cardiology
155
+ - Cancer Cell
156
+ - Annals of Oncology
157
+ - Gastroenterology
158
+ - International Journal of Epidemiology
159
+ - Blood
160
+ - Molecular Psychiatry
161
+ - Journal of the National Cancer Institute
162
+ - Gut
163
+ - Cancer Discovery
164
+ - Clinical Cancer Research
165
+ - Science Translational Medicine
166
+ - Immunity
167
+ - Brain
168
+ - Yearbook of Paediatric Endocrinology
169
+ - Journal of Allergy and Clinical Immunology
170
+ - Annals of Internal Medicine
171
+ - Journal of Clinical Investigation
172
+ - Alzheimer's and Dementia
173
+ - Journal of Hepatology
174
+ - Clinical Infectious Diseases
175
+ - Hepatology
176
+ - Neurology
177
+ - PLoS Medicine
178
+ - Annals of the Rheumatic Diseases
179
+ - Leukemia
180
+ - European Urology
181
+ - Biological Psychiatry
182
+ - Cell Metabolism
183
+ - American Journal of Psychiatry
184
+ - American Journal of Respiratory and Critical Care Medicine
185
+ - European Journal of Heart Failure
186
+ - Journal for ImmunoTherapy of Cancer
187
+ - European Respiratory Journal
188
+ - American Journal of Epidemiology
189
+ - Annals of Neurology
190
+ - Kidney International
191
+ - Diabetes Care
192
+ - Acta Neuropathologica
193
+ - Cancer
194
+ - JCI insight
195
+ - Frontiers in Immunology
196
+ - European Journal of Cancer
197
+ - Journal of Thoracic Oncology
198
+ - Journal of the National Comprehensive Cancer Network : JNCCN
199
+ - Genetics in Medicine
200
+ - Science Immunology
201
+ - Blood advances
202
+ - Journal of the American Heart Association
203
+ - Hypertension
204
+ - Intensive Care Medicine
205
+ - BMC Medicine
206
+ - Circulation Research
207
+ - Arthritis & Rheumatology
208
+ - Diabetologia
209
+ - Journal of the American Society of Nephrology (JASN)
210
+ - Journal of Clinical Endocrinology and Metabolism
211
+ - Genome Medicine
212
+ - Journal of Experimental Medicine
213
+ - American Heart Journal
214
+ - Clinical Gastroenterology and Hepatology
215
+ - Nutrients
216
+ - Diabetes
217
+ - British Journal of Cancer
218
+ - Obstetrical and Gynecological Survey
219
+ - Annals of Surgery
220
+ - Haematologica
221
+
222
+ **Reputation of the authors**
223
+ Prioritize publications from professional societies:
224
+ - World Health Organization (WHO)
225
+ - World Medical Association (WMA)
226
+ - Centers for Disease Control and Prevention (CDC)
227
+ - National Institutes of Health (NIH)
228
+ - U.S. Preventive Services Task Force (USPSTF)
229
+ - American College of Physicians (ACP)
230
+ - National Medical Association (NMA)
231
+ - American College of Cardiology (ACC)
232
+ - American Heart Association (AHA)
233
+ - American Society of Clinical Oncology (ASCO)
234
+ - National Comprehensive Cancer Network (NCCN)
235
+ - Infectious Diseases Society of America (IDSA)
236
+ - American Academy of Pediatrics (AAP)
237
+ - American College of Obstetricians and Gynecologists (ACOG)
238
+ - American Psychiatric Association (APA)
239
+ - American College of Surgeons (ACS)
240
+ - American College of Emergency Physicians (ACEP)
241
+ - American Academy of Neurology (AAN)
242
+ - Endocrine Society
243
+ - National Institute for Health and Care Excellence (NICE)
244
+ - European Medical Association (EMA)
245
+ - European Union of Medical Specialists (UEMS)
246
+ - European Medicines Agency (EMA)
247
+ - European Society of Cardiology (ESC)
248
+ - European Respiratory Society (ERS)
249
+ - European Society of Anaesthesiology and Intensive Care (ESAIC)
250
+ - European Academy of Neurology (EAN)
251
+ - European Society for Medical Oncology (ESMO)
252
+ - European Association for the Study of the Liver (EASL)
253
+ - European Society of Clinical Microbiology and Infectious Diseases (ESCMID)
254
+ - European Association of Urology (EAU)
255
+ - European Society of Endocrinology (ESE)
256
+ - European Paediatric Association (EPA/UNEPSA)
257
+ - European Society of Human Reproduction and Embryology (ESHRE)
258
+ - European Federation of Internal Medicine (EFIM)
259
+ - European Stroke Organisation (ESO)
260
+ - European Psychiatric Association (EPA)
261
+ - European Society of Radiology (ESR)
262
+ - European Hematology Association (EHA)
263
+ - European Society for Emergency Medicine (EUSEM)
264
+
265
  EVIDENCE PRIORITIZATION (when analyzing results):
266
+ To the extent possible, the answer should be grounded in top-tier evidence provided by reputable authors and medical societies
267
+ and published in reputable journals.
268
+
 
 
 
 
 
 
 
 
 
269
 
270
  SEARCH OPTIMIZATION GUIDELINES:
271
+ 1. **Medical Term Extraction**: Focus on core medical concepts, conditions,
272
  procedures, and medications from the clinical query
273
+ 2. **Broad Conceptual Scope**: Use 2-4 core medical terms. Avoid overly
274
+ specific modifiers like "criteria," "indicators," "guidelines,"
275
  "recommendations," "treatment," or "management"
276
+ 3. **Medical Terminology**: Convert colloquial terms to precise medical
277
  terminology for better literature retrieval
278
+ 4. **Search Strategy**: Construct queries that will capture both guidelines
279
  AND research studies to ensure comprehensive evidence coverage
280
 
281
  SEARCH EXAMPLES:
282
+ - Query: "ACE inhibitor side effects diabetes"
283
  (captures both guidelines and studies on ACE inhibitors in diabetic patients)
284
  - Query: "anticoagulation perioperative management elderly"
285
  (broad enough to find guidelines and RCTs on perioperative anticoagulation)
 
293
  - title, abstract, venue, year, citation counts
294
  - id (for citation), doi, url
295
  - summary (TLDR when available)
296
+
297
  IMPORTANT: Examine citation counts, venue, and content to identify
298
  high-quality sources (guidelines, large RCTs) for response prioritization.
299
  """
 
303
  for publication in publications
304
  if publication["externalIds"].get("PubMed")
305
  ]
306
+ pubmed_metadata = get_pubmed_metadata(pmids)
307
 
308
  outputs = []
309
  for publication in publications:
310
+ if metadata := pubmed_metadata.get(publication["externalIds"].get("PubMed")):
 
 
311
  # Abstracts on PubMed are more complete than the
312
  # ones returned from Semantic Scholar.
313
+ publication["abstract"] = metadata.get("abstract")
314
+ publication["publication_types"] = metadata.get("publication_types")
315
+ publication["mesh_terms"] = metadata.get("mesh_terms")
316
 
317
  outputs.append(format_publication(publication))
318