harshalmore31 committed on
Commit
ff4c876
·
1 Parent(s): 25d50f4

Implement code changes to enhance functionality and improve performance

Browse files
Files changed (1) hide show
  1. mai_dx/main.py +912 -336
mai_dx/main.py CHANGED
@@ -18,7 +18,7 @@ Key Features:
18
 
19
  Example Usage:
20
  # Standard MAI-DxO usage
21
- orchestrator = MaiDxOrchestrator(model_name="gpt-4.1")
22
  result = orchestrator.run(initial_case_info, full_case_details, ground_truth)
23
 
24
  # Budget-constrained variant
@@ -33,13 +33,16 @@ import os
33
  import json
34
  import sys
35
  import time
36
- from dataclasses import dataclass
37
  from enum import Enum
38
  from typing import Any, Dict, List, Union, Literal
39
 
40
  from loguru import logger
41
  from pydantic import BaseModel, Field
42
  from swarms import Agent, Conversation
 
 
 
43
 
44
  # Configure Loguru with beautiful formatting and features
45
  logger.remove() # Remove default handler
@@ -91,6 +94,131 @@ class AgentRole(Enum):
91
  JUDGE = "Judge"
92
 
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  @dataclass
95
  class DiagnosisResult:
96
  """Stores the final result of a diagnostic session."""
@@ -127,11 +255,13 @@ class MaiDxOrchestrator:
127
  Implements the MAI Diagnostic Orchestrator (MAI-DxO) framework.
128
  This class orchestrates a virtual panel of AI agents to perform sequential medical diagnosis,
129
  evaluates the final diagnosis, and tracks costs.
 
 
130
  """
131
 
132
  def __init__(
133
  self,
134
- model_name: str = "gpt-4.1",
135
  max_iterations: int = 10,
136
  initial_budget: int = 10000,
137
  mode: str = "no_budget", # "instant", "question_only", "budgeted", "no_budget", "ensemble"
@@ -139,7 +269,7 @@ class MaiDxOrchestrator:
139
  enable_budget_tracking: bool = False,
140
  ):
141
  """
142
- Initializes the MAI-DxO system.
143
 
144
  Args:
145
  model_name (str): The language model to be used by all agents.
@@ -161,6 +291,9 @@ class MaiDxOrchestrator:
161
  self.conversation = Conversation(
162
  time_enabled=True, autosave=False, save_enabled=False
163
  )
 
 
 
164
 
165
  # Enhanced cost model based on the paper's methodology
166
  self.test_cost_db = {
@@ -198,6 +331,20 @@ class MaiDxOrchestrator:
198
  f"🏥 MAI Diagnostic Orchestrator initialized successfully in '{mode}' mode with budget ${initial_budget:,}"
199
  )
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  def _init_agents(self) -> None:
202
  """Initializes all required agents with their specific roles and prompts."""
203
  self.agents = {
@@ -210,6 +357,7 @@ class MaiDxOrchestrator:
210
  "json" if role == AgentRole.CONSENSUS else "str"
211
  ),
212
  print_on=True, # Enable printing for all agents to see outputs
 
213
  )
214
  for role in AgentRole
215
  }
@@ -217,8 +365,354 @@ class MaiDxOrchestrator:
217
  f"👥 {len(self.agents)} virtual physician agents initialized and ready for consultation"
218
  )
219
 
220
- def _get_prompt_for_role(self, role: AgentRole) -> str:
221
- """Returns the system prompt for a given agent role."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  prompts = {
223
  AgentRole.HYPOTHESIS: (
224
  """
@@ -525,8 +1019,8 @@ class MaiDxOrchestrator:
525
  }
526
  return prompts[role]
527
 
528
- def _parse_json_response(self, response: str) -> Dict[str, Any]:
529
- """Safely parses a JSON string, returning a dictionary."""
530
  try:
531
  # Extract the actual response content from the agent response
532
  if isinstance(response, str):
@@ -600,14 +1094,59 @@ class MaiDxOrchestrator:
600
  logger.debug(
601
  f"Response content: {response[:500]}..."
602
  ) # Log first 500 chars
603
- # Fallback to a default action if parsing fails
604
- return {
605
- "action_type": "ask",
606
- "content": (
607
- "Could you please clarify the next best step? The previous analysis was inconclusive."
608
- ),
609
- "reasoning": "Fallback due to parsing error.",
610
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
 
612
  def _estimate_cost(self, tests: Union[List[str], str]) -> int:
613
  """Estimates the cost of diagnostic tests."""
@@ -696,191 +1235,124 @@ class MaiDxOrchestrator:
696
 
697
  return cost
698
 
699
- def _run_panel_deliberation(self) -> Action:
700
- """Orchestrates one round of debate among the virtual panel to decide the next action."""
701
  logger.info(
702
  "🩺 Virtual medical panel deliberation commenced - analyzing patient case"
703
  )
704
  logger.debug(
705
  "Panel members: Dr. Hypothesis, Dr. Test-Chooser, Dr. Challenger, Dr. Stewardship, Dr. Checklist"
706
  )
707
- panel_conversation = Conversation(
708
- time_enabled=True, autosave=False, save_enabled=False
709
- )
710
 
711
- # Prepare comprehensive panel context
712
- remaining_budget = self.initial_budget - self.cumulative_cost
 
 
 
713
  budget_status = (
714
  "EXCEEDED"
715
  if remaining_budget < 0
716
  else f"${remaining_budget:,}"
717
  )
718
 
719
- panel_context = f"""
720
- DIAGNOSTIC CASE STATUS - ROUND {len(self.conversation.return_history_as_string().split('Gatekeeper:')) - 1}
721
-
722
- === CASE INFORMATION ===
723
- {self.conversation.get_str()}
724
-
725
- === CURRENT STATE ===
726
- Differential Diagnosis: {self.differential_diagnosis}
727
- Cumulative Cost: ${self.cumulative_cost:,}
728
- Remaining Budget: {budget_status}
729
- Mode: {self.mode}
730
- Max Iterations: {self.max_iterations}
731
-
732
- === PANEL TASK ===
733
- Virtual medical panel, please deliberate systematically on the next best diagnostic action.
734
- Each specialist should provide their expert analysis in sequence.
735
  """
736
- panel_conversation.add("System", panel_context)
737
 
738
  # Check mode-specific constraints
739
  if self.mode == "instant":
740
  # For instant mode, skip deliberation and go straight to diagnosis
741
  action_dict = {
742
  "action_type": "diagnose",
743
- "content": (
744
- self.differential_diagnosis.split("\n")[0]
745
- if "\n" in self.differential_diagnosis
746
- else self.differential_diagnosis
747
- ),
748
  "reasoning": (
749
  "Instant diagnosis mode - providing immediate assessment based on initial presentation"
750
  ),
751
  }
752
  return Action(**action_dict)
753
 
754
- if self.mode == "question_only":
755
- # For question-only mode, prevent test ordering
756
- panel_context += "\n\nIMPORTANT: This is QUESTION-ONLY mode. You may ONLY ask patient questions, not order diagnostic tests."
757
- panel_conversation.add("System", panel_context)
 
 
 
 
 
 
 
758
 
759
- # Sequential expert deliberation with enhanced methodology
 
760
  try:
761
  # Dr. Hypothesis - Differential diagnosis and probability assessment
762
- logger.info(
763
- "🧠 Dr. Hypothesis analyzing differential diagnosis..."
764
- )
765
- hypothesis = self.agents[AgentRole.HYPOTHESIS].run(
766
- panel_conversation.get_str()
767
- )
768
- self.differential_diagnosis = (
769
- hypothesis # Update main state
770
- )
771
- panel_conversation.add(
772
- self.agents[AgentRole.HYPOTHESIS].agent_name,
773
- hypothesis,
774
- )
775
 
776
  # Dr. Test-Chooser - Information value optimization
777
- logger.info(
778
- "🔬 Dr. Test-Chooser selecting optimal tests..."
779
- )
780
- test_choices = self.agents[AgentRole.TEST_CHOOSER].run(
781
- panel_conversation.get_str()
782
- )
783
- panel_conversation.add(
784
- self.agents[AgentRole.TEST_CHOOSER].agent_name,
785
- test_choices,
786
- )
787
 
788
  # Dr. Challenger - Bias identification and alternative hypotheses
789
- logger.info(
790
- "🤔 Dr. Challenger challenging assumptions..."
791
- )
792
- challenges = self.agents[AgentRole.CHALLENGER].run(
793
- panel_conversation.get_str()
794
- )
795
- panel_conversation.add(
796
- self.agents[AgentRole.CHALLENGER].agent_name,
797
- challenges,
798
- )
799
 
800
  # Dr. Stewardship - Cost-effectiveness analysis
801
- logger.info(
802
- "💰 Dr. Stewardship evaluating cost-effectiveness..."
803
- )
804
- stewardship_context = panel_conversation.get_str()
805
  if self.enable_budget_tracking:
806
- stewardship_context += f"\n\nBUDGET TRACKING ENABLED - Current cost: ${self.cumulative_cost}, Remaining: ${remaining_budget}"
807
- stewardship_rec = self.agents[AgentRole.STEWARDSHIP].run(
808
- stewardship_context
809
- )
810
- panel_conversation.add(
811
- self.agents[AgentRole.STEWARDSHIP].agent_name,
812
- stewardship_rec,
813
- )
814
 
815
  # Dr. Checklist - Quality assurance
816
- logger.info(
817
- "✅ Dr. Checklist performing quality control..."
818
- )
819
- checklist_rep = self.agents[AgentRole.CHECKLIST].run(
820
- panel_conversation.get_str()
821
- )
822
- panel_conversation.add(
823
- self.agents[AgentRole.CHECKLIST].agent_name,
824
- checklist_rep,
825
- )
826
-
827
- # Consensus Coordinator - Final decision synthesis
828
- logger.info(
829
- "🤝 Consensus Coordinator synthesizing panel decision..."
830
- )
831
- consensus_context = panel_conversation.get_str()
832
 
 
 
 
 
 
 
833
  # Add mode-specific constraints to consensus
834
  if self.mode == "budgeted" and remaining_budget <= 0:
835
- consensus_context += "\n\nBUDGET CONSTRAINT: Budget exceeded - must either ask questions or provide final diagnosis."
836
 
837
- consensus_response = self.agents[AgentRole.CONSENSUS].run(
838
- consensus_context
839
- )
840
- logger.debug(
841
- f"Raw consensus response: {consensus_response}"
842
  )
843
 
844
- # Extract the actual text content from agent response
845
- if hasattr(consensus_response, "content"):
846
- response_text = consensus_response.content
847
- elif isinstance(consensus_response, str):
848
- response_text = consensus_response
849
- else:
850
- response_text = str(consensus_response)
851
-
852
- action_dict = self._parse_json_response(response_text)
853
-
854
  # Validate action based on mode constraints
855
  action = Action(**action_dict)
856
- if (
857
- self.mode == "question_only"
858
- and action.action_type == "test"
859
- ):
860
- logger.warning(
861
- "Test ordering attempted in question-only mode, converting to ask action"
862
- )
863
- action.action_type = "ask"
864
- action.content = "Can you provide more details about the patient's symptoms and history?"
865
- action.reasoning = (
866
- "Mode constraint: question-only mode active"
867
- )
868
-
869
- if (
870
- self.mode == "budgeted"
871
- and action.action_type == "test"
872
- and remaining_budget <= 0
873
- ):
874
- logger.warning(
875
- "Test ordering attempted with insufficient budget, converting to diagnose action"
876
- )
877
- action.action_type = "diagnose"
878
- action.content = (
879
- self.differential_diagnosis.split("\n")[0]
880
- if "\n" in self.differential_diagnosis
881
- else self.differential_diagnosis
882
- )
883
- action.reasoning = "Budget constraint: insufficient funds for additional testing"
884
 
885
  return action
886
 
@@ -892,6 +1364,97 @@ class MaiDxOrchestrator:
892
  content="Could you please provide more information about the patient's current condition?",
893
  reasoning=f"Fallback due to panel deliberation error: {str(e)}",
894
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
895
 
896
  def _interact_with_gatekeeper(
897
  self, action: Action, full_case_details: str
@@ -951,7 +1514,7 @@ class MaiDxOrchestrator:
951
  ground_truth_diagnosis: str,
952
  ) -> DiagnosisResult:
953
  """
954
- Executes the full sequential diagnostic process.
955
 
956
  Args:
957
  initial_case_info (str): The initial abstract of the case.
@@ -962,13 +1525,22 @@ class MaiDxOrchestrator:
962
  DiagnosisResult: An object containing the final diagnosis, evaluation, cost, and history.
963
  """
964
  start_time = time.time()
 
 
 
 
 
 
 
 
 
 
965
  self.conversation.add(
966
  "Gatekeeper",
967
  f"Initial Case Information: {initial_case_info}",
968
  )
 
969
 
970
- # Add initial physician visit cost
971
- self.cumulative_cost += self.physician_visit_cost
972
  logger.info(
973
  f"Initial physician visit cost: ${self.physician_visit_cost}"
974
  )
@@ -978,16 +1550,18 @@ class MaiDxOrchestrator:
978
 
979
  for i in range(self.max_iterations):
980
  iteration_count = i + 1
 
 
981
  logger.info(
982
  f"--- Starting Diagnostic Loop {iteration_count}/{self.max_iterations} ---"
983
  )
984
  logger.info(
985
- f"Current cost: ${self.cumulative_cost:,} | Remaining budget: ${self.initial_budget - self.cumulative_cost:,}"
986
  )
987
 
988
  try:
989
- # Panel deliberates to decide on the next action
990
- action = self._run_panel_deliberation()
991
  logger.info(
992
  f"⚕️ Panel decision: {action.action_type.upper()} -> {action.content}"
993
  )
@@ -995,6 +1569,9 @@ class MaiDxOrchestrator:
995
  f"💭 Medical reasoning: {action.reasoning}"
996
  )
997
 
 
 
 
998
  if action.action_type == "diagnose":
999
  final_diagnosis = action.content
1000
  logger.info(
@@ -1002,7 +1579,7 @@ class MaiDxOrchestrator:
1002
  )
1003
  break
1004
 
1005
- # Handle mode-specific constraints
1006
  if (
1007
  self.mode == "question_only"
1008
  and action.action_type == "test"
@@ -1021,7 +1598,7 @@ class MaiDxOrchestrator:
1021
  action.content
1022
  )
1023
  if (
1024
- self.cumulative_cost + estimated_test_cost
1025
  > self.initial_budget
1026
  ):
1027
  logger.warning(
@@ -1034,16 +1611,21 @@ class MaiDxOrchestrator:
1034
  action, full_case_details
1035
  )
1036
  self.conversation.add("Gatekeeper", response)
 
1037
 
1038
- # Update costs based on action type
1039
  if action.action_type == "test":
1040
  test_cost = self._estimate_cost(action.content)
1041
- self.cumulative_cost += test_cost
 
 
 
1042
  logger.info(f"Tests ordered: {action.content}")
1043
  logger.info(
1044
- f"Test cost: ${test_cost:,} | Cumulative cost: ${self.cumulative_cost:,}"
1045
  )
1046
  elif action.action_type == "ask":
 
1047
  # Questions are part of the same visit until tests are ordered
1048
  logger.info(f"Questions asked: {action.content}")
1049
  logger.info(
@@ -1053,17 +1635,13 @@ class MaiDxOrchestrator:
1053
  # Check budget constraints for budgeted mode
1054
  if (
1055
  self.mode == "budgeted"
1056
- and self.cumulative_cost >= self.initial_budget
1057
  ):
1058
  logger.warning(
1059
  "Budget limit reached. Forcing final diagnosis."
1060
  )
1061
- # Use current differential diagnosis or make best guess
1062
- final_diagnosis = (
1063
- self.differential_diagnosis.split("\n")[0]
1064
- if "\n" in self.differential_diagnosis
1065
- else "Diagnosis not reached within budget constraints."
1066
- )
1067
  break
1068
 
1069
  except Exception as e:
@@ -1075,11 +1653,9 @@ class MaiDxOrchestrator:
1075
 
1076
  else:
1077
  # Max iterations reached without diagnosis
1078
- final_diagnosis = (
1079
- self.differential_diagnosis.split("\n")[0]
1080
- if "\n" in self.differential_diagnosis
1081
- else "Diagnosis not reached within maximum iterations."
1082
- )
1083
  logger.warning(
1084
  f"Max iterations ({self.max_iterations}) reached. Using best available diagnosis."
1085
  )
@@ -1115,7 +1691,7 @@ class MaiDxOrchestrator:
1115
  ground_truth=ground_truth_diagnosis,
1116
  accuracy_score=judgement["score"],
1117
  accuracy_reasoning=judgement["reasoning"],
1118
- total_cost=self.cumulative_cost,
1119
  iterations=iteration_count,
1120
  conversation_history=self.conversation.get_str(),
1121
  )
@@ -1124,7 +1700,7 @@ class MaiDxOrchestrator:
1124
  logger.info(f" Final diagnosis: {final_diagnosis}")
1125
  logger.info(f" Ground truth: {ground_truth_diagnosis}")
1126
  logger.info(f" Accuracy score: {judgement['score']}/5.0")
1127
- logger.info(f" Total cost: ${self.cumulative_cost:,}")
1128
  logger.info(f" Iterations: {iteration_count}")
1129
 
1130
  return result
@@ -1406,159 +1982,159 @@ def run_mai_dxo_demo(
1406
  return results
1407
 
1408
 
1409
- # if __name__ == "__main__":
1410
- # # Example case inspired by the paper's Figure 1
1411
- # initial_info = (
1412
- # "A 29-year-old woman was admitted to the hospital because of sore throat and peritonsillar swelling "
1413
- # "and bleeding. Symptoms did not abate with antimicrobial therapy."
1414
- # )
1415
-
1416
- # full_case = """
1417
- # Patient: 29-year-old female.
1418
- # History: Onset of sore throat 7 weeks prior to admission. Worsening right-sided pain and swelling.
1419
- # No fevers, headaches, or gastrointestinal symptoms. Past medical history is unremarkable. No history of smoking or significant alcohol use.
1420
- # Physical Exam: Right peritonsillar mass, displacing the uvula. No other significant findings.
1421
- # Initial Labs: FBC, clotting studies normal.
1422
- # MRI Neck: Showed a large, enhancing mass in the right peritonsillar space.
1423
- # Biopsy (H&E): Infiltrative round-cell neoplasm with high nuclear-to-cytoplasmic ratio and frequent mitotic figures.
1424
- # Biopsy (Immunohistochemistry for Carcinoma): CD31, D2-40, CD34, ERG, GLUT-1, pan-cytokeratin, CD45, CD20, CD3 all negative. Ki-67: 60% nuclear positivity.
1425
- # Biopsy (Immunohistochemistry for Rhabdomyosarcoma): Desmin and MyoD1 diffusely positive. Myogenin multifocally positive.
1426
- # Biopsy (FISH): No FOXO1 (13q14) rearrangements detected.
1427
- # Final Diagnosis from Pathology: Embryonal rhabdomyosarcoma of the pharynx.
1428
- # """
1429
-
1430
- # ground_truth = "Embryonal rhabdomyosarcoma of the pharynx"
1431
-
1432
- # # --- Demonstrate Different MAI-DxO Variants ---
1433
- # try:
1434
- # print("\n" + "=" * 80)
1435
- # print(
1436
- # " MAI DIAGNOSTIC ORCHESTRATOR (MAI-DxO) - SEQUENTIAL DIAGNOSIS BENCHMARK"
1437
- # )
1438
- # print(
1439
- # " Implementation based on the NEJM Research Paper"
1440
- # )
1441
- # print("=" * 80)
1442
-
1443
- # # Test different variants as described in the paper
1444
- # variants_to_test = [
1445
- # (
1446
- # "no_budget",
1447
- # "Standard MAI-DxO with no budget constraints",
1448
- # ),
1449
- # ("budgeted", "Budget-constrained MAI-DxO ($3000 limit)"),
1450
- # (
1451
- # "question_only",
1452
- # "Question-only variant (no diagnostic tests)",
1453
- # ),
1454
- # ]
1455
-
1456
- # results = {}
1457
-
1458
- # for variant_name, description in variants_to_test:
1459
- # print(f"\n{'='*60}")
1460
- # print(f"Testing Variant: {variant_name.upper()}")
1461
- # print(f"Description: {description}")
1462
- # print("=" * 60)
1463
-
1464
- # # Create the variant
1465
- # if variant_name == "budgeted":
1466
- # orchestrator = MaiDxOrchestrator.create_variant(
1467
- # variant_name,
1468
- # budget=3000,
1469
- # model_name="gpt-4.1",
1470
- # max_iterations=5,
1471
- # )
1472
- # else:
1473
- # orchestrator = MaiDxOrchestrator.create_variant(
1474
- # variant_name,
1475
- # model_name="gpt-4.1",
1476
- # max_iterations=5,
1477
- # )
1478
-
1479
- # # Run the diagnostic process
1480
- # result = orchestrator.run(
1481
- # initial_case_info=initial_info,
1482
- # full_case_details=full_case,
1483
- # ground_truth_diagnosis=ground_truth,
1484
- # )
1485
-
1486
- # results[variant_name] = result
1487
-
1488
- # # Display results
1489
- # print(f"\n🚀 Final Diagnosis: {result.final_diagnosis}")
1490
- # print(f"🎯 Ground Truth: {result.ground_truth}")
1491
- # print(f"⭐ Accuracy Score: {result.accuracy_score}/5.0")
1492
- # print(f" Reasoning: {result.accuracy_reasoning}")
1493
- # print(f"💰 Total Cost: ${result.total_cost:,}")
1494
- # print(f"🔄 Iterations: {result.iterations}")
1495
- # print(f"⏱️ Mode: {orchestrator.mode}")
1496
-
1497
- # # Demonstrate ensemble approach
1498
- # print(f"\n{'='*60}")
1499
- # print("Testing Variant: ENSEMBLE")
1500
- # print(
1501
- # "Description: Multiple independent runs with consensus aggregation"
1502
- # )
1503
- # print("=" * 60)
1504
-
1505
- # ensemble_orchestrator = MaiDxOrchestrator.create_variant(
1506
- # "ensemble",
1507
- # model_name="gpt-4.1",
1508
- # max_iterations=3, # Shorter iterations for ensemble
1509
- # )
1510
-
1511
- # ensemble_result = ensemble_orchestrator.run_ensemble(
1512
- # initial_case_info=initial_info,
1513
- # full_case_details=full_case,
1514
- # ground_truth_diagnosis=ground_truth,
1515
- # num_runs=2, # Reduced for demo
1516
- # )
1517
-
1518
- # results["ensemble"] = ensemble_result
1519
-
1520
- # print(
1521
- # f"\n🚀 Ensemble Diagnosis: {ensemble_result.final_diagnosis}"
1522
- # )
1523
- # print(f"🎯 Ground Truth: {ensemble_result.ground_truth}")
1524
- # print(
1525
- # f"⭐ Ensemble Score: {ensemble_result.accuracy_score}/5.0"
1526
- # )
1527
- # print(
1528
- # f"💰 Total Ensemble Cost: ${ensemble_result.total_cost:,}"
1529
- # )
1530
-
1531
- # # --- Summary Comparison ---
1532
- # print(f"\n{'='*80}")
1533
- # print(" RESULTS SUMMARY")
1534
- # print("=" * 80)
1535
- # print(
1536
- # f"{'Variant':<15} {'Diagnosis Match':<15} {'Score':<8} {'Cost':<12} {'Iterations':<12}"
1537
- # )
1538
- # print("-" * 80)
1539
-
1540
- # for variant_name, result in results.items():
1541
- # match_status = (
1542
- # "✓ Match"
1543
- # if result.accuracy_score >= 4.0
1544
- # else "✗ No Match"
1545
- # )
1546
- # print(
1547
- # f"{variant_name:<15} {match_status:<15} {result.accuracy_score:<8.1f} ${result.total_cost:<11,} {result.iterations:<12}"
1548
- # )
1549
-
1550
- # print(f"\n{'='*80}")
1551
- # print(
1552
- # "Implementation successfully demonstrates the MAI-DxO framework"
1553
- # )
1554
- # print(
1555
- # "as described in 'Sequential Diagnosis with Language Models' paper"
1556
- # )
1557
- # print("=" * 80)
1558
-
1559
- # except Exception as e:
1560
- # logger.exception(
1561
- # f"An error occurred during the diagnostic session: {e}"
1562
- # )
1563
- # print(f"\n❌ Error occurred: {e}")
1564
- # print("Please check your model configuration and API keys.")
 
18
 
19
  Example Usage:
20
  # Standard MAI-DxO usage
21
+ orchestrator = MaiDxOrchestrator(model_name="gpt-4o")
22
  result = orchestrator.run(initial_case_info, full_case_details, ground_truth)
23
 
24
  # Budget-constrained variant
 
33
  import json
34
  import sys
35
  import time
36
+ from dataclasses import dataclass, field
37
  from enum import Enum
38
  from typing import Any, Dict, List, Union, Literal
39
 
40
  from loguru import logger
41
  from pydantic import BaseModel, Field
42
  from swarms import Agent, Conversation
43
+ from dotenv import load_dotenv
44
+
45
+ load_dotenv()
46
 
47
  # Configure Loguru with beautiful formatting and features
48
  logger.remove() # Remove default handler
 
94
  JUDGE = "Judge"
95
 
96
 
97
+ @dataclass
98
+ class CaseState:
99
+ """Structured state management for diagnostic process - addresses Category 2.1"""
100
+ initial_vignette: str
101
+ evidence_log: List[str] = field(default_factory=list)
102
+ differential_diagnosis: Dict[str, float] = field(default_factory=dict)
103
+ tests_performed: List[str] = field(default_factory=list)
104
+ questions_asked: List[str] = field(default_factory=list)
105
+ cumulative_cost: int = 0
106
+ iteration: int = 0
107
+ last_actions: List['Action'] = field(default_factory=list) # For stagnation detection
108
+
109
+ def add_evidence(self, evidence: str):
110
+ """Add new evidence to the case"""
111
+ self.evidence_log.append(f"[Turn {self.iteration}] {evidence}")
112
+
113
+ def update_differential(self, diagnosis_dict: Dict[str, float]):
114
+ """Update differential diagnosis probabilities"""
115
+ self.differential_diagnosis.update(diagnosis_dict)
116
+
117
+ def add_test(self, test_name: str):
118
+ """Record a test that was performed"""
119
+ self.tests_performed.append(test_name)
120
+
121
+ def add_question(self, question: str):
122
+ """Record a question that was asked"""
123
+ self.questions_asked.append(question)
124
+
125
+ def is_stagnating(self, new_action: 'Action') -> bool:
126
+ """Detect if the system is stuck in a loop - addresses Category 1.2"""
127
+ if len(self.last_actions) < 2:
128
+ return False
129
+
130
+ # Check if the new action is identical to recent ones
131
+ for last_action in self.last_actions[-2:]:
132
+ if (last_action.action_type == new_action.action_type and
133
+ last_action.content == new_action.content):
134
+ return True
135
+ return False
136
+
137
+ def add_action(self, action: 'Action'):
138
+ """Add action to history and maintain sliding window"""
139
+ self.last_actions.append(action)
140
+ if len(self.last_actions) > 3: # Keep only last 3 actions
141
+ self.last_actions.pop(0)
142
+
143
+ def get_max_confidence(self) -> float:
144
+ """Get the maximum confidence from differential diagnosis"""
145
+ if not self.differential_diagnosis:
146
+ return 0.0
147
+ return max(self.differential_diagnosis.values())
148
+
149
+ def get_leading_diagnosis(self) -> str:
150
+ """Get the diagnosis with highest confidence"""
151
+ if not self.differential_diagnosis:
152
+ return "No diagnosis formulated"
153
+ return max(self.differential_diagnosis.items(), key=lambda x: x[1])[0]
154
+
155
+ def summarize_evidence(self) -> str:
156
+ """Create a concise summary of evidence for token efficiency"""
157
+ if len(self.evidence_log) <= 5:
158
+ return "\n".join(self.evidence_log)
159
+
160
+ # Keep first 2 and last 3 entries, summarize middle
161
+ summary_parts = []
162
+ summary_parts.extend(self.evidence_log[:2])
163
+
164
+ if len(self.evidence_log) > 5:
165
+ middle_count = len(self.evidence_log) - 5
166
+ summary_parts.append(f"[... {middle_count} additional findings ...]")
167
+
168
+ summary_parts.extend(self.evidence_log[-3:])
169
+ return "\n".join(summary_parts)
170
+
171
+
172
+ @dataclass
173
+ class DeliberationState:
174
+ """Structured state for panel deliberation - addresses Category 1.1"""
175
+ hypothesis_analysis: str = ""
176
+ test_chooser_analysis: str = ""
177
+ challenger_analysis: str = ""
178
+ stewardship_analysis: str = ""
179
+ checklist_analysis: str = ""
180
+ situational_context: str = ""
181
+ stagnation_detected: bool = False
182
+ retry_count: int = 0
183
+
184
+ def to_consensus_prompt(self) -> str:
185
+ """Generate a structured prompt for the consensus coordinator"""
186
+ prompt = f"""
187
+ You are the Consensus Coordinator. Here is the summary of the panel's deliberation for this turn:
188
+
189
+ **Current Differential Diagnosis (from Dr. Hypothesis):**
190
+ {self.hypothesis_analysis}
191
+
192
+ **Recommended Tests (from Dr. Test-Chooser):**
193
+ {self.test_chooser_analysis}
194
+
195
+ **Identified Biases & Challenges (from Dr. Challenger):**
196
+ {self.challenger_analysis}
197
+
198
+ **Cost & Stewardship Concerns (from Dr. Stewardship):**
199
+ {self.stewardship_analysis}
200
+
201
+ **Quality Control Assessment (from Dr. Checklist):**
202
+ {self.checklist_analysis}
203
+ """
204
+
205
+ if self.stagnation_detected:
206
+ prompt += f"""
207
+ **CRITICAL INTERVENTION: STAGNATION DETECTED**
208
+ The panel is stalled. You MUST propose a different and more decisive action.
209
+ If you cannot find a new test or question, you must move to a final diagnosis.
210
+ """
211
+
212
+ if self.situational_context:
213
+ prompt += f"""
214
+ **SITUATIONAL CONTEXT:**
215
+ {self.situational_context}
216
+ """
217
+
218
+ prompt += "\nBased on this synthesized input, provide your single best action in the required JSON format."
219
+ return prompt
220
+
221
+
222
  @dataclass
223
  class DiagnosisResult:
224
  """Stores the final result of a diagnostic session."""
 
255
  Implements the MAI Diagnostic Orchestrator (MAI-DxO) framework.
256
  This class orchestrates a virtual panel of AI agents to perform sequential medical diagnosis,
257
  evaluates the final diagnosis, and tracks costs.
258
+
259
+ Enhanced with structured deliberation and proper state management as per research paper.
260
  """
261
 
262
  def __init__(
263
  self,
264
+ model_name: str = "gpt-4.1", # Updated to GPT-4.1 as requested (GPT-4 Turbo)
265
  max_iterations: int = 10,
266
  initial_budget: int = 10000,
267
  mode: str = "no_budget", # "instant", "question_only", "budgeted", "no_budget", "ensemble"
 
269
  enable_budget_tracking: bool = False,
270
  ):
271
  """
272
+ Initializes the MAI-DxO system with improved architecture.
273
 
274
  Args:
275
  model_name (str): The language model to be used by all agents.
 
291
  self.conversation = Conversation(
292
  time_enabled=True, autosave=False, save_enabled=False
293
  )
294
+
295
+ # Initialize case state for structured state management
296
+ self.case_state = None
297
 
298
  # Enhanced cost model based on the paper's methodology
299
  self.test_cost_db = {
 
331
  f"🏥 MAI Diagnostic Orchestrator initialized successfully in '{mode}' mode with budget ${initial_budget:,}"
332
  )
333
 
334
+ def _get_agent_max_tokens(self, role: AgentRole) -> int:
335
+ """Get max_tokens for each agent based on their role - addresses token optimization"""
336
+ token_limits = {
337
+ AgentRole.HYPOTHESIS: 800, # Needs space for differential diagnosis
338
+ AgentRole.TEST_CHOOSER: 600, # Test recommendations
339
+ AgentRole.CHALLENGER: 700, # Bias identification and alternatives
340
+ AgentRole.STEWARDSHIP: 500, # Cost analysis
341
+ AgentRole.CHECKLIST: 400, # Brief validation
342
+ AgentRole.CONSENSUS: 300, # Just JSON output
343
+ AgentRole.GATEKEEPER: 1000, # Detailed clinical findings
344
+ AgentRole.JUDGE: 600, # Scoring and reasoning
345
+ }
346
+ return token_limits.get(role, 500)
347
+
348
  def _init_agents(self) -> None:
349
  """Initializes all required agents with their specific roles and prompts."""
350
  self.agents = {
 
357
  "json" if role == AgentRole.CONSENSUS else "str"
358
  ),
359
  print_on=True, # Enable printing for all agents to see outputs
360
+ max_tokens=self._get_agent_max_tokens(role), # Role-specific token limits
361
  )
362
  for role in AgentRole
363
  }
 
365
  f"👥 {len(self.agents)} virtual physician agents initialized and ready for consultation"
366
  )
367
 
368
def _get_dynamic_context(self, role: AgentRole, case_state: CaseState) -> str:
    """Build the role-specific situational preamble for an agent prompt.

    Only three situations produce text; every other combination yields "":
      * Stewardship, when fewer than 1000 budget units remain.
      * Hypothesis, when the best differential probability exceeds 0.75.
      * Consensus, once the case is past its fifth iteration.

    Args:
        role: The agent role the prompt is being built for.
        case_state: Current case state; ``cumulative_cost``,
            ``differential_diagnosis`` and ``iteration`` are read.

    Returns:
        The context block to embed in the prompt, or an empty string.
    """
    budget_left = self.initial_budget - case_state.cumulative_cost

    # Highest probability in the differential; 0 when nothing is tracked yet.
    if case_state.differential_diagnosis:
        top_confidence = max(case_state.differential_diagnosis.values())
    else:
        top_confidence = 0

    if role == AgentRole.STEWARDSHIP and budget_left < 1000:
        return f"""
        **SITUATIONAL CONTEXT: URGENT**
        The remaining budget is critically low (${budget_left}). All recommendations must be focused on maximum cost-effectiveness. Veto any non-essential or high-cost tests.
        """

    if role == AgentRole.HYPOTHESIS and top_confidence > 0.75:
        return f"""
        **SITUATIONAL CONTEXT: FINAL STAGES**
        The panel is converging on a diagnosis (current max confidence: {top_confidence:.0%}). Your primary role now is to confirm the leading hypothesis or state what single piece of evidence is needed to reach >85% confidence.
        """

    if role == AgentRole.CONSENSUS and case_state.iteration > 5:
        return f"""
        **SITUATIONAL CONTEXT: EXTENDED CASE**
        This case has gone through {case_state.iteration} iterations. Focus on decisive actions that will lead to a definitive diagnosis rather than additional exploratory steps.
        """

    # No special situation applies to this role right now.
    return ""
396
+
397
def _get_prompt_for_role(self, role: AgentRole, case_state: Union[CaseState, None] = None) -> str:
    """Return the system prompt for an agent role, with situational context.

    When ``case_state`` is supplied, ``_get_dynamic_context`` is asked for a
    role-specific preamble. That preamble is embedded at the top of the
    Hypothesis, Stewardship and Consensus templates, which are the three roles
    ``_get_dynamic_context`` can produce text for.

    Args:
        role: The agent role whose prompt is requested.
        case_state: Current case state, or None to get the static prompt.

    Returns:
        The complete system prompt text for ``role``.
    """
    # Situational preamble; stays empty when no case state is available.
    dynamic_context = ""
    if case_state is not None:
        dynamic_context = self._get_dynamic_context(role, case_state)

    base_prompts = {
        AgentRole.HYPOTHESIS: f"""
        {dynamic_context}

        You are Dr. Hypothesis, a specialist in maintaining differential diagnoses. Your role is critical to the diagnostic process.

        CORE RESPONSIBILITIES:
        - Maintain a probability-ranked differential diagnosis with the top 3 most likely conditions
        - Update probabilities using Bayesian reasoning after each new finding
        - Consider both common and rare diseases appropriate to the clinical context
        - Explicitly track how new evidence changes your diagnostic thinking

        APPROACH:
        1. Start with the most likely diagnoses based on presenting symptoms
        2. For each new piece of evidence, consider:
           - How it supports or refutes each hypothesis
           - Whether it suggests new diagnoses to consider
           - How it changes the relative probabilities
        3. Always explain your Bayesian reasoning clearly

        OUTPUT FORMAT:
        Provide your updated differential diagnosis with:
        - Top 3 diagnoses with probability estimates (percentages)
        - Brief rationale for each
        - Key evidence supporting each hypothesis
        - Evidence that contradicts or challenges each hypothesis

        Remember: Your differential drives the entire diagnostic process. Be thorough, evidence-based, and adaptive.
        """,

        AgentRole.TEST_CHOOSER: (
            """
        You are Dr. Test-Chooser, a specialist in diagnostic test selection and information theory.

        CORE RESPONSIBILITIES:
        - Select up to 3 diagnostic tests per round that maximally discriminate between leading hypotheses
        - Optimize for information value, not just clinical reasonableness
        - Consider test characteristics: sensitivity, specificity, positive/negative predictive values
        - Balance diagnostic yield with patient burden and resource utilization

        SELECTION CRITERIA:
        1. Information Value: How much will this test change diagnostic probabilities?
        2. Discriminatory Power: How well does it distinguish between competing hypotheses?
        3. Clinical Impact: Will the result meaningfully alter management?
        4. Sequential Logic: What should we establish first before ordering more complex tests?

        APPROACH:
        - For each proposed test, explicitly state which hypotheses it will help confirm or exclude
        - Consider both positive and negative results and their implications
        - Think about test sequences (e.g., basic labs before advanced imaging)
        - Avoid redundant tests that won't add new information

        OUTPUT FORMAT:
        For each recommended test:
        - Test name (be specific)
        - Primary hypotheses it will help evaluate
        - Expected information gain
        - How results will change management decisions

        Focus on tests that will most efficiently narrow the differential diagnosis.
        """
        ),
        AgentRole.CHALLENGER: (
            """
        You are Dr. Challenger, the critical thinking specialist and devil's advocate.

        CORE RESPONSIBILITIES:
        - Identify and challenge cognitive biases in the diagnostic process
        - Highlight contradictory evidence that might be overlooked
        - Propose alternative hypotheses and falsifying tests
        - Guard against premature diagnostic closure

        COGNITIVE BIASES TO WATCH FOR:
        1. Anchoring: Over-reliance on initial impressions
        2. Confirmation bias: Seeking only supporting evidence
        3. Availability bias: Overestimating probability of recently seen conditions
        4. Representativeness: Ignoring base rates and prevalence
        5. Search satisficing: Stopping at "good enough" explanations

        YOUR APPROACH:
        - Ask "What else could this be?" and "What doesn't fit?"
        - Challenge assumptions and look for alternative explanations
        - Propose tests that could disprove the leading hypothesis
        - Consider rare diseases when common ones don't fully explain the picture
        - Advocate for considering multiple conditions simultaneously

        OUTPUT FORMAT:
        - Specific biases you've identified in the current reasoning
        - Evidence that contradicts the leading hypotheses
        - Alternative diagnoses to consider
        - Tests that could falsify current assumptions
        - Red flags or concerning patterns that need attention

        Be constructively critical - your role is to strengthen diagnostic accuracy through rigorous challenge.
        """
        ),
        # Stewardship embeds the dynamic context as well: _get_dynamic_context
        # emits an urgent low-budget notice for this role, which would be lost
        # if the template did not interpolate it.
        AgentRole.STEWARDSHIP: f"""
        {dynamic_context}

        You are Dr. Stewardship, the resource optimization and cost-effectiveness specialist.

        CORE RESPONSIBILITIES:
        - Enforce cost-conscious, high-value care
        - Advocate for cheaper alternatives when diagnostically equivalent
        - Challenge low-yield, expensive tests
        - Balance diagnostic thoroughness with resource stewardship

        COST-VALUE FRAMEWORK:
        1. High-Value Tests: Low cost, high diagnostic yield, changes management
        2. Moderate-Value Tests: Moderate cost, specific indication, incremental value
        3. Low-Value Tests: High cost, low yield, minimal impact on decisions
        4. No-Value Tests: Any cost, no diagnostic value, ordered out of habit

        ALTERNATIVE STRATEGIES:
        - Could patient history/physical exam provide this information?
        - Is there a less expensive test with similar diagnostic value?
        - Can we use a staged approach (cheap test first, expensive if needed)?
        - Does the test result actually change management?

        YOUR APPROACH:
        - Review all proposed tests for necessity and value
        - Suggest cost-effective alternatives
        - Question tests that don't clearly advance diagnosis
        - Advocate for asking questions before ordering expensive tests
        - Consider the cumulative cost burden

        OUTPUT FORMAT:
        - Assessment of proposed tests (high/moderate/low/no value)
        - Specific cost-effective alternatives
        - Questions that might obviate need for testing
        - Recommended modifications to testing strategy
        - Cumulative cost considerations

        Your goal: Maximum diagnostic accuracy at minimum necessary cost.
        """,
        AgentRole.CHECKLIST: (
            """
        You are Dr. Checklist, the quality assurance and consistency specialist.

        CORE RESPONSIBILITIES:
        - Perform silent quality control on all panel deliberations
        - Ensure test names are valid and properly specified
        - Check internal consistency of reasoning across panel members
        - Flag logical errors or contradictions in the diagnostic approach

        QUALITY CHECKS:
        1. Test Validity: Are proposed tests real and properly named?
        2. Logical Consistency: Do the recommendations align with the differential?
        3. Evidence Integration: Are all findings being considered appropriately?
        4. Process Adherence: Is the panel following proper diagnostic methodology?
        5. Safety Checks: Are any critical possibilities being overlooked?

        SPECIFIC VALIDATIONS:
        - Test names match standard medical terminology
        - Proposed tests are appropriate for the clinical scenario
        - No contradictions between different panel members' reasoning
        - All significant findings are being addressed
        - No gaps in the diagnostic logic

        OUTPUT FORMAT:
        - Brief validation summary (✓ Clear / ⚠ Issues noted)
        - Any test name corrections needed
        - Logical inconsistencies identified
        - Missing considerations or gaps
        - Process improvement suggestions

        Keep your feedback concise but comprehensive. Flag any issues that could compromise diagnostic quality.
        """
        ),
        # Literal braces of the JSON example are doubled because this is an f-string.
        AgentRole.CONSENSUS: f"""
        {dynamic_context}

        You are the Consensus Coordinator, responsible for synthesizing the virtual panel's expertise into a single, optimal decision.

        CORE RESPONSIBILITIES:
        - Integrate input from Dr. Hypothesis, Dr. Test-Chooser, Dr. Challenger, Dr. Stewardship, and Dr. Checklist
        - Decide on the single best next action: 'ask', 'test', or 'diagnose'
        - Balance competing priorities: accuracy, cost, efficiency, and thoroughness
        - Ensure the chosen action advances the diagnostic process optimally

        **PRIORITIZED DECISION FRAMEWORK:**
        Use the following prioritized framework to make your decision:

        1. **Certainty Threshold:** If Dr. Hypothesis's leading diagnosis has confidence >85% AND Dr. Challenger raises no major objections, your action MUST be `diagnose`.
        2. **Address Red Flags:** If Dr. Challenger identifies a critical bias or contradictory evidence, your next action MUST be a test or question that directly addresses that challenge.
        3. **High-Value Information:** Otherwise, select the test from Dr. Test-Chooser that offers the highest information gain.
        4. **Cost Optimization:** Before finalizing a test, check Dr. Stewardship's input. If a diagnostically equivalent but cheaper alternative is available, select it.
        5. **Default to Questions:** If no test meets the criteria or the budget is a major concern, select the most pertinent question to ask.

        OUTPUT REQUIREMENTS:
        Provide a JSON object with this exact structure:
        {{
            "action_type": "ask" | "test" | "diagnose",
            "content": "specific question(s), test name(s), or final diagnosis",
            "reasoning": "clear justification synthesizing panel input and citing decision framework step"
        }}

        For action_type "ask": content should be specific patient history or physical exam questions
        For action_type "test": content should be properly named diagnostic tests (up to 3)
        For action_type "diagnose": content should be the complete, specific final diagnosis

        Make the decision that best advances accurate, cost-effective diagnosis.
        """,
        AgentRole.GATEKEEPER: (
            """
        You are the Gatekeeper, the clinical information oracle with complete access to the patient case file.

        CORE RESPONSIBILITIES:
        - Provide objective, specific clinical findings when explicitly requested
        - Serve as the authoritative source for all patient information
        - Generate realistic synthetic findings for tests not in the original case
        - Maintain clinical realism while preventing information leakage

        RESPONSE PRINCIPLES:
        1. OBJECTIVITY: Provide only factual findings, never interpretations or impressions
        2. SPECIFICITY: Give precise, detailed results when tests are properly ordered
        3. REALISM: Ensure all responses reflect realistic clinical scenarios
        4. NO HINTS: Never provide diagnostic clues or suggestions
        5. CONSISTENCY: Maintain coherence across all provided information

        HANDLING REQUESTS:
        - Patient History Questions: Provide relevant history from case file or realistic details
        - Physical Exam: Give specific examination findings as would be documented
        - Diagnostic Tests: Provide exact results as specified or realistic synthetic values
        - Vague Requests: Politely ask for more specific queries
        - Invalid Requests: Explain why the request cannot be fulfilled

        SYNTHETIC FINDINGS GUIDELINES:
        When generating findings not in the original case:
        - Ensure consistency with established diagnosis and case details
        - Use realistic reference ranges and values
        - Maintain clinical plausibility
        - Avoid pathognomonic findings unless specifically diagnostic

        RESPONSE FORMAT:
        - Direct, clinical language
        - Specific measurements with reference ranges when applicable
        - Clear organization of findings
        - Professional medical terminology

        Your role is crucial: provide complete, accurate clinical information while maintaining the challenge of the diagnostic process.
        """
        ),
        AgentRole.JUDGE: (
            """
        You are the Judge, the diagnostic accuracy evaluation specialist.

        CORE RESPONSIBILITIES:
        - Evaluate candidate diagnoses against ground truth using a rigorous clinical rubric
        - Provide fair, consistent scoring based on clinical management implications
        - Consider diagnostic substance over terminology differences
        - Account for acceptable medical synonyms and equivalent formulations

        EVALUATION RUBRIC (5-point Likert scale):

        SCORE 5 (Perfect/Clinically Superior):
        - Clinically identical to reference diagnosis
        - May be more specific than reference (adding relevant detail)
        - No incorrect or unrelated additions
        - Treatment approach would be identical

        SCORE 4 (Mostly Correct - Minor Incompleteness):
        - Core disease correctly identified
        - Minor qualifier or component missing/mis-specified
        - Overall management largely unchanged
        - Clinically appropriate diagnosis

        SCORE 3 (Partially Correct - Major Error):
        - Correct general disease category
        - Major error in etiology, anatomic site, or critical specificity
        - Would significantly alter workup or prognosis
        - Partially correct but clinically concerning gaps

        SCORE 2 (Largely Incorrect):
        - Shares only superficial features with correct diagnosis
        - Wrong fundamental disease process
        - Would misdirect clinical workup
        - Partially contradicts case details

        SCORE 1 (Completely Incorrect):
        - No meaningful overlap with correct diagnosis
        - Wrong organ system or disease category
        - Would likely lead to harmful care
        - Completely inconsistent with clinical presentation

        EVALUATION PROCESS:
        1. Compare core disease entity
        2. Assess etiology/causative factors
        3. Evaluate anatomic specificity
        4. Consider diagnostic completeness
        5. Judge clinical management implications

        OUTPUT FORMAT:
        - Score (1-5) with clear label
        - Detailed justification referencing specific rubric criteria
        - Explanation of how diagnosis would affect clinical management
        - Note any acceptable medical synonyms or equivalent terminology

        Maintain high standards while recognizing legitimate diagnostic variability in medical practice.
        """
        ),
    }

    # Roles without an updated template fall back to the original prompt,
    # prefixed with whatever dynamic context applies.
    if role not in base_prompts:
        return dynamic_context + self._get_original_prompt_for_role(role)

    return base_prompts[role]
713
+
714
+ def _get_original_prompt_for_role(self, role: AgentRole) -> str:
715
+ """Returns original system prompts for roles not yet updated"""
716
  prompts = {
717
  AgentRole.HYPOTHESIS: (
718
  """
 
1019
  }
1020
  return prompts[role]
1021
 
1022
+ def _parse_json_response(self, response: str, retry_count: int = 0) -> Dict[str, Any]:
1023
+ """Safely parses a JSON string with retry logic - addresses Category 3.2"""
1024
  try:
1025
  # Extract the actual response content from the agent response
1026
  if isinstance(response, str):
 
1094
  logger.debug(
1095
  f"Response content: {response[:500]}..."
1096
  ) # Log first 500 chars
1097
+
1098
+ # Return the error for potential retry instead of immediately falling back
1099
+ raise e
1100
+
1101
+ def _parse_json_with_retry(self, consensus_agent: Agent, consensus_prompt: str, max_retries: int = 2) -> Dict[str, Any]:
1102
+ """Parse JSON with retry logic for robustness - addresses Category 3.2"""
1103
+ for attempt in range(max_retries + 1):
1104
+ try:
1105
+ if attempt == 0:
1106
+ response = consensus_agent.run(consensus_prompt)
1107
+ else:
1108
+ # Retry with error feedback
1109
+ retry_prompt = consensus_prompt + f"""
1110
+
1111
+ **RETRY REQUIRED - ATTEMPT {attempt + 1}**
1112
+ Your previous response could not be parsed as JSON. Please ensure your response contains ONLY a valid JSON object in this exact format:
1113
+ ```json
1114
+ {{
1115
+ "action_type": "ask" | "test" | "diagnose",
1116
+ "content": "your content here",
1117
+ "reasoning": "your reasoning here"
1118
+ }}
1119
+ ```
1120
+ """
1121
+ response = consensus_agent.run(retry_prompt)
1122
+
1123
+ # Extract the actual text content from agent response
1124
+ if hasattr(response, "content"):
1125
+ response_text = response.content
1126
+ elif isinstance(response, str):
1127
+ response_text = response
1128
+ else:
1129
+ response_text = str(response)
1130
+
1131
+ return self._parse_json_response(response_text, attempt)
1132
+
1133
+ except Exception as e:
1134
+ logger.warning(f"JSON parsing attempt {attempt + 1} failed: {e}")
1135
+ if attempt == max_retries:
1136
+ # Final fallback after all retries
1137
+ logger.error("All JSON parsing attempts failed, using fallback")
1138
+ return {
1139
+ "action_type": "ask",
1140
+ "content": "Could you please clarify the next best step? The previous analysis was inconclusive.",
1141
+ "reasoning": f"Fallback due to JSON parsing error after {max_retries + 1} attempts.",
1142
+ }
1143
+
1144
+ # Should never reach here, but just in case
1145
+ return {
1146
+ "action_type": "ask",
1147
+ "content": "Please provide more information about the patient's condition.",
1148
+ "reasoning": "Unexpected fallback in JSON parsing.",
1149
+ }
1150
 
1151
  def _estimate_cost(self, tests: Union[List[str], str]) -> int:
1152
  """Estimates the cost of diagnostic tests."""
 
1235
 
1236
  return cost
1237
 
1238
+ def _run_panel_deliberation(self, case_state: CaseState) -> Action:
1239
+ """Orchestrates one round of structured debate among the virtual panel - addresses Category 1.1"""
1240
  logger.info(
1241
  "🩺 Virtual medical panel deliberation commenced - analyzing patient case"
1242
  )
1243
  logger.debug(
1244
  "Panel members: Dr. Hypothesis, Dr. Test-Chooser, Dr. Challenger, Dr. Stewardship, Dr. Checklist"
1245
  )
 
 
 
1246
 
1247
+ # Initialize structured deliberation state instead of conversational chaining
1248
+ deliberation_state = DeliberationState()
1249
+
1250
+ # Prepare comprehensive but concise case context for each agent
1251
+ remaining_budget = self.initial_budget - case_state.cumulative_cost
1252
  budget_status = (
1253
  "EXCEEDED"
1254
  if remaining_budget < 0
1255
  else f"${remaining_budget:,}"
1256
  )
1257
 
1258
+ # Base context for all agents (token-efficient)
1259
+ base_context = f"""
1260
+ === DIAGNOSTIC CASE STATUS - ROUND {case_state.iteration} ===
1261
+
1262
+ INITIAL PRESENTATION:
1263
+ {case_state.initial_vignette}
1264
+
1265
+ EVIDENCE GATHERED:
1266
+ {case_state.summarize_evidence()}
1267
+
1268
+ CURRENT STATE:
1269
+ - Tests Performed: {', '.join(case_state.tests_performed) if case_state.tests_performed else 'None'}
1270
+ - Questions Asked: {len(case_state.questions_asked)}
1271
+ - Cumulative Cost: ${case_state.cumulative_cost:,}
1272
+ - Remaining Budget: {budget_status}
1273
+ - Mode: {self.mode}
1274
  """
 
1275
 
1276
  # Check mode-specific constraints
1277
  if self.mode == "instant":
1278
  # For instant mode, skip deliberation and go straight to diagnosis
1279
  action_dict = {
1280
  "action_type": "diagnose",
1281
+ "content": case_state.get_leading_diagnosis(),
 
 
 
 
1282
  "reasoning": (
1283
  "Instant diagnosis mode - providing immediate assessment based on initial presentation"
1284
  ),
1285
  }
1286
  return Action(**action_dict)
1287
 
1288
+ # Check for stagnation before running deliberation
1289
+ stagnation_detected = False
1290
+ if len(case_state.last_actions) >= 2:
1291
+ last_action = case_state.last_actions[-1]
1292
+ stagnation_detected = case_state.is_stagnating(last_action)
1293
+ deliberation_state.stagnation_detected = stagnation_detected
1294
+ if stagnation_detected:
1295
+ logger.warning("πŸ”„ Stagnation detected - will force different action")
1296
+
1297
+ # Generate dynamic situational context for all agents
1298
+ deliberation_state.situational_context = self._generate_situational_context(case_state, remaining_budget)
1299
 
1300
+ # Run each specialist agent in parallel-like fashion with structured output
1301
+ # Each agent gets the same base context plus their role-specific dynamic prompt
1302
  try:
1303
  # Dr. Hypothesis - Differential diagnosis and probability assessment
1304
+ logger.info("🧠 Dr. Hypothesis analyzing differential diagnosis...")
1305
+ hypothesis_prompt = self._get_prompt_for_role(AgentRole.HYPOTHESIS, case_state) + "\n\n" + base_context
1306
+ deliberation_state.hypothesis_analysis = self.agents[AgentRole.HYPOTHESIS].run(hypothesis_prompt)
1307
+
1308
+ # Update case state with new differential
1309
+ self._update_differential_from_hypothesis(case_state, deliberation_state.hypothesis_analysis)
 
 
 
 
 
 
 
1310
 
1311
  # Dr. Test-Chooser - Information value optimization
1312
+ logger.info("πŸ”¬ Dr. Test-Chooser selecting optimal tests...")
1313
+ test_chooser_prompt = self._get_prompt_for_role(AgentRole.TEST_CHOOSER, case_state) + "\n\n" + base_context
1314
+ if self.mode == "question_only":
1315
+ test_chooser_prompt += "\n\nIMPORTANT: This is QUESTION-ONLY mode. You may ONLY recommend patient questions, not diagnostic tests."
1316
+ deliberation_state.test_chooser_analysis = self.agents[AgentRole.TEST_CHOOSER].run(test_chooser_prompt)
 
 
 
 
 
1317
 
1318
  # Dr. Challenger - Bias identification and alternative hypotheses
1319
+ logger.info("πŸ€” Dr. Challenger challenging assumptions...")
1320
+ challenger_prompt = self._get_prompt_for_role(AgentRole.CHALLENGER, case_state) + "\n\n" + base_context
1321
+ deliberation_state.challenger_analysis = self.agents[AgentRole.CHALLENGER].run(challenger_prompt)
 
 
 
 
 
 
 
1322
 
1323
  # Dr. Stewardship - Cost-effectiveness analysis
1324
+ logger.info("πŸ’° Dr. Stewardship evaluating cost-effectiveness...")
1325
+ stewardship_prompt = self._get_prompt_for_role(AgentRole.STEWARDSHIP, case_state) + "\n\n" + base_context
 
 
1326
  if self.enable_budget_tracking:
1327
+ stewardship_prompt += f"\n\nBUDGET TRACKING ENABLED - Current cost: ${case_state.cumulative_cost}, Remaining: ${remaining_budget}"
1328
+ deliberation_state.stewardship_analysis = self.agents[AgentRole.STEWARDSHIP].run(stewardship_prompt)
 
 
 
 
 
 
1329
 
1330
  # Dr. Checklist - Quality assurance
1331
+ logger.info("βœ… Dr. Checklist performing quality control...")
1332
+ checklist_prompt = self._get_prompt_for_role(AgentRole.CHECKLIST, case_state) + "\n\n" + base_context
1333
+ deliberation_state.checklist_analysis = self.agents[AgentRole.CHECKLIST].run(checklist_prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
1334
 
1335
+ # Consensus Coordinator - Final decision synthesis using structured state
1336
+ logger.info("🀝 Consensus Coordinator synthesizing panel decision...")
1337
+
1338
+ # Generate the structured consensus prompt
1339
+ consensus_prompt = deliberation_state.to_consensus_prompt()
1340
+
1341
  # Add mode-specific constraints to consensus
1342
  if self.mode == "budgeted" and remaining_budget <= 0:
1343
+ consensus_prompt += "\n\nBUDGET CONSTRAINT: Budget exceeded - must either ask questions or provide final diagnosis."
1344
 
1345
+ # Use improved JSON parsing with retry logic
1346
+ action_dict = self._parse_json_with_retry(
1347
+ self.agents[AgentRole.CONSENSUS],
1348
+ consensus_prompt
 
1349
  )
1350
 
 
 
 
 
 
 
 
 
 
 
1351
  # Validate action based on mode constraints
1352
  action = Action(**action_dict)
1353
+
1354
+ # Apply mode-specific validation and corrections
1355
+ action = self._validate_and_correct_action(action, case_state, remaining_budget)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1356
 
1357
  return action
1358
 
 
1364
  content="Could you please provide more information about the patient's current condition?",
1365
  reasoning=f"Fallback due to panel deliberation error: {str(e)}",
1366
  )
1367
+
1368
+ def _generate_situational_context(self, case_state: CaseState, remaining_budget: int) -> str:
1369
+ """Generate dynamic situational context based on current case state - addresses Category 4.2"""
1370
+ context_parts = []
1371
+
1372
+ # Budget-related context
1373
+ if remaining_budget < 1000:
1374
+ context_parts.append(f"URGENT: Remaining budget critically low (${remaining_budget}). Focus on cost-effective actions.")
1375
+ elif remaining_budget < 2000:
1376
+ context_parts.append(f"WARNING: Budget running low (${remaining_budget}). Prioritize high-value tests.")
1377
+
1378
+ # Diagnostic confidence context
1379
+ max_confidence = case_state.get_max_confidence()
1380
+ if max_confidence > 0.85:
1381
+ context_parts.append(f"FINAL STAGES: High confidence diagnosis available ({max_confidence:.0%}). Consider definitive action.")
1382
+ elif max_confidence > 0.70:
1383
+ context_parts.append(f"CONVERGING: Moderate confidence in leading diagnosis ({max_confidence:.0%}). Focus on confirmation.")
1384
+
1385
+ # Iteration context
1386
+ if case_state.iteration > 7:
1387
+ context_parts.append(f"EXTENDED CASE: {case_state.iteration} rounds completed. Move toward decisive action.")
1388
+ elif case_state.iteration > 5:
1389
+ context_parts.append(f"PROLONGED: {case_state.iteration} rounds. Avoid further exploratory steps unless critical.")
1390
+
1391
+ # Test/cost context
1392
+ if len(case_state.tests_performed) > 5:
1393
+ context_parts.append("EXTENSIVE TESTING: Many tests completed. Focus on synthesis rather than additional testing.")
1394
+
1395
+ return " | ".join(context_parts) if context_parts else ""
1396
+
1397
+ def _update_differential_from_hypothesis(self, case_state: CaseState, hypothesis_analysis: str):
1398
+ """Extract and update differential diagnosis from Dr. Hypothesis analysis"""
1399
+ try:
1400
+ # Simple extraction - look for percentage patterns in the text
1401
+ import re
1402
+
1403
+ # Update the main differential diagnosis for backward compatibility
1404
+ self.differential_diagnosis = hypothesis_analysis
1405
+
1406
+ # Try to extract structured probabilities
1407
+ # Look for patterns like "Diagnosis: 85%" or "Disease (70%)"
1408
+ percentage_pattern = r'([A-Za-z][^:(\n]*?)[\s:]*[\(]?(\d{1,3})%[\)]?'
1409
+ matches = re.findall(percentage_pattern, hypothesis_analysis)
1410
+
1411
+ new_differential = {}
1412
+ for match in matches:
1413
+ diagnosis = match[0].strip().rstrip(':-()').strip()
1414
+ probability = float(match[1]) / 100.0
1415
+ if 0 <= probability <= 1.0 and len(diagnosis) > 3: # Basic validation
1416
+ new_differential[diagnosis] = probability
1417
+
1418
+ if new_differential:
1419
+ case_state.update_differential(new_differential)
1420
+ logger.debug(f"Updated differential: {new_differential}")
1421
+
1422
+ except Exception as e:
1423
+ logger.debug(f"Could not extract structured differential: {e}")
1424
+ # Still update the text version for display
1425
+ self.differential_diagnosis = hypothesis_analysis
1426
+
1427
+ def _validate_and_correct_action(self, action: Action, case_state: CaseState, remaining_budget: int) -> Action:
1428
+ """Validate and correct actions based on mode constraints and context"""
1429
+
1430
+ # Mode-specific validations
1431
+ if self.mode == "question_only" and action.action_type == "test":
1432
+ logger.warning("Test ordering attempted in question-only mode, converting to ask action")
1433
+ action.action_type = "ask"
1434
+ action.content = "Can you provide more details about the patient's symptoms and history?"
1435
+ action.reasoning = "Mode constraint: question-only mode active"
1436
+
1437
+ if self.mode == "budgeted" and action.action_type == "test" and remaining_budget <= 0:
1438
+ logger.warning("Test ordering attempted with insufficient budget, converting to diagnose action")
1439
+ action.action_type = "diagnose"
1440
+ action.content = case_state.get_leading_diagnosis()
1441
+ action.reasoning = "Budget constraint: insufficient funds for additional testing"
1442
+
1443
+ # Stagnation handling
1444
+ if case_state.is_stagnating(action):
1445
+ logger.warning("Stagnation detected, forcing diagnostic decision")
1446
+ action.action_type = "diagnose"
1447
+ action.content = case_state.get_leading_diagnosis()
1448
+ action.reasoning = "Forced diagnosis due to detected stagnation in diagnostic process"
1449
+
1450
+ # High confidence threshold
1451
+ if action.action_type != "diagnose" and case_state.get_max_confidence() > 0.90:
1452
+ logger.info("Very high confidence reached, recommending diagnosis")
1453
+ action.action_type = "diagnose"
1454
+ action.content = case_state.get_leading_diagnosis()
1455
+ action.reasoning = "High confidence threshold reached - proceeding to final diagnosis"
1456
+
1457
+ return action
1458
 
1459
  def _interact_with_gatekeeper(
1460
  self, action: Action, full_case_details: str
 
1514
  ground_truth_diagnosis: str,
1515
  ) -> DiagnosisResult:
1516
  """
1517
+ Executes the full sequential diagnostic process with structured state management.
1518
 
1519
  Args:
1520
  initial_case_info (str): The initial abstract of the case.
 
1525
  DiagnosisResult: An object containing the final diagnosis, evaluation, cost, and history.
1526
  """
1527
  start_time = time.time()
1528
+
1529
+ # Initialize structured case state
1530
+ case_state = CaseState(initial_vignette=initial_case_info)
1531
+ case_state.cumulative_cost = self.physician_visit_cost # Add initial visit cost
1532
+ self.cumulative_cost = case_state.cumulative_cost
1533
+
1534
+ # Store for potential use by other methods
1535
+ self.case_state = case_state
1536
+
1537
+ # Add to conversation for history tracking
1538
  self.conversation.add(
1539
  "Gatekeeper",
1540
  f"Initial Case Information: {initial_case_info}",
1541
  )
1542
+ case_state.add_evidence(f"Initial presentation: {initial_case_info}")
1543
 
 
 
1544
  logger.info(
1545
  f"Initial physician visit cost: ${self.physician_visit_cost}"
1546
  )
 
1550
 
1551
  for i in range(self.max_iterations):
1552
  iteration_count = i + 1
1553
+ case_state.iteration = iteration_count
1554
+
1555
  logger.info(
1556
  f"--- Starting Diagnostic Loop {iteration_count}/{self.max_iterations} ---"
1557
  )
1558
  logger.info(
1559
+ f"Current cost: ${case_state.cumulative_cost:,} | Remaining budget: ${self.initial_budget - case_state.cumulative_cost:,}"
1560
  )
1561
 
1562
  try:
1563
+ # Panel deliberates to decide on the next action using structured state
1564
+ action = self._run_panel_deliberation(case_state)
1565
  logger.info(
1566
  f"βš•οΈ Panel decision: {action.action_type.upper()} -> {action.content}"
1567
  )
 
1569
  f"πŸ’­ Medical reasoning: {action.reasoning}"
1570
  )
1571
 
1572
+ # Add action to case state for stagnation detection
1573
+ case_state.add_action(action)
1574
+
1575
  if action.action_type == "diagnose":
1576
  final_diagnosis = action.content
1577
  logger.info(
 
1579
  )
1580
  break
1581
 
1582
+ # Handle mode-specific constraints (most are now handled in validation)
1583
  if (
1584
  self.mode == "question_only"
1585
  and action.action_type == "test"
 
1598
  action.content
1599
  )
1600
  if (
1601
+ case_state.cumulative_cost + estimated_test_cost
1602
  > self.initial_budget
1603
  ):
1604
  logger.warning(
 
1611
  action, full_case_details
1612
  )
1613
  self.conversation.add("Gatekeeper", response)
1614
+ case_state.add_evidence(response)
1615
 
1616
+ # Update costs and state based on action type
1617
  if action.action_type == "test":
1618
  test_cost = self._estimate_cost(action.content)
1619
+ case_state.cumulative_cost += test_cost
1620
+ case_state.add_test(str(action.content))
1621
+ self.cumulative_cost = case_state.cumulative_cost # Keep backward compatibility
1622
+
1623
  logger.info(f"Tests ordered: {action.content}")
1624
  logger.info(
1625
+ f"Test cost: ${test_cost:,} | Cumulative cost: ${case_state.cumulative_cost:,}"
1626
  )
1627
  elif action.action_type == "ask":
1628
+ case_state.add_question(str(action.content))
1629
  # Questions are part of the same visit until tests are ordered
1630
  logger.info(f"Questions asked: {action.content}")
1631
  logger.info(
 
1635
  # Check budget constraints for budgeted mode
1636
  if (
1637
  self.mode == "budgeted"
1638
+ and case_state.cumulative_cost >= self.initial_budget
1639
  ):
1640
  logger.warning(
1641
  "Budget limit reached. Forcing final diagnosis."
1642
  )
1643
+ # Use current leading diagnosis
1644
+ final_diagnosis = case_state.get_leading_diagnosis()
 
 
 
 
1645
  break
1646
 
1647
  except Exception as e:
 
1653
 
1654
  else:
1655
  # Max iterations reached without diagnosis
1656
+ final_diagnosis = case_state.get_leading_diagnosis()
1657
+ if final_diagnosis == "No diagnosis formulated":
1658
+ final_diagnosis = "Diagnosis not reached within maximum iterations."
 
 
1659
  logger.warning(
1660
  f"Max iterations ({self.max_iterations}) reached. Using best available diagnosis."
1661
  )
 
1691
  ground_truth=ground_truth_diagnosis,
1692
  accuracy_score=judgement["score"],
1693
  accuracy_reasoning=judgement["reasoning"],
1694
+ total_cost=case_state.cumulative_cost,
1695
  iterations=iteration_count,
1696
  conversation_history=self.conversation.get_str(),
1697
  )
 
1700
  logger.info(f" Final diagnosis: {final_diagnosis}")
1701
  logger.info(f" Ground truth: {ground_truth_diagnosis}")
1702
  logger.info(f" Accuracy score: {judgement['score']}/5.0")
1703
+ logger.info(f" Total cost: ${case_state.cumulative_cost:,}")
1704
  logger.info(f" Iterations: {iteration_count}")
1705
 
1706
  return result
 
1982
  return results
1983
 
1984
 
1985
if __name__ == "__main__":
    # Demo driver: runs one sample case through several MAI-DxO variants
    # (no_budget, budgeted, question_only, ensemble) and prints a comparison
    # table. It only executes when the module is run as a script.

    # Shared demo settings, defined once so the variants cannot drift apart.
    demo_model = "gpt-4.1"
    demo_budget = 3000
    demo_max_iterations = 5

    # Example case inspired by the paper's Figure 1
    initial_info = (
        "A 29-year-old woman was admitted to the hospital because of sore throat and peritonsillar swelling "
        "and bleeding. Symptoms did not abate with antimicrobial therapy."
    )

    # Full case record passed to run() as `full_case_details`.
    full_case = """
    Patient: 29-year-old female.
    History: Onset of sore throat 7 weeks prior to admission. Worsening right-sided pain and swelling.
    No fevers, headaches, or gastrointestinal symptoms. Past medical history is unremarkable. No history of smoking or significant alcohol use.
    Physical Exam: Right peritonsillar mass, displacing the uvula. No other significant findings.
    Initial Labs: FBC, clotting studies normal.
    MRI Neck: Showed a large, enhancing mass in the right peritonsillar space.
    Biopsy (H&E): Infiltrative round-cell neoplasm with high nuclear-to-cytoplasmic ratio and frequent mitotic figures.
    Biopsy (Immunohistochemistry for Carcinoma): CD31, D2-40, CD34, ERG, GLUT-1, pan-cytokeratin, CD45, CD20, CD3 all negative. Ki-67: 60% nuclear positivity.
    Biopsy (Immunohistochemistry for Rhabdomyosarcoma): Desmin and MyoD1 diffusely positive. Myogenin multifocally positive.
    Biopsy (FISH): No FOXO1 (13q14) rearrangements detected.
    Final Diagnosis from Pathology: Embryonal rhabdomyosarcoma of the pharynx.
    """

    ground_truth = "Embryonal rhabdomyosarcoma of the pharynx"

    # --- Demonstrate Different MAI-DxO Variants ---
    try:
        print("\n" + "=" * 80)
        print(
            " MAI DIAGNOSTIC ORCHESTRATOR (MAI-DxO) - SEQUENTIAL DIAGNOSIS BENCHMARK"
        )
        print(
            " Implementation based on the NEJM Research Paper"
        )
        print("=" * 80)

        # Test different variants as described in the paper
        variants_to_test = [
            (
                "no_budget",
                "Standard MAI-DxO with no budget constraints",
            ),
            (
                "budgeted",
                # Derived from demo_budget so the label always matches the
                # limit that is actually enforced.
                f"Budget-constrained MAI-DxO (${demo_budget} limit)",
            ),
            (
                "question_only",
                "Question-only variant (no diagnostic tests)",
            ),
        ]

        # Maps variant name -> result object, in execution order.
        results = {}

        for variant_name, description in variants_to_test:
            print(f"\n{'='*60}")
            print(f"Testing Variant: {variant_name.upper()}")
            print(f"Description: {description}")
            print("=" * 60)

            # Create the variant; only the budgeted one takes a budget.
            variant_kwargs = {
                "model_name": demo_model,
                "max_iterations": demo_max_iterations,
            }
            if variant_name == "budgeted":
                variant_kwargs["budget"] = demo_budget
            orchestrator = MaiDxOrchestrator.create_variant(
                variant_name, **variant_kwargs
            )

            # Run the diagnostic process
            result = orchestrator.run(
                initial_case_info=initial_info,
                full_case_details=full_case,
                ground_truth_diagnosis=ground_truth,
            )

            results[variant_name] = result

            # Display results
            print(f"\n🚀 Final Diagnosis: {result.final_diagnosis}")
            print(f"🎯 Ground Truth: {result.ground_truth}")
            print(f"⭐ Accuracy Score: {result.accuracy_score}/5.0")
            print(f" Reasoning: {result.accuracy_reasoning}")
            print(f"💰 Total Cost: ${result.total_cost:,}")
            print(f"🔄 Iterations: {result.iterations}")
            print(f"⏱️ Mode: {orchestrator.mode}")

        # Demonstrate ensemble approach
        print(f"\n{'='*60}")
        print("Testing Variant: ENSEMBLE")
        print(
            "Description: Multiple independent runs with consensus aggregation"
        )
        print("=" * 60)

        ensemble_orchestrator = MaiDxOrchestrator.create_variant(
            "ensemble",
            model_name=demo_model,
            max_iterations=3,  # Shorter iterations for ensemble
        )

        ensemble_result = ensemble_orchestrator.run_ensemble(
            initial_case_info=initial_info,
            full_case_details=full_case,
            ground_truth_diagnosis=ground_truth,
            num_runs=2,  # Reduced for demo
        )

        results["ensemble"] = ensemble_result

        print(
            f"\n🚀 Ensemble Diagnosis: {ensemble_result.final_diagnosis}"
        )
        print(f"🎯 Ground Truth: {ensemble_result.ground_truth}")
        print(
            f"⭐ Ensemble Score: {ensemble_result.accuracy_score}/5.0"
        )
        print(
            f"💰 Total Ensemble Cost: ${ensemble_result.total_cost:,}"
        )

        # --- Summary Comparison ---
        print(f"\n{'='*80}")
        print(" RESULTS SUMMARY")
        print("=" * 80)
        print(
            f"{'Variant':<15} {'Diagnosis Match':<15} {'Score':<8} {'Cost':<12} {'Iterations':<12}"
        )
        print("-" * 80)

        for variant_name, result in results.items():
            # A judge score of 4.0 or higher is reported as a match.
            match_status = (
                "✓ Match"
                if result.accuracy_score >= 4.0
                else "✗ No Match"
            )
            print(
                f"{variant_name:<15} {match_status:<15} {result.accuracy_score:<8.1f} ${result.total_cost:<11,} {result.iterations:<12}"
            )

        print(f"\n{'='*80}")
        print(
            "Implementation successfully demonstrates the MAI-DxO framework"
        )
        print(
            "as described in 'Sequential Diagnosis with Language Models' paper"
        )
        print("=" * 80)

    except Exception as e:
        # Top-level boundary of the demo script: log the full traceback and
        # give the user a short hint instead of crashing with a raw stack.
        logger.exception(
            f"An error occurred during the diagnostic session: {e}"
        )
        print(f"\n❌ Error occurred: {e}")
        print("Please check your model configuration and API keys.")