Spaces:
Sleeping
Sleeping
Commit
·
ff4c876
1
Parent(s):
25d50f4
Implement code changes to enhance functionality and improve performance
Browse files- mai_dx/main.py +912 -336
mai_dx/main.py
CHANGED
@@ -18,7 +18,7 @@ Key Features:
|
|
18 |
|
19 |
Example Usage:
|
20 |
# Standard MAI-DxO usage
|
21 |
-
orchestrator = MaiDxOrchestrator(model_name="gpt-
|
22 |
result = orchestrator.run(initial_case_info, full_case_details, ground_truth)
|
23 |
|
24 |
# Budget-constrained variant
|
@@ -33,13 +33,16 @@ import os
|
|
33 |
import json
|
34 |
import sys
|
35 |
import time
|
36 |
-
from dataclasses import dataclass
|
37 |
from enum import Enum
|
38 |
from typing import Any, Dict, List, Union, Literal
|
39 |
|
40 |
from loguru import logger
|
41 |
from pydantic import BaseModel, Field
|
42 |
from swarms import Agent, Conversation
|
|
|
|
|
|
|
43 |
|
44 |
# Configure Loguru with beautiful formatting and features
|
45 |
logger.remove() # Remove default handler
|
@@ -91,6 +94,131 @@ class AgentRole(Enum):
|
|
91 |
JUDGE = "Judge"
|
92 |
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
@dataclass
|
95 |
class DiagnosisResult:
|
96 |
"""Stores the final result of a diagnostic session."""
|
@@ -127,11 +255,13 @@ class MaiDxOrchestrator:
|
|
127 |
Implements the MAI Diagnostic Orchestrator (MAI-DxO) framework.
|
128 |
This class orchestrates a virtual panel of AI agents to perform sequential medical diagnosis,
|
129 |
evaluates the final diagnosis, and tracks costs.
|
|
|
|
|
130 |
"""
|
131 |
|
132 |
def __init__(
|
133 |
self,
|
134 |
-
model_name: str = "gpt-4.1",
|
135 |
max_iterations: int = 10,
|
136 |
initial_budget: int = 10000,
|
137 |
mode: str = "no_budget", # "instant", "question_only", "budgeted", "no_budget", "ensemble"
|
@@ -139,7 +269,7 @@ class MaiDxOrchestrator:
|
|
139 |
enable_budget_tracking: bool = False,
|
140 |
):
|
141 |
"""
|
142 |
-
Initializes the MAI-DxO system.
|
143 |
|
144 |
Args:
|
145 |
model_name (str): The language model to be used by all agents.
|
@@ -161,6 +291,9 @@ class MaiDxOrchestrator:
|
|
161 |
self.conversation = Conversation(
|
162 |
time_enabled=True, autosave=False, save_enabled=False
|
163 |
)
|
|
|
|
|
|
|
164 |
|
165 |
# Enhanced cost model based on the paper's methodology
|
166 |
self.test_cost_db = {
|
@@ -198,6 +331,20 @@ class MaiDxOrchestrator:
|
|
198 |
f"π₯ MAI Diagnostic Orchestrator initialized successfully in '{mode}' mode with budget ${initial_budget:,}"
|
199 |
)
|
200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
def _init_agents(self) -> None:
|
202 |
"""Initializes all required agents with their specific roles and prompts."""
|
203 |
self.agents = {
|
@@ -210,6 +357,7 @@ class MaiDxOrchestrator:
|
|
210 |
"json" if role == AgentRole.CONSENSUS else "str"
|
211 |
),
|
212 |
print_on=True, # Enable printing for all agents to see outputs
|
|
|
213 |
)
|
214 |
for role in AgentRole
|
215 |
}
|
@@ -217,8 +365,354 @@ class MaiDxOrchestrator:
|
|
217 |
f"π₯ {len(self.agents)} virtual physician agents initialized and ready for consultation"
|
218 |
)
|
219 |
|
220 |
-
def
|
221 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
prompts = {
|
223 |
AgentRole.HYPOTHESIS: (
|
224 |
"""
|
@@ -525,8 +1019,8 @@ class MaiDxOrchestrator:
|
|
525 |
}
|
526 |
return prompts[role]
|
527 |
|
528 |
-
def _parse_json_response(self, response: str) -> Dict[str, Any]:
|
529 |
-
"""Safely parses a JSON string
|
530 |
try:
|
531 |
# Extract the actual response content from the agent response
|
532 |
if isinstance(response, str):
|
@@ -600,14 +1094,59 @@ class MaiDxOrchestrator:
|
|
600 |
logger.debug(
|
601 |
f"Response content: {response[:500]}..."
|
602 |
) # Log first 500 chars
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
611 |
|
612 |
def _estimate_cost(self, tests: Union[List[str], str]) -> int:
|
613 |
"""Estimates the cost of diagnostic tests."""
|
@@ -696,191 +1235,124 @@ class MaiDxOrchestrator:
|
|
696 |
|
697 |
return cost
|
698 |
|
699 |
-
def _run_panel_deliberation(self) -> Action:
|
700 |
-
"""Orchestrates one round of debate among the virtual panel
|
701 |
logger.info(
|
702 |
"🩺 Virtual medical panel deliberation commenced - analyzing patient case"
|
703 |
)
|
704 |
logger.debug(
|
705 |
"Panel members: Dr. Hypothesis, Dr. Test-Chooser, Dr. Challenger, Dr. Stewardship, Dr. Checklist"
|
706 |
)
|
707 |
-
panel_conversation = Conversation(
|
708 |
-
time_enabled=True, autosave=False, save_enabled=False
|
709 |
-
)
|
710 |
|
711 |
-
#
|
712 |
-
|
|
|
|
|
|
|
713 |
budget_status = (
|
714 |
"EXCEEDED"
|
715 |
if remaining_budget < 0
|
716 |
else f"${remaining_budget:,}"
|
717 |
)
|
718 |
|
719 |
-
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
|
727 |
-
|
728 |
-
|
729 |
-
|
730 |
-
|
731 |
-
|
732 |
-
|
733 |
-
|
734 |
-
|
735 |
"""
|
736 |
-
panel_conversation.add("System", panel_context)
|
737 |
|
738 |
# Check mode-specific constraints
|
739 |
if self.mode == "instant":
|
740 |
# For instant mode, skip deliberation and go straight to diagnosis
|
741 |
action_dict = {
|
742 |
"action_type": "diagnose",
|
743 |
-
"content": (
|
744 |
-
self.differential_diagnosis.split("\n")[0]
|
745 |
-
if "\n" in self.differential_diagnosis
|
746 |
-
else self.differential_diagnosis
|
747 |
-
),
|
748 |
"reasoning": (
|
749 |
"Instant diagnosis mode - providing immediate assessment based on initial presentation"
|
750 |
),
|
751 |
}
|
752 |
return Action(**action_dict)
|
753 |
|
754 |
-
|
755 |
-
|
756 |
-
|
757 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
758 |
|
759 |
-
#
|
|
|
760 |
try:
|
761 |
# Dr. Hypothesis - Differential diagnosis and probability assessment
|
762 |
-
logger.info(
|
763 |
-
|
764 |
-
)
|
765 |
-
|
766 |
-
|
767 |
-
)
|
768 |
-
self.differential_diagnosis = (
|
769 |
-
hypothesis # Update main state
|
770 |
-
)
|
771 |
-
panel_conversation.add(
|
772 |
-
self.agents[AgentRole.HYPOTHESIS].agent_name,
|
773 |
-
hypothesis,
|
774 |
-
)
|
775 |
|
776 |
# Dr. Test-Chooser - Information value optimization
|
777 |
-
logger.info(
|
778 |
-
|
779 |
-
|
780 |
-
|
781 |
-
|
782 |
-
)
|
783 |
-
panel_conversation.add(
|
784 |
-
self.agents[AgentRole.TEST_CHOOSER].agent_name,
|
785 |
-
test_choices,
|
786 |
-
)
|
787 |
|
788 |
# Dr. Challenger - Bias identification and alternative hypotheses
|
789 |
-
logger.info(
|
790 |
-
|
791 |
-
)
|
792 |
-
challenges = self.agents[AgentRole.CHALLENGER].run(
|
793 |
-
panel_conversation.get_str()
|
794 |
-
)
|
795 |
-
panel_conversation.add(
|
796 |
-
self.agents[AgentRole.CHALLENGER].agent_name,
|
797 |
-
challenges,
|
798 |
-
)
|
799 |
|
800 |
# Dr. Stewardship - Cost-effectiveness analysis
|
801 |
-
logger.info(
|
802 |
-
|
803 |
-
)
|
804 |
-
stewardship_context = panel_conversation.get_str()
|
805 |
if self.enable_budget_tracking:
|
806 |
-
|
807 |
-
|
808 |
-
stewardship_context
|
809 |
-
)
|
810 |
-
panel_conversation.add(
|
811 |
-
self.agents[AgentRole.STEWARDSHIP].agent_name,
|
812 |
-
stewardship_rec,
|
813 |
-
)
|
814 |
|
815 |
# Dr. Checklist - Quality assurance
|
816 |
-
logger.info(
|
817 |
-
|
818 |
-
)
|
819 |
-
checklist_rep = self.agents[AgentRole.CHECKLIST].run(
|
820 |
-
panel_conversation.get_str()
|
821 |
-
)
|
822 |
-
panel_conversation.add(
|
823 |
-
self.agents[AgentRole.CHECKLIST].agent_name,
|
824 |
-
checklist_rep,
|
825 |
-
)
|
826 |
-
|
827 |
-
# Consensus Coordinator - Final decision synthesis
|
828 |
-
logger.info(
|
829 |
-
"π€ Consensus Coordinator synthesizing panel decision..."
|
830 |
-
)
|
831 |
-
consensus_context = panel_conversation.get_str()
|
832 |
|
|
|
|
|
|
|
|
|
|
|
|
|
833 |
# Add mode-specific constraints to consensus
|
834 |
if self.mode == "budgeted" and remaining_budget <= 0:
|
835 |
-
|
836 |
|
837 |
-
|
838 |
-
|
839 |
-
|
840 |
-
|
841 |
-
f"Raw consensus response: {consensus_response}"
|
842 |
)
|
843 |
|
844 |
-
# Extract the actual text content from agent response
|
845 |
-
if hasattr(consensus_response, "content"):
|
846 |
-
response_text = consensus_response.content
|
847 |
-
elif isinstance(consensus_response, str):
|
848 |
-
response_text = consensus_response
|
849 |
-
else:
|
850 |
-
response_text = str(consensus_response)
|
851 |
-
|
852 |
-
action_dict = self._parse_json_response(response_text)
|
853 |
-
|
854 |
# Validate action based on mode constraints
|
855 |
action = Action(**action_dict)
|
856 |
-
|
857 |
-
|
858 |
-
|
859 |
-
):
|
860 |
-
logger.warning(
|
861 |
-
"Test ordering attempted in question-only mode, converting to ask action"
|
862 |
-
)
|
863 |
-
action.action_type = "ask"
|
864 |
-
action.content = "Can you provide more details about the patient's symptoms and history?"
|
865 |
-
action.reasoning = (
|
866 |
-
"Mode constraint: question-only mode active"
|
867 |
-
)
|
868 |
-
|
869 |
-
if (
|
870 |
-
self.mode == "budgeted"
|
871 |
-
and action.action_type == "test"
|
872 |
-
and remaining_budget <= 0
|
873 |
-
):
|
874 |
-
logger.warning(
|
875 |
-
"Test ordering attempted with insufficient budget, converting to diagnose action"
|
876 |
-
)
|
877 |
-
action.action_type = "diagnose"
|
878 |
-
action.content = (
|
879 |
-
self.differential_diagnosis.split("\n")[0]
|
880 |
-
if "\n" in self.differential_diagnosis
|
881 |
-
else self.differential_diagnosis
|
882 |
-
)
|
883 |
-
action.reasoning = "Budget constraint: insufficient funds for additional testing"
|
884 |
|
885 |
return action
|
886 |
|
@@ -892,6 +1364,97 @@ class MaiDxOrchestrator:
|
|
892 |
content="Could you please provide more information about the patient's current condition?",
|
893 |
reasoning=f"Fallback due to panel deliberation error: {str(e)}",
|
894 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
895 |
|
896 |
def _interact_with_gatekeeper(
|
897 |
self, action: Action, full_case_details: str
|
@@ -951,7 +1514,7 @@ class MaiDxOrchestrator:
|
|
951 |
ground_truth_diagnosis: str,
|
952 |
) -> DiagnosisResult:
|
953 |
"""
|
954 |
-
Executes the full sequential diagnostic process.
|
955 |
|
956 |
Args:
|
957 |
initial_case_info (str): The initial abstract of the case.
|
@@ -962,13 +1525,22 @@ class MaiDxOrchestrator:
|
|
962 |
DiagnosisResult: An object containing the final diagnosis, evaluation, cost, and history.
|
963 |
"""
|
964 |
start_time = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
965 |
self.conversation.add(
|
966 |
"Gatekeeper",
|
967 |
f"Initial Case Information: {initial_case_info}",
|
968 |
)
|
|
|
969 |
|
970 |
-
# Add initial physician visit cost
|
971 |
-
self.cumulative_cost += self.physician_visit_cost
|
972 |
logger.info(
|
973 |
f"Initial physician visit cost: ${self.physician_visit_cost}"
|
974 |
)
|
@@ -978,16 +1550,18 @@ class MaiDxOrchestrator:
|
|
978 |
|
979 |
for i in range(self.max_iterations):
|
980 |
iteration_count = i + 1
|
|
|
|
|
981 |
logger.info(
|
982 |
f"--- Starting Diagnostic Loop {iteration_count}/{self.max_iterations} ---"
|
983 |
)
|
984 |
logger.info(
|
985 |
-
f"Current cost: ${
|
986 |
)
|
987 |
|
988 |
try:
|
989 |
-
# Panel deliberates to decide on the next action
|
990 |
-
action = self._run_panel_deliberation()
|
991 |
logger.info(
|
992 |
f"βοΈ Panel decision: {action.action_type.upper()} -> {action.content}"
|
993 |
)
|
@@ -995,6 +1569,9 @@ class MaiDxOrchestrator:
|
|
995 |
f"π Medical reasoning: {action.reasoning}"
|
996 |
)
|
997 |
|
|
|
|
|
|
|
998 |
if action.action_type == "diagnose":
|
999 |
final_diagnosis = action.content
|
1000 |
logger.info(
|
@@ -1002,7 +1579,7 @@ class MaiDxOrchestrator:
|
|
1002 |
)
|
1003 |
break
|
1004 |
|
1005 |
-
# Handle mode-specific constraints
|
1006 |
if (
|
1007 |
self.mode == "question_only"
|
1008 |
and action.action_type == "test"
|
@@ -1021,7 +1598,7 @@ class MaiDxOrchestrator:
|
|
1021 |
action.content
|
1022 |
)
|
1023 |
if (
|
1024 |
-
|
1025 |
> self.initial_budget
|
1026 |
):
|
1027 |
logger.warning(
|
@@ -1034,16 +1611,21 @@ class MaiDxOrchestrator:
|
|
1034 |
action, full_case_details
|
1035 |
)
|
1036 |
self.conversation.add("Gatekeeper", response)
|
|
|
1037 |
|
1038 |
-
# Update costs based on action type
|
1039 |
if action.action_type == "test":
|
1040 |
test_cost = self._estimate_cost(action.content)
|
1041 |
-
|
|
|
|
|
|
|
1042 |
logger.info(f"Tests ordered: {action.content}")
|
1043 |
logger.info(
|
1044 |
-
f"Test cost: ${test_cost:,} | Cumulative cost: ${
|
1045 |
)
|
1046 |
elif action.action_type == "ask":
|
|
|
1047 |
# Questions are part of the same visit until tests are ordered
|
1048 |
logger.info(f"Questions asked: {action.content}")
|
1049 |
logger.info(
|
@@ -1053,17 +1635,13 @@ class MaiDxOrchestrator:
|
|
1053 |
# Check budget constraints for budgeted mode
|
1054 |
if (
|
1055 |
self.mode == "budgeted"
|
1056 |
-
and
|
1057 |
):
|
1058 |
logger.warning(
|
1059 |
"Budget limit reached. Forcing final diagnosis."
|
1060 |
)
|
1061 |
-
# Use current
|
1062 |
-
final_diagnosis = (
|
1063 |
-
self.differential_diagnosis.split("\n")[0]
|
1064 |
-
if "\n" in self.differential_diagnosis
|
1065 |
-
else "Diagnosis not reached within budget constraints."
|
1066 |
-
)
|
1067 |
break
|
1068 |
|
1069 |
except Exception as e:
|
@@ -1075,11 +1653,9 @@ class MaiDxOrchestrator:
|
|
1075 |
|
1076 |
else:
|
1077 |
# Max iterations reached without diagnosis
|
1078 |
-
final_diagnosis = (
|
1079 |
-
|
1080 |
-
|
1081 |
-
else "Diagnosis not reached within maximum iterations."
|
1082 |
-
)
|
1083 |
logger.warning(
|
1084 |
f"Max iterations ({self.max_iterations}) reached. Using best available diagnosis."
|
1085 |
)
|
@@ -1115,7 +1691,7 @@ class MaiDxOrchestrator:
|
|
1115 |
ground_truth=ground_truth_diagnosis,
|
1116 |
accuracy_score=judgement["score"],
|
1117 |
accuracy_reasoning=judgement["reasoning"],
|
1118 |
-
total_cost=
|
1119 |
iterations=iteration_count,
|
1120 |
conversation_history=self.conversation.get_str(),
|
1121 |
)
|
@@ -1124,7 +1700,7 @@ class MaiDxOrchestrator:
|
|
1124 |
logger.info(f" Final diagnosis: {final_diagnosis}")
|
1125 |
logger.info(f" Ground truth: {ground_truth_diagnosis}")
|
1126 |
logger.info(f" Accuracy score: {judgement['score']}/5.0")
|
1127 |
-
logger.info(f" Total cost: ${
|
1128 |
logger.info(f" Iterations: {iteration_count}")
|
1129 |
|
1130 |
return result
|
@@ -1406,159 +1982,159 @@ def run_mai_dxo_demo(
|
|
1406 |
return results
|
1407 |
|
1408 |
|
1409 |
-
|
1410 |
-
#
|
1411 |
-
|
1412 |
-
|
1413 |
-
|
1414 |
-
|
1415 |
-
|
1416 |
-
|
1417 |
-
|
1418 |
-
|
1419 |
-
|
1420 |
-
|
1421 |
-
|
1422 |
-
|
1423 |
-
|
1424 |
-
|
1425 |
-
|
1426 |
-
|
1427 |
-
|
1428 |
-
|
1429 |
-
|
1430 |
-
|
1431 |
-
|
1432 |
-
#
|
1433 |
-
|
1434 |
-
|
1435 |
-
|
1436 |
-
|
1437 |
-
|
1438 |
-
|
1439 |
-
|
1440 |
-
|
1441 |
-
|
1442 |
-
|
1443 |
-
#
|
1444 |
-
|
1445 |
-
|
1446 |
-
|
1447 |
-
|
1448 |
-
|
1449 |
-
|
1450 |
-
|
1451 |
-
|
1452 |
-
|
1453 |
-
|
1454 |
-
|
1455 |
-
|
1456 |
-
|
1457 |
-
|
1458 |
-
|
1459 |
-
|
1460 |
-
|
1461 |
-
|
1462 |
-
|
1463 |
-
|
1464 |
-
#
|
1465 |
-
|
1466 |
-
|
1467 |
-
|
1468 |
-
|
1469 |
-
|
1470 |
-
|
1471 |
-
|
1472 |
-
|
1473 |
-
|
1474 |
-
|
1475 |
-
|
1476 |
-
|
1477 |
-
|
1478 |
-
|
1479 |
-
#
|
1480 |
-
|
1481 |
-
|
1482 |
-
|
1483 |
-
|
1484 |
-
|
1485 |
-
|
1486 |
-
|
1487 |
-
|
1488 |
-
#
|
1489 |
-
|
1490 |
-
|
1491 |
-
|
1492 |
-
|
1493 |
-
|
1494 |
-
|
1495 |
-
|
1496 |
-
|
1497 |
-
#
|
1498 |
-
|
1499 |
-
|
1500 |
-
|
1501 |
-
|
1502 |
-
|
1503 |
-
|
1504 |
-
|
1505 |
-
|
1506 |
-
|
1507 |
-
|
1508 |
-
|
1509 |
-
|
1510 |
-
|
1511 |
-
|
1512 |
-
|
1513 |
-
|
1514 |
-
|
1515 |
-
|
1516 |
-
|
1517 |
-
|
1518 |
-
|
1519 |
-
|
1520 |
-
|
1521 |
-
|
1522 |
-
|
1523 |
-
|
1524 |
-
|
1525 |
-
|
1526 |
-
|
1527 |
-
|
1528 |
-
|
1529 |
-
|
1530 |
-
|
1531 |
-
#
|
1532 |
-
|
1533 |
-
|
1534 |
-
|
1535 |
-
|
1536 |
-
|
1537 |
-
|
1538 |
-
|
1539 |
-
|
1540 |
-
|
1541 |
-
|
1542 |
-
|
1543 |
-
|
1544 |
-
|
1545 |
-
|
1546 |
-
|
1547 |
-
|
1548 |
-
|
1549 |
-
|
1550 |
-
|
1551 |
-
|
1552 |
-
|
1553 |
-
|
1554 |
-
|
1555 |
-
|
1556 |
-
|
1557 |
-
|
1558 |
-
|
1559 |
-
|
1560 |
-
|
1561 |
-
|
1562 |
-
|
1563 |
-
|
1564 |
-
|
|
|
18 |
|
19 |
Example Usage:
|
20 |
# Standard MAI-DxO usage
|
21 |
+
orchestrator = MaiDxOrchestrator(model_name="gpt-4o")
|
22 |
result = orchestrator.run(initial_case_info, full_case_details, ground_truth)
|
23 |
|
24 |
# Budget-constrained variant
|
|
|
33 |
import json
|
34 |
import sys
|
35 |
import time
|
36 |
+
from dataclasses import dataclass, field
|
37 |
from enum import Enum
|
38 |
from typing import Any, Dict, List, Union, Literal
|
39 |
|
40 |
from loguru import logger
|
41 |
from pydantic import BaseModel, Field
|
42 |
from swarms import Agent, Conversation
|
43 |
+
from dotenv import load_dotenv
|
44 |
+
|
45 |
+
load_dotenv()
|
46 |
|
47 |
# Configure Loguru with beautiful formatting and features
|
48 |
logger.remove() # Remove default handler
|
|
|
94 |
JUDGE = "Judge"
|
95 |
|
96 |
|
97 |
+
@dataclass
|
98 |
+
class CaseState:
|
99 |
+
"""Structured state management for diagnostic process - addresses Category 2.1"""
|
100 |
+
initial_vignette: str
|
101 |
+
evidence_log: List[str] = field(default_factory=list)
|
102 |
+
differential_diagnosis: Dict[str, float] = field(default_factory=dict)
|
103 |
+
tests_performed: List[str] = field(default_factory=list)
|
104 |
+
questions_asked: List[str] = field(default_factory=list)
|
105 |
+
cumulative_cost: int = 0
|
106 |
+
iteration: int = 0
|
107 |
+
last_actions: List['Action'] = field(default_factory=list) # For stagnation detection
|
108 |
+
|
109 |
+
def add_evidence(self, evidence: str):
|
110 |
+
"""Add new evidence to the case"""
|
111 |
+
self.evidence_log.append(f"[Turn {self.iteration}] {evidence}")
|
112 |
+
|
113 |
+
def update_differential(self, diagnosis_dict: Dict[str, float]):
|
114 |
+
"""Update differential diagnosis probabilities"""
|
115 |
+
self.differential_diagnosis.update(diagnosis_dict)
|
116 |
+
|
117 |
+
def add_test(self, test_name: str):
|
118 |
+
"""Record a test that was performed"""
|
119 |
+
self.tests_performed.append(test_name)
|
120 |
+
|
121 |
+
def add_question(self, question: str):
|
122 |
+
"""Record a question that was asked"""
|
123 |
+
self.questions_asked.append(question)
|
124 |
+
|
125 |
+
def is_stagnating(self, new_action: 'Action') -> bool:
|
126 |
+
"""Detect if the system is stuck in a loop - addresses Category 1.2"""
|
127 |
+
if len(self.last_actions) < 2:
|
128 |
+
return False
|
129 |
+
|
130 |
+
# Check if the new action is identical to recent ones
|
131 |
+
for last_action in self.last_actions[-2:]:
|
132 |
+
if (last_action.action_type == new_action.action_type and
|
133 |
+
last_action.content == new_action.content):
|
134 |
+
return True
|
135 |
+
return False
|
136 |
+
|
137 |
+
def add_action(self, action: 'Action'):
|
138 |
+
"""Add action to history and maintain sliding window"""
|
139 |
+
self.last_actions.append(action)
|
140 |
+
if len(self.last_actions) > 3: # Keep only last 3 actions
|
141 |
+
self.last_actions.pop(0)
|
142 |
+
|
143 |
+
def get_max_confidence(self) -> float:
|
144 |
+
"""Get the maximum confidence from differential diagnosis"""
|
145 |
+
if not self.differential_diagnosis:
|
146 |
+
return 0.0
|
147 |
+
return max(self.differential_diagnosis.values())
|
148 |
+
|
149 |
+
def get_leading_diagnosis(self) -> str:
|
150 |
+
"""Get the diagnosis with highest confidence"""
|
151 |
+
if not self.differential_diagnosis:
|
152 |
+
return "No diagnosis formulated"
|
153 |
+
return max(self.differential_diagnosis.items(), key=lambda x: x[1])[0]
|
154 |
+
|
155 |
+
def summarize_evidence(self) -> str:
|
156 |
+
"""Create a concise summary of evidence for token efficiency"""
|
157 |
+
if len(self.evidence_log) <= 5:
|
158 |
+
return "\n".join(self.evidence_log)
|
159 |
+
|
160 |
+
# Keep first 2 and last 3 entries, summarize middle
|
161 |
+
summary_parts = []
|
162 |
+
summary_parts.extend(self.evidence_log[:2])
|
163 |
+
|
164 |
+
if len(self.evidence_log) > 5:
|
165 |
+
middle_count = len(self.evidence_log) - 5
|
166 |
+
summary_parts.append(f"[... {middle_count} additional findings ...]")
|
167 |
+
|
168 |
+
summary_parts.extend(self.evidence_log[-3:])
|
169 |
+
return "\n".join(summary_parts)
|
170 |
+
|
171 |
+
|
172 |
+
@dataclass
|
173 |
+
class DeliberationState:
|
174 |
+
"""Structured state for panel deliberation - addresses Category 1.1"""
|
175 |
+
hypothesis_analysis: str = ""
|
176 |
+
test_chooser_analysis: str = ""
|
177 |
+
challenger_analysis: str = ""
|
178 |
+
stewardship_analysis: str = ""
|
179 |
+
checklist_analysis: str = ""
|
180 |
+
situational_context: str = ""
|
181 |
+
stagnation_detected: bool = False
|
182 |
+
retry_count: int = 0
|
183 |
+
|
184 |
+
def to_consensus_prompt(self) -> str:
|
185 |
+
"""Generate a structured prompt for the consensus coordinator"""
|
186 |
+
prompt = f"""
|
187 |
+
You are the Consensus Coordinator. Here is the summary of the panel's deliberation for this turn:
|
188 |
+
|
189 |
+
**Current Differential Diagnosis (from Dr. Hypothesis):**
|
190 |
+
{self.hypothesis_analysis}
|
191 |
+
|
192 |
+
**Recommended Tests (from Dr. Test-Chooser):**
|
193 |
+
{self.test_chooser_analysis}
|
194 |
+
|
195 |
+
**Identified Biases & Challenges (from Dr. Challenger):**
|
196 |
+
{self.challenger_analysis}
|
197 |
+
|
198 |
+
**Cost & Stewardship Concerns (from Dr. Stewardship):**
|
199 |
+
{self.stewardship_analysis}
|
200 |
+
|
201 |
+
**Quality Control Assessment (from Dr. Checklist):**
|
202 |
+
{self.checklist_analysis}
|
203 |
+
"""
|
204 |
+
|
205 |
+
if self.stagnation_detected:
|
206 |
+
prompt += f"""
|
207 |
+
**CRITICAL INTERVENTION: STAGNATION DETECTED**
|
208 |
+
The panel is stalled. You MUST propose a different and more decisive action.
|
209 |
+
If you cannot find a new test or question, you must move to a final diagnosis.
|
210 |
+
"""
|
211 |
+
|
212 |
+
if self.situational_context:
|
213 |
+
prompt += f"""
|
214 |
+
**SITUATIONAL CONTEXT:**
|
215 |
+
{self.situational_context}
|
216 |
+
"""
|
217 |
+
|
218 |
+
prompt += "\nBased on this synthesized input, provide your single best action in the required JSON format."
|
219 |
+
return prompt
|
220 |
+
|
221 |
+
|
222 |
@dataclass
|
223 |
class DiagnosisResult:
|
224 |
"""Stores the final result of a diagnostic session."""
|
|
|
255 |
Implements the MAI Diagnostic Orchestrator (MAI-DxO) framework.
|
256 |
This class orchestrates a virtual panel of AI agents to perform sequential medical diagnosis,
|
257 |
evaluates the final diagnosis, and tracks costs.
|
258 |
+
|
259 |
+
Enhanced with structured deliberation and proper state management as per research paper.
|
260 |
"""
|
261 |
|
262 |
def __init__(
|
263 |
self,
|
264 |
+
model_name: str = "gpt-4.1", # Updated to GPT-4.1 as requested (GPT-4 Turbo)
|
265 |
max_iterations: int = 10,
|
266 |
initial_budget: int = 10000,
|
267 |
mode: str = "no_budget", # "instant", "question_only", "budgeted", "no_budget", "ensemble"
|
|
|
269 |
enable_budget_tracking: bool = False,
|
270 |
):
|
271 |
"""
|
272 |
+
Initializes the MAI-DxO system with improved architecture.
|
273 |
|
274 |
Args:
|
275 |
model_name (str): The language model to be used by all agents.
|
|
|
291 |
self.conversation = Conversation(
|
292 |
time_enabled=True, autosave=False, save_enabled=False
|
293 |
)
|
294 |
+
|
295 |
+
# Initialize case state for structured state management
|
296 |
+
self.case_state = None
|
297 |
|
298 |
# Enhanced cost model based on the paper's methodology
|
299 |
self.test_cost_db = {
|
|
|
331 |
f"π₯ MAI Diagnostic Orchestrator initialized successfully in '{mode}' mode with budget ${initial_budget:,}"
|
332 |
)
|
333 |
|
334 |
+
def _get_agent_max_tokens(self, role: AgentRole) -> int:
|
335 |
+
"""Get max_tokens for each agent based on their role - addresses token optimization"""
|
336 |
+
token_limits = {
|
337 |
+
AgentRole.HYPOTHESIS: 800, # Needs space for differential diagnosis
|
338 |
+
AgentRole.TEST_CHOOSER: 600, # Test recommendations
|
339 |
+
AgentRole.CHALLENGER: 700, # Bias identification and alternatives
|
340 |
+
AgentRole.STEWARDSHIP: 500, # Cost analysis
|
341 |
+
AgentRole.CHECKLIST: 400, # Brief validation
|
342 |
+
AgentRole.CONSENSUS: 300, # Just JSON output
|
343 |
+
AgentRole.GATEKEEPER: 1000, # Detailed clinical findings
|
344 |
+
AgentRole.JUDGE: 600, # Scoring and reasoning
|
345 |
+
}
|
346 |
+
return token_limits.get(role, 500)
|
347 |
+
|
348 |
def _init_agents(self) -> None:
|
349 |
"""Initializes all required agents with their specific roles and prompts."""
|
350 |
self.agents = {
|
|
|
357 |
"json" if role == AgentRole.CONSENSUS else "str"
|
358 |
),
|
359 |
print_on=True, # Enable printing for all agents to see outputs
|
360 |
+
max_tokens=self._get_agent_max_tokens(role), # Role-specific token limits
|
361 |
)
|
362 |
for role in AgentRole
|
363 |
}
|
|
|
365 |
f"π₯ {len(self.agents)} virtual physician agents initialized and ready for consultation"
|
366 |
)
|
367 |
|
368 |
+
def _get_dynamic_context(self, role: AgentRole, case_state: CaseState) -> str:
|
369 |
+
"""Generate dynamic context for agents based on current situation - addresses Category 4.2"""
|
370 |
+
remaining_budget = self.initial_budget - case_state.cumulative_cost
|
371 |
+
|
372 |
+
# Calculate confidence from differential diagnosis
|
373 |
+
max_confidence = max(case_state.differential_diagnosis.values()) if case_state.differential_diagnosis else 0
|
374 |
+
|
375 |
+
context = ""
|
376 |
+
|
377 |
+
if role == AgentRole.STEWARDSHIP and remaining_budget < 1000:
|
378 |
+
context = f"""
|
379 |
+
**SITUATIONAL CONTEXT: URGENT**
|
380 |
+
The remaining budget is critically low (${remaining_budget}). All recommendations must be focused on maximum cost-effectiveness. Veto any non-essential or high-cost tests.
|
381 |
+
"""
|
382 |
+
|
383 |
+
elif role == AgentRole.HYPOTHESIS and max_confidence > 0.75:
|
384 |
+
context = f"""
|
385 |
+
**SITUATIONAL CONTEXT: FINAL STAGES**
|
386 |
+
The panel is converging on a diagnosis (current max confidence: {max_confidence:.0%}). Your primary role now is to confirm the leading hypothesis or state what single piece of evidence is needed to reach >85% confidence.
|
387 |
+
"""
|
388 |
+
|
389 |
+
elif role == AgentRole.CONSENSUS and case_state.iteration > 5:
|
390 |
+
context = f"""
|
391 |
+
**SITUATIONAL CONTEXT: EXTENDED CASE**
|
392 |
+
This case has gone through {case_state.iteration} iterations. Focus on decisive actions that will lead to a definitive diagnosis rather than additional exploratory steps.
|
393 |
+
"""
|
394 |
+
|
395 |
+
return context
|
396 |
+
|
397 |
+
def _get_prompt_for_role(self, role: AgentRole, case_state: CaseState = None) -> str:
    """Return the system prompt for an agent role, with dynamic context.

    When ``case_state`` is given, ``self._get_dynamic_context`` is asked for
    a situational preamble, which is placed at the top of the prompts of the
    roles that can receive one (HYPOTHESIS, STEWARDSHIP and CONSENSUS).

    Args:
        role: Agent role whose prompt is requested.
        case_state: Optional current case state. When omitted, no
            situational preamble is added.

    Returns:
        The prompt text for ``role``. Roles missing from the table below
        fall back to ``self._get_original_prompt_for_role`` prefixed with
        the dynamic context.
    """
    # Add dynamic context if case_state is provided. Compare against None
    # explicitly so a state object that happens to be falsy is still used.
    dynamic_context = ""
    if case_state is not None:
        dynamic_context = self._get_dynamic_context(role, case_state)

    base_prompts = {
        AgentRole.HYPOTHESIS: f"""
        {dynamic_context}

        You are Dr. Hypothesis, a specialist in maintaining differential diagnoses. Your role is critical to the diagnostic process.

        CORE RESPONSIBILITIES:
        - Maintain a probability-ranked differential diagnosis with the top 3 most likely conditions
        - Update probabilities using Bayesian reasoning after each new finding
        - Consider both common and rare diseases appropriate to the clinical context
        - Explicitly track how new evidence changes your diagnostic thinking

        APPROACH:
        1. Start with the most likely diagnoses based on presenting symptoms
        2. For each new piece of evidence, consider:
           - How it supports or refutes each hypothesis
           - Whether it suggests new diagnoses to consider
           - How it changes the relative probabilities
        3. Always explain your Bayesian reasoning clearly

        OUTPUT FORMAT:
        Provide your updated differential diagnosis with:
        - Top 3 diagnoses with probability estimates (percentages)
        - Brief rationale for each
        - Key evidence supporting each hypothesis
        - Evidence that contradicts or challenges each hypothesis

        Remember: Your differential drives the entire diagnostic process. Be thorough, evidence-based, and adaptive.
        """,

        AgentRole.TEST_CHOOSER: (
            """
        You are Dr. Test-Chooser, a specialist in diagnostic test selection and information theory.

        CORE RESPONSIBILITIES:
        - Select up to 3 diagnostic tests per round that maximally discriminate between leading hypotheses
        - Optimize for information value, not just clinical reasonableness
        - Consider test characteristics: sensitivity, specificity, positive/negative predictive values
        - Balance diagnostic yield with patient burden and resource utilization

        SELECTION CRITERIA:
        1. Information Value: How much will this test change diagnostic probabilities?
        2. Discriminatory Power: How well does it distinguish between competing hypotheses?
        3. Clinical Impact: Will the result meaningfully alter management?
        4. Sequential Logic: What should we establish first before ordering more complex tests?

        APPROACH:
        - For each proposed test, explicitly state which hypotheses it will help confirm or exclude
        - Consider both positive and negative results and their implications
        - Think about test sequences (e.g., basic labs before advanced imaging)
        - Avoid redundant tests that won't add new information

        OUTPUT FORMAT:
        For each recommended test:
        - Test name (be specific)
        - Primary hypotheses it will help evaluate
        - Expected information gain
        - How results will change management decisions

        Focus on tests that will most efficiently narrow the differential diagnosis.
        """
        ),
        AgentRole.CHALLENGER: (
            """
        You are Dr. Challenger, the critical thinking specialist and devil's advocate.

        CORE RESPONSIBILITIES:
        - Identify and challenge cognitive biases in the diagnostic process
        - Highlight contradictory evidence that might be overlooked
        - Propose alternative hypotheses and falsifying tests
        - Guard against premature diagnostic closure

        COGNITIVE BIASES TO WATCH FOR:
        1. Anchoring: Over-reliance on initial impressions
        2. Confirmation bias: Seeking only supporting evidence
        3. Availability bias: Overestimating probability of recently seen conditions
        4. Representativeness: Ignoring base rates and prevalence
        5. Search satisficing: Stopping at "good enough" explanations

        YOUR APPROACH:
        - Ask "What else could this be?" and "What doesn't fit?"
        - Challenge assumptions and look for alternative explanations
        - Propose tests that could disprove the leading hypothesis
        - Consider rare diseases when common ones don't fully explain the picture
        - Advocate for considering multiple conditions simultaneously

        OUTPUT FORMAT:
        - Specific biases you've identified in the current reasoning
        - Evidence that contradicts the leading hypotheses
        - Alternative diagnoses to consider
        - Tests that could falsify current assumptions
        - Red flags or concerning patterns that need attention

        Be constructively critical - your role is to strengthen diagnostic accuracy through rigorous challenge.
        """
        ),
        # f-string so the low-budget preamble produced by
        # _get_dynamic_context for this role actually reaches the agent.
        AgentRole.STEWARDSHIP: f"""
        {dynamic_context}

        You are Dr. Stewardship, the resource optimization and cost-effectiveness specialist.

        CORE RESPONSIBILITIES:
        - Enforce cost-conscious, high-value care
        - Advocate for cheaper alternatives when diagnostically equivalent
        - Challenge low-yield, expensive tests
        - Balance diagnostic thoroughness with resource stewardship

        COST-VALUE FRAMEWORK:
        1. High-Value Tests: Low cost, high diagnostic yield, changes management
        2. Moderate-Value Tests: Moderate cost, specific indication, incremental value
        3. Low-Value Tests: High cost, low yield, minimal impact on decisions
        4. No-Value Tests: Any cost, no diagnostic value, ordered out of habit

        ALTERNATIVE STRATEGIES:
        - Could patient history/physical exam provide this information?
        - Is there a less expensive test with similar diagnostic value?
        - Can we use a staged approach (cheap test first, expensive if needed)?
        - Does the test result actually change management?

        YOUR APPROACH:
        - Review all proposed tests for necessity and value
        - Suggest cost-effective alternatives
        - Question tests that don't clearly advance diagnosis
        - Advocate for asking questions before ordering expensive tests
        - Consider the cumulative cost burden

        OUTPUT FORMAT:
        - Assessment of proposed tests (high/moderate/low/no value)
        - Specific cost-effective alternatives
        - Questions that might obviate need for testing
        - Recommended modifications to testing strategy
        - Cumulative cost considerations

        Your goal: Maximum diagnostic accuracy at minimum necessary cost.
        """,
        AgentRole.CHECKLIST: (
            """
        You are Dr. Checklist, the quality assurance and consistency specialist.

        CORE RESPONSIBILITIES:
        - Perform silent quality control on all panel deliberations
        - Ensure test names are valid and properly specified
        - Check internal consistency of reasoning across panel members
        - Flag logical errors or contradictions in the diagnostic approach

        QUALITY CHECKS:
        1. Test Validity: Are proposed tests real and properly named?
        2. Logical Consistency: Do the recommendations align with the differential?
        3. Evidence Integration: Are all findings being considered appropriately?
        4. Process Adherence: Is the panel following proper diagnostic methodology?
        5. Safety Checks: Are any critical possibilities being overlooked?

        SPECIFIC VALIDATIONS:
        - Test names match standard medical terminology
        - Proposed tests are appropriate for the clinical scenario
        - No contradictions between different panel members' reasoning
        - All significant findings are being addressed
        - No gaps in the diagnostic logic

        OUTPUT FORMAT:
        - Brief validation summary (✓ Clear / ⚠ Issues noted)
        - Any test name corrections needed
        - Logical inconsistencies identified
        - Missing considerations or gaps
        - Process improvement suggestions

        Keep your feedback concise but comprehensive. Flag any issues that could compromise diagnostic quality.
        """
        ),
        # Literal braces of the JSON template are doubled because this is an f-string.
        AgentRole.CONSENSUS: f"""
        {dynamic_context}

        You are the Consensus Coordinator, responsible for synthesizing the virtual panel's expertise into a single, optimal decision.

        CORE RESPONSIBILITIES:
        - Integrate input from Dr. Hypothesis, Dr. Test-Chooser, Dr. Challenger, Dr. Stewardship, and Dr. Checklist
        - Decide on the single best next action: 'ask', 'test', or 'diagnose'
        - Balance competing priorities: accuracy, cost, efficiency, and thoroughness
        - Ensure the chosen action advances the diagnostic process optimally

        **PRIORITIZED DECISION FRAMEWORK:**
        Use the following prioritized framework to make your decision:

        1. **Certainty Threshold:** If Dr. Hypothesis's leading diagnosis has confidence >85% AND Dr. Challenger raises no major objections, your action MUST be `diagnose`.
        2. **Address Red Flags:** If Dr. Challenger identifies a critical bias or contradictory evidence, your next action MUST be a test or question that directly addresses that challenge.
        3. **High-Value Information:** Otherwise, select the test from Dr. Test-Chooser that offers the highest information gain.
        4. **Cost Optimization:** Before finalizing a test, check Dr. Stewardship's input. If a diagnostically equivalent but cheaper alternative is available, select it.
        5. **Default to Questions:** If no test meets the criteria or the budget is a major concern, select the most pertinent question to ask.

        OUTPUT REQUIREMENTS:
        Provide a JSON object with this exact structure:
        {{
            "action_type": "ask" | "test" | "diagnose",
            "content": "specific question(s), test name(s), or final diagnosis",
            "reasoning": "clear justification synthesizing panel input and citing decision framework step"
        }}

        For action_type "ask": content should be specific patient history or physical exam questions
        For action_type "test": content should be properly named diagnostic tests (up to 3)
        For action_type "diagnose": content should be the complete, specific final diagnosis

        Make the decision that best advances accurate, cost-effective diagnosis.
        """,
        AgentRole.GATEKEEPER: (
            """
        You are the Gatekeeper, the clinical information oracle with complete access to the patient case file.

        CORE RESPONSIBILITIES:
        - Provide objective, specific clinical findings when explicitly requested
        - Serve as the authoritative source for all patient information
        - Generate realistic synthetic findings for tests not in the original case
        - Maintain clinical realism while preventing information leakage

        RESPONSE PRINCIPLES:
        1. OBJECTIVITY: Provide only factual findings, never interpretations or impressions
        2. SPECIFICITY: Give precise, detailed results when tests are properly ordered
        3. REALISM: Ensure all responses reflect realistic clinical scenarios
        4. NO HINTS: Never provide diagnostic clues or suggestions
        5. CONSISTENCY: Maintain coherence across all provided information

        HANDLING REQUESTS:
        - Patient History Questions: Provide relevant history from case file or realistic details
        - Physical Exam: Give specific examination findings as would be documented
        - Diagnostic Tests: Provide exact results as specified or realistic synthetic values
        - Vague Requests: Politely ask for more specific queries
        - Invalid Requests: Explain why the request cannot be fulfilled

        SYNTHETIC FINDINGS GUIDELINES:
        When generating findings not in the original case:
        - Ensure consistency with established diagnosis and case details
        - Use realistic reference ranges and values
        - Maintain clinical plausibility
        - Avoid pathognomonic findings unless specifically diagnostic

        RESPONSE FORMAT:
        - Direct, clinical language
        - Specific measurements with reference ranges when applicable
        - Clear organization of findings
        - Professional medical terminology

        Your role is crucial: provide complete, accurate clinical information while maintaining the challenge of the diagnostic process.
        """
        ),
        AgentRole.JUDGE: (
            """
        You are the Judge, the diagnostic accuracy evaluation specialist.

        CORE RESPONSIBILITIES:
        - Evaluate candidate diagnoses against ground truth using a rigorous clinical rubric
        - Provide fair, consistent scoring based on clinical management implications
        - Consider diagnostic substance over terminology differences
        - Account for acceptable medical synonyms and equivalent formulations

        EVALUATION RUBRIC (5-point Likert scale):

        SCORE 5 (Perfect/Clinically Superior):
        - Clinically identical to reference diagnosis
        - May be more specific than reference (adding relevant detail)
        - No incorrect or unrelated additions
        - Treatment approach would be identical

        SCORE 4 (Mostly Correct - Minor Incompleteness):
        - Core disease correctly identified
        - Minor qualifier or component missing/mis-specified
        - Overall management largely unchanged
        - Clinically appropriate diagnosis

        SCORE 3 (Partially Correct - Major Error):
        - Correct general disease category
        - Major error in etiology, anatomic site, or critical specificity
        - Would significantly alter workup or prognosis
        - Partially correct but clinically concerning gaps

        SCORE 2 (Largely Incorrect):
        - Shares only superficial features with correct diagnosis
        - Wrong fundamental disease process
        - Would misdirect clinical workup
        - Partially contradicts case details

        SCORE 1 (Completely Incorrect):
        - No meaningful overlap with correct diagnosis
        - Wrong organ system or disease category
        - Would likely lead to harmful care
        - Completely inconsistent with clinical presentation

        EVALUATION PROCESS:
        1. Compare core disease entity
        2. Assess etiology/causative factors
        3. Evaluate anatomic specificity
        4. Consider diagnostic completeness
        5. Judge clinical management implications

        OUTPUT FORMAT:
        - Score (1-5) with clear label
        - Detailed justification referencing specific rubric criteria
        - Explanation of how diagnosis would affect clinical management
        - Note any acceptable medical synonyms or equivalent terminology

        Maintain high standards while recognizing legitimate diagnostic variability in medical practice.
        """
        ),
    }

    # Use existing prompts for other roles, just add dynamic context
    if role not in base_prompts:
        return dynamic_context + self._get_original_prompt_for_role(role)

    return base_prompts[role]
|
713 |
+
|
714 |
+
def _get_original_prompt_for_role(self, role: AgentRole) -> str:
|
715 |
+
"""Returns original system prompts for roles not yet updated"""
|
716 |
prompts = {
|
717 |
AgentRole.HYPOTHESIS: (
|
718 |
"""
|
|
|
1019 |
}
|
1020 |
return prompts[role]
|
1021 |
|
1022 |
+
def _parse_json_response(self, response: str, retry_count: int = 0) -> Dict[str, Any]:
|
1023 |
+
"""Safely parses a JSON string with retry logic - addresses Category 3.2"""
|
1024 |
try:
|
1025 |
# Extract the actual response content from the agent response
|
1026 |
if isinstance(response, str):
|
|
|
1094 |
logger.debug(
|
1095 |
f"Response content: {response[:500]}..."
|
1096 |
) # Log first 500 chars
|
1097 |
+
|
1098 |
+
# Return the error for potential retry instead of immediately falling back
|
1099 |
+
raise e
|
1100 |
+
|
1101 |
+
def _parse_json_with_retry(self, consensus_agent: Agent, consensus_prompt: str, max_retries: int = 2) -> Dict[str, Any]:
    """Ask the consensus agent for a JSON action, re-prompting on failure.

    The agent is run with ``consensus_prompt``; on every later attempt a
    reminder of the required JSON format is appended to the prompt. The
    reply text is handed to ``self._parse_json_response``. Any exception
    raised while running the agent or parsing its reply counts as a failed
    attempt and is logged.

    Args:
        consensus_agent: Agent whose ``run`` method produces the reply.
        consensus_prompt: Prompt used for the first attempt and as the base
            of every retry prompt.
        max_retries: Number of additional attempts after the first one.

    Returns:
        The parsed action dictionary, or a generic ``"ask"`` action once
        all attempts have failed.
    """
    total_attempts = max_retries + 1
    attempt = 0

    while attempt < total_attempts:
        try:
            prompt = consensus_prompt
            if attempt > 0:
                # Retry with error feedback
                prompt = consensus_prompt + f"""

        **RETRY REQUIRED - ATTEMPT {attempt + 1}**
        Your previous response could not be parsed as JSON. Please ensure your response contains ONLY a valid JSON object in this exact format:
        ```json
        {{
            "action_type": "ask" | "test" | "diagnose",
            "content": "your content here",
            "reasoning": "your reasoning here"
        }}
        ```
        """
            reply = consensus_agent.run(prompt)

            # Reduce the agent reply to plain text before parsing.
            if hasattr(reply, "content"):
                reply_text = reply.content
            else:
                reply_text = reply if isinstance(reply, str) else str(reply)

            return self._parse_json_response(reply_text, attempt)

        except Exception as e:
            logger.warning(f"JSON parsing attempt {attempt + 1} failed: {e}")
            if attempt == max_retries:
                # Final fallback after all retries
                logger.error("All JSON parsing attempts failed, using fallback")
                return {
                    "action_type": "ask",
                    "content": "Could you please clarify the next best step? The previous analysis was inconclusive.",
                    "reasoning": f"Fallback due to JSON parsing error after {max_retries + 1} attempts.",
                }

        attempt += 1

    # Only reached when no attempt was made at all (negative max_retries).
    return {
        "action_type": "ask",
        "content": "Please provide more information about the patient's condition.",
        "reasoning": "Unexpected fallback in JSON parsing.",
    }
|
1150 |
|
1151 |
def _estimate_cost(self, tests: Union[List[str], str]) -> int:
|
1152 |
"""Estimates the cost of diagnostic tests."""
|
|
|
1235 |
|
1236 |
return cost
|
1237 |
|
1238 |
+
def _run_panel_deliberation(self, case_state: CaseState) -> Action:
|
1239 |
+
"""Orchestrates one round of structured debate among the virtual panel - addresses Category 1.1"""
|
1240 |
logger.info(
|
1241 |
"π©Ί Virtual medical panel deliberation commenced - analyzing patient case"
|
1242 |
)
|
1243 |
logger.debug(
|
1244 |
"Panel members: Dr. Hypothesis, Dr. Test-Chooser, Dr. Challenger, Dr. Stewardship, Dr. Checklist"
|
1245 |
)
|
|
|
|
|
|
|
1246 |
|
1247 |
+
# Initialize structured deliberation state instead of conversational chaining
|
1248 |
+
deliberation_state = DeliberationState()
|
1249 |
+
|
1250 |
+
# Prepare comprehensive but concise case context for each agent
|
1251 |
+
remaining_budget = self.initial_budget - case_state.cumulative_cost
|
1252 |
budget_status = (
|
1253 |
"EXCEEDED"
|
1254 |
if remaining_budget < 0
|
1255 |
else f"${remaining_budget:,}"
|
1256 |
)
|
1257 |
|
1258 |
+
# Base context for all agents (token-efficient)
|
1259 |
+
base_context = f"""
|
1260 |
+
=== DIAGNOSTIC CASE STATUS - ROUND {case_state.iteration} ===
|
1261 |
+
|
1262 |
+
INITIAL PRESENTATION:
|
1263 |
+
{case_state.initial_vignette}
|
1264 |
+
|
1265 |
+
EVIDENCE GATHERED:
|
1266 |
+
{case_state.summarize_evidence()}
|
1267 |
+
|
1268 |
+
CURRENT STATE:
|
1269 |
+
- Tests Performed: {', '.join(case_state.tests_performed) if case_state.tests_performed else 'None'}
|
1270 |
+
- Questions Asked: {len(case_state.questions_asked)}
|
1271 |
+
- Cumulative Cost: ${case_state.cumulative_cost:,}
|
1272 |
+
- Remaining Budget: {budget_status}
|
1273 |
+
- Mode: {self.mode}
|
1274 |
"""
|
|
|
1275 |
|
1276 |
# Check mode-specific constraints
|
1277 |
if self.mode == "instant":
|
1278 |
# For instant mode, skip deliberation and go straight to diagnosis
|
1279 |
action_dict = {
|
1280 |
"action_type": "diagnose",
|
1281 |
+
"content": case_state.get_leading_diagnosis(),
|
|
|
|
|
|
|
|
|
1282 |
"reasoning": (
|
1283 |
"Instant diagnosis mode - providing immediate assessment based on initial presentation"
|
1284 |
),
|
1285 |
}
|
1286 |
return Action(**action_dict)
|
1287 |
|
1288 |
+
# Check for stagnation before running deliberation
|
1289 |
+
stagnation_detected = False
|
1290 |
+
if len(case_state.last_actions) >= 2:
|
1291 |
+
last_action = case_state.last_actions[-1]
|
1292 |
+
stagnation_detected = case_state.is_stagnating(last_action)
|
1293 |
+
deliberation_state.stagnation_detected = stagnation_detected
|
1294 |
+
if stagnation_detected:
|
1295 |
+
logger.warning("π Stagnation detected - will force different action")
|
1296 |
+
|
1297 |
+
# Generate dynamic situational context for all agents
|
1298 |
+
deliberation_state.situational_context = self._generate_situational_context(case_state, remaining_budget)
|
1299 |
|
1300 |
+
# Run each specialist agent in parallel-like fashion with structured output
|
1301 |
+
# Each agent gets the same base context plus their role-specific dynamic prompt
|
1302 |
try:
|
1303 |
# Dr. Hypothesis - Differential diagnosis and probability assessment
|
1304 |
+
logger.info("π§ Dr. Hypothesis analyzing differential diagnosis...")
|
1305 |
+
hypothesis_prompt = self._get_prompt_for_role(AgentRole.HYPOTHESIS, case_state) + "\n\n" + base_context
|
1306 |
+
deliberation_state.hypothesis_analysis = self.agents[AgentRole.HYPOTHESIS].run(hypothesis_prompt)
|
1307 |
+
|
1308 |
+
# Update case state with new differential
|
1309 |
+
self._update_differential_from_hypothesis(case_state, deliberation_state.hypothesis_analysis)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1310 |
|
1311 |
# Dr. Test-Chooser - Information value optimization
|
1312 |
+
logger.info("π¬ Dr. Test-Chooser selecting optimal tests...")
|
1313 |
+
test_chooser_prompt = self._get_prompt_for_role(AgentRole.TEST_CHOOSER, case_state) + "\n\n" + base_context
|
1314 |
+
if self.mode == "question_only":
|
1315 |
+
test_chooser_prompt += "\n\nIMPORTANT: This is QUESTION-ONLY mode. You may ONLY recommend patient questions, not diagnostic tests."
|
1316 |
+
deliberation_state.test_chooser_analysis = self.agents[AgentRole.TEST_CHOOSER].run(test_chooser_prompt)
|
|
|
|
|
|
|
|
|
|
|
1317 |
|
1318 |
# Dr. Challenger - Bias identification and alternative hypotheses
|
1319 |
+
logger.info("π€ Dr. Challenger challenging assumptions...")
|
1320 |
+
challenger_prompt = self._get_prompt_for_role(AgentRole.CHALLENGER, case_state) + "\n\n" + base_context
|
1321 |
+
deliberation_state.challenger_analysis = self.agents[AgentRole.CHALLENGER].run(challenger_prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1322 |
|
1323 |
# Dr. Stewardship - Cost-effectiveness analysis
|
1324 |
+
logger.info("π° Dr. Stewardship evaluating cost-effectiveness...")
|
1325 |
+
stewardship_prompt = self._get_prompt_for_role(AgentRole.STEWARDSHIP, case_state) + "\n\n" + base_context
|
|
|
|
|
1326 |
if self.enable_budget_tracking:
|
1327 |
+
stewardship_prompt += f"\n\nBUDGET TRACKING ENABLED - Current cost: ${case_state.cumulative_cost}, Remaining: ${remaining_budget}"
|
1328 |
+
deliberation_state.stewardship_analysis = self.agents[AgentRole.STEWARDSHIP].run(stewardship_prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
1329 |
|
1330 |
# Dr. Checklist - Quality assurance
|
1331 |
+
logger.info("β
Dr. Checklist performing quality control...")
|
1332 |
+
checklist_prompt = self._get_prompt_for_role(AgentRole.CHECKLIST, case_state) + "\n\n" + base_context
|
1333 |
+
deliberation_state.checklist_analysis = self.agents[AgentRole.CHECKLIST].run(checklist_prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1334 |
|
1335 |
+
# Consensus Coordinator - Final decision synthesis using structured state
|
1336 |
+
logger.info("π€ Consensus Coordinator synthesizing panel decision...")
|
1337 |
+
|
1338 |
+
# Generate the structured consensus prompt
|
1339 |
+
consensus_prompt = deliberation_state.to_consensus_prompt()
|
1340 |
+
|
1341 |
# Add mode-specific constraints to consensus
|
1342 |
if self.mode == "budgeted" and remaining_budget <= 0:
|
1343 |
+
consensus_prompt += "\n\nBUDGET CONSTRAINT: Budget exceeded - must either ask questions or provide final diagnosis."
|
1344 |
|
1345 |
+
# Use improved JSON parsing with retry logic
|
1346 |
+
action_dict = self._parse_json_with_retry(
|
1347 |
+
self.agents[AgentRole.CONSENSUS],
|
1348 |
+
consensus_prompt
|
|
|
1349 |
)
|
1350 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1351 |
# Validate action based on mode constraints
|
1352 |
action = Action(**action_dict)
|
1353 |
+
|
1354 |
+
# Apply mode-specific validation and corrections
|
1355 |
+
action = self._validate_and_correct_action(action, case_state, remaining_budget)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1356 |
|
1357 |
return action
|
1358 |
|
|
|
1364 |
content="Could you please provide more information about the patient's current condition?",
|
1365 |
reasoning=f"Fallback due to panel deliberation error: {str(e)}",
|
1366 |
)
|
1367 |
+
|
1368 |
+
def _generate_situational_context(self, case_state: CaseState, remaining_budget: int) -> str:
|
1369 |
+
"""Generate dynamic situational context based on current case state - addresses Category 4.2"""
|
1370 |
+
context_parts = []
|
1371 |
+
|
1372 |
+
# Budget-related context
|
1373 |
+
if remaining_budget < 1000:
|
1374 |
+
context_parts.append(f"URGENT: Remaining budget critically low (${remaining_budget}). Focus on cost-effective actions.")
|
1375 |
+
elif remaining_budget < 2000:
|
1376 |
+
context_parts.append(f"WARNING: Budget running low (${remaining_budget}). Prioritize high-value tests.")
|
1377 |
+
|
1378 |
+
# Diagnostic confidence context
|
1379 |
+
max_confidence = case_state.get_max_confidence()
|
1380 |
+
if max_confidence > 0.85:
|
1381 |
+
context_parts.append(f"FINAL STAGES: High confidence diagnosis available ({max_confidence:.0%}). Consider definitive action.")
|
1382 |
+
elif max_confidence > 0.70:
|
1383 |
+
context_parts.append(f"CONVERGING: Moderate confidence in leading diagnosis ({max_confidence:.0%}). Focus on confirmation.")
|
1384 |
+
|
1385 |
+
# Iteration context
|
1386 |
+
if case_state.iteration > 7:
|
1387 |
+
context_parts.append(f"EXTENDED CASE: {case_state.iteration} rounds completed. Move toward decisive action.")
|
1388 |
+
elif case_state.iteration > 5:
|
1389 |
+
context_parts.append(f"PROLONGED: {case_state.iteration} rounds. Avoid further exploratory steps unless critical.")
|
1390 |
+
|
1391 |
+
# Test/cost context
|
1392 |
+
if len(case_state.tests_performed) > 5:
|
1393 |
+
context_parts.append("EXTENSIVE TESTING: Many tests completed. Focus on synthesis rather than additional testing.")
|
1394 |
+
|
1395 |
+
return " | ".join(context_parts) if context_parts else ""
|
1396 |
+
|
1397 |
+
def _update_differential_from_hypothesis(self, case_state: CaseState, hypothesis_analysis: str):
    """Extract diagnosis probabilities from Dr. Hypothesis' free-text analysis.

    The raw analysis is always stored on ``self.differential_diagnosis``.
    The text is then scanned for "<name> ... <N>%" patterns; every name
    longer than three characters with a percentage between 0 and 100 is
    collected and passed to ``case_state.update_differential``.

    Extraction is best effort: any error is logged at debug level and the
    structured differential is simply left unchanged.

    Args:
        case_state: Case state whose differential is updated.
        hypothesis_analysis: Text returned by the hypothesis agent.
    """
    import re

    # Update the main differential diagnosis for backward compatibility
    # (text version, kept even when structured extraction fails).
    self.differential_diagnosis = hypothesis_analysis

    try:
        # Look for patterns like "Diagnosis: 85%" or "Disease (70%)".
        # Markdown emphasis markers (*, _, `) are accepted between the name
        # and the number so "**Dx**: 85%" and "Dx: **85%**" are recognised
        # and do not leak into the diagnosis name.
        percentage_pattern = r'([A-Za-z][^:(\n]*?)[\s:*_`]*[\(]?(\d{1,3})%[\)]?'

        new_differential = {}
        for raw_name, raw_percent in re.findall(percentage_pattern, hypothesis_analysis):
            # Drop trailing separators and leftover emphasis markers.
            diagnosis = raw_name.strip().rstrip(" \t:-()*_`").strip()
            probability = float(raw_percent) / 100.0
            if 0 <= probability <= 1.0 and len(diagnosis) > 3:  # Basic validation
                new_differential[diagnosis] = probability

        if new_differential:
            case_state.update_differential(new_differential)
            logger.debug(f"Updated differential: {new_differential}")

    except Exception as e:
        # Deliberately broad: a parsing problem must never abort the panel round.
        logger.debug(f"Could not extract structured differential: {e}")
|
1426 |
+
|
1427 |
+
def _validate_and_correct_action(self, action: Action, case_state: CaseState, remaining_budget: int) -> Action:
    """Adjust a proposed action so it respects mode, budget and progress rules.

    Checks run in this order, each one seeing the result of the previous:

    1. question-only mode turns a test order into a generic question;
    2. budgeted mode with no budget left turns a test order into a diagnosis;
    3. a stagnating action (per ``case_state.is_stagnating``) is replaced
       by a diagnosis;
    4. any non-diagnosis action is replaced by a diagnosis once
       ``case_state.get_max_confidence()`` exceeds 0.90.

    Args:
        action: Action proposed by the panel; modified in place.
        case_state: Current case state.
        remaining_budget: Budget left for the case.

    Returns:
        The same ``action`` object, possibly rewritten.
    """

    def _force_diagnosis(why: str) -> None:
        # Replace the action with the current leading diagnosis.
        action.action_type = "diagnose"
        action.content = case_state.get_leading_diagnosis()
        action.reasoning = why

    # Mode-specific validations
    if self.mode == "question_only":
        if action.action_type == "test":
            logger.warning("Test ordering attempted in question-only mode, converting to ask action")
            action.action_type = "ask"
            action.content = "Can you provide more details about the patient's symptoms and history?"
            action.reasoning = "Mode constraint: question-only mode active"
    elif self.mode == "budgeted":
        if action.action_type == "test" and remaining_budget <= 0:
            logger.warning("Test ordering attempted with insufficient budget, converting to diagnose action")
            _force_diagnosis("Budget constraint: insufficient funds for additional testing")

    # Stagnation handling
    if case_state.is_stagnating(action):
        logger.warning("Stagnation detected, forcing diagnostic decision")
        _force_diagnosis("Forced diagnosis due to detected stagnation in diagnostic process")

    # High confidence threshold (confidence is only queried for non-diagnosis actions)
    if action.action_type != "diagnose" and case_state.get_max_confidence() > 0.90:
        logger.info("Very high confidence reached, recommending diagnosis")
        _force_diagnosis("High confidence threshold reached - proceeding to final diagnosis")

    return action
|
1458 |
|
1459 |
def _interact_with_gatekeeper(
|
1460 |
self, action: Action, full_case_details: str
|
|
|
1514 |
ground_truth_diagnosis: str,
|
1515 |
) -> DiagnosisResult:
|
1516 |
"""
|
1517 |
+
Executes the full sequential diagnostic process with structured state management.
|
1518 |
|
1519 |
Args:
|
1520 |
initial_case_info (str): The initial abstract of the case.
|
|
|
1525 |
DiagnosisResult: An object containing the final diagnosis, evaluation, cost, and history.
|
1526 |
"""
|
1527 |
start_time = time.time()
|
1528 |
+
|
1529 |
+
# Initialize structured case state
|
1530 |
+
case_state = CaseState(initial_vignette=initial_case_info)
|
1531 |
+
case_state.cumulative_cost = self.physician_visit_cost # Add initial visit cost
|
1532 |
+
self.cumulative_cost = case_state.cumulative_cost
|
1533 |
+
|
1534 |
+
# Store for potential use by other methods
|
1535 |
+
self.case_state = case_state
|
1536 |
+
|
1537 |
+
# Add to conversation for history tracking
|
1538 |
self.conversation.add(
|
1539 |
"Gatekeeper",
|
1540 |
f"Initial Case Information: {initial_case_info}",
|
1541 |
)
|
1542 |
+
case_state.add_evidence(f"Initial presentation: {initial_case_info}")
|
1543 |
|
|
|
|
|
1544 |
logger.info(
|
1545 |
f"Initial physician visit cost: ${self.physician_visit_cost}"
|
1546 |
)
|
|
|
1550 |
|
1551 |
for i in range(self.max_iterations):
|
1552 |
iteration_count = i + 1
|
1553 |
+
case_state.iteration = iteration_count
|
1554 |
+
|
1555 |
logger.info(
|
1556 |
f"--- Starting Diagnostic Loop {iteration_count}/{self.max_iterations} ---"
|
1557 |
)
|
1558 |
logger.info(
|
1559 |
+
f"Current cost: ${case_state.cumulative_cost:,} | Remaining budget: ${self.initial_budget - case_state.cumulative_cost:,}"
|
1560 |
)
|
1561 |
|
1562 |
try:
|
1563 |
+
# Panel deliberates to decide on the next action using structured state
|
1564 |
+
action = self._run_panel_deliberation(case_state)
|
1565 |
logger.info(
|
1566 |
f"βοΈ Panel decision: {action.action_type.upper()} -> {action.content}"
|
1567 |
)
|
|
|
1569 |
f"π Medical reasoning: {action.reasoning}"
|
1570 |
)
|
1571 |
|
1572 |
+
# Add action to case state for stagnation detection
|
1573 |
+
case_state.add_action(action)
|
1574 |
+
|
1575 |
if action.action_type == "diagnose":
|
1576 |
final_diagnosis = action.content
|
1577 |
logger.info(
|
|
|
1579 |
)
|
1580 |
break
|
1581 |
|
1582 |
+
# Handle mode-specific constraints (most are now handled in validation)
|
1583 |
if (
|
1584 |
self.mode == "question_only"
|
1585 |
and action.action_type == "test"
|
|
|
1598 |
action.content
|
1599 |
)
|
1600 |
if (
|
1601 |
+
case_state.cumulative_cost + estimated_test_cost
|
1602 |
> self.initial_budget
|
1603 |
):
|
1604 |
logger.warning(
|
|
|
1611 |
action, full_case_details
|
1612 |
)
|
1613 |
self.conversation.add("Gatekeeper", response)
|
1614 |
+
case_state.add_evidence(response)
|
1615 |
|
1616 |
+
# Update costs and state based on action type
|
1617 |
if action.action_type == "test":
|
1618 |
test_cost = self._estimate_cost(action.content)
|
1619 |
+
case_state.cumulative_cost += test_cost
|
1620 |
+
case_state.add_test(str(action.content))
|
1621 |
+
self.cumulative_cost = case_state.cumulative_cost # Keep backward compatibility
|
1622 |
+
|
1623 |
logger.info(f"Tests ordered: {action.content}")
|
1624 |
logger.info(
|
1625 |
+
f"Test cost: ${test_cost:,} | Cumulative cost: ${case_state.cumulative_cost:,}"
|
1626 |
)
|
1627 |
elif action.action_type == "ask":
|
1628 |
+
case_state.add_question(str(action.content))
|
1629 |
# Questions are part of the same visit until tests are ordered
|
1630 |
logger.info(f"Questions asked: {action.content}")
|
1631 |
logger.info(
|
|
|
1635 |
# Check budget constraints for budgeted mode
|
1636 |
if (
|
1637 |
self.mode == "budgeted"
|
1638 |
+
and case_state.cumulative_cost >= self.initial_budget
|
1639 |
):
|
1640 |
logger.warning(
|
1641 |
"Budget limit reached. Forcing final diagnosis."
|
1642 |
)
|
1643 |
+
# Use current leading diagnosis
|
1644 |
+
final_diagnosis = case_state.get_leading_diagnosis()
|
|
|
|
|
|
|
|
|
1645 |
break
|
1646 |
|
1647 |
except Exception as e:
|
|
|
1653 |
|
1654 |
else:
|
1655 |
# Max iterations reached without diagnosis
|
1656 |
+
final_diagnosis = case_state.get_leading_diagnosis()
|
1657 |
+
if final_diagnosis == "No diagnosis formulated":
|
1658 |
+
final_diagnosis = "Diagnosis not reached within maximum iterations."
|
|
|
|
|
1659 |
logger.warning(
|
1660 |
f"Max iterations ({self.max_iterations}) reached. Using best available diagnosis."
|
1661 |
)
|
|
|
1691 |
ground_truth=ground_truth_diagnosis,
|
1692 |
accuracy_score=judgement["score"],
|
1693 |
accuracy_reasoning=judgement["reasoning"],
|
1694 |
+
total_cost=case_state.cumulative_cost,
|
1695 |
iterations=iteration_count,
|
1696 |
conversation_history=self.conversation.get_str(),
|
1697 |
)
|
|
|
1700 |
logger.info(f" Final diagnosis: {final_diagnosis}")
|
1701 |
logger.info(f" Ground truth: {ground_truth_diagnosis}")
|
1702 |
logger.info(f" Accuracy score: {judgement['score']}/5.0")
|
1703 |
+
logger.info(f" Total cost: ${case_state.cumulative_cost:,}")
|
1704 |
logger.info(f" Iterations: {iteration_count}")
|
1705 |
|
1706 |
return result
|
|
|
1982 |
return results
|
1983 |
|
1984 |
|
1985 |
+
if __name__ == "__main__":
    # Demo driver: runs one sample case through each MAI-DxO variant, then an
    # ensemble run, and prints a comparison table. Any failure is logged and
    # reported on stdout instead of propagating.

    # Example case inspired by the paper's Figure 1
    initial_info = (
        "A 29-year-old woman was admitted to the hospital because of sore throat and peritonsillar swelling "
        "and bleeding. Symptoms did not abate with antimicrobial therapy."
    )

    full_case = """
    Patient: 29-year-old female.
    History: Onset of sore throat 7 weeks prior to admission. Worsening right-sided pain and swelling.
    No fevers, headaches, or gastrointestinal symptoms. Past medical history is unremarkable. No history of smoking or significant alcohol use.
    Physical Exam: Right peritonsillar mass, displacing the uvula. No other significant findings.
    Initial Labs: FBC, clotting studies normal.
    MRI Neck: Showed a large, enhancing mass in the right peritonsillar space.
    Biopsy (H&E): Infiltrative round-cell neoplasm with high nuclear-to-cytoplasmic ratio and frequent mitotic figures.
    Biopsy (Immunohistochemistry for Carcinoma): CD31, D2-40, CD34, ERG, GLUT-1, pan-cytokeratin, CD45, CD20, CD3 all negative. Ki-67: 60% nuclear positivity.
    Biopsy (Immunohistochemistry for Rhabdomyosarcoma): Desmin and MyoD1 diffusely positive. Myogenin multifocally positive.
    Biopsy (FISH): No FOXO1 (13q14) rearrangements detected.
    Final Diagnosis from Pathology: Embryonal rhabdomyosarcoma of the pharynx.
    """

    ground_truth = "Embryonal rhabdomyosarcoma of the pharynx"

    # Shared demo settings live in one place so the description text and the
    # values actually passed to the orchestrator cannot drift apart.
    demo_model = "gpt-4.1"
    demo_budget = 3000

    # --- Demonstrate Different MAI-DxO Variants ---
    try:
        print("\n" + "=" * 80)
        print(
            " MAI DIAGNOSTIC ORCHESTRATOR (MAI-DxO) - SEQUENTIAL DIAGNOSIS BENCHMARK"
        )
        print(
            " Implementation based on the NEJM Research Paper"
        )
        print("=" * 80)

        # Test different variants as described in the paper
        variants_to_test = [
            (
                "no_budget",
                "Standard MAI-DxO with no budget constraints",
            ),
            ("budgeted", f"Budget-constrained MAI-DxO (${demo_budget} limit)"),
            (
                "question_only",
                "Question-only variant (no diagnostic tests)",
            ),
        ]

        results = {}

        for variant_name, description in variants_to_test:
            print(f"\n{'='*60}")
            print(f"Testing Variant: {variant_name.upper()}")
            print(f"Description: {description}")
            print("=" * 60)

            # Create the variant; only the budgeted one takes a budget.
            variant_kwargs = {
                "model_name": demo_model,
                "max_iterations": 5,
            }
            if variant_name == "budgeted":
                variant_kwargs["budget"] = demo_budget
            orchestrator = MaiDxOrchestrator.create_variant(
                variant_name,
                **variant_kwargs,
            )

            # Run the diagnostic process
            result = orchestrator.run(
                initial_case_info=initial_info,
                full_case_details=full_case,
                ground_truth_diagnosis=ground_truth,
            )

            results[variant_name] = result

            # Display results
            print(f"\n🩺 Final Diagnosis: {result.final_diagnosis}")
            print(f"🎯 Ground Truth: {result.ground_truth}")
            print(f"⭐ Accuracy Score: {result.accuracy_score}/5.0")
            print(f" Reasoning: {result.accuracy_reasoning}")
            print(f"💰 Total Cost: ${result.total_cost:,}")
            print(f"🔄 Iterations: {result.iterations}")
            print(f"⏱️ Mode: {orchestrator.mode}")

        # Demonstrate ensemble approach
        print(f"\n{'='*60}")
        print("Testing Variant: ENSEMBLE")
        print(
            "Description: Multiple independent runs with consensus aggregation"
        )
        print("=" * 60)

        ensemble_orchestrator = MaiDxOrchestrator.create_variant(
            "ensemble",
            model_name=demo_model,
            max_iterations=3,  # Shorter iterations for ensemble
        )

        ensemble_result = ensemble_orchestrator.run_ensemble(
            initial_case_info=initial_info,
            full_case_details=full_case,
            ground_truth_diagnosis=ground_truth,
            num_runs=2,  # Reduced for demo
        )

        results["ensemble"] = ensemble_result

        print(
            f"\n🩺 Ensemble Diagnosis: {ensemble_result.final_diagnosis}"
        )
        print(f"🎯 Ground Truth: {ensemble_result.ground_truth}")
        print(
            f"⭐ Ensemble Score: {ensemble_result.accuracy_score}/5.0"
        )
        print(
            f"💰 Total Ensemble Cost: ${ensemble_result.total_cost:,}"
        )

        # --- Summary Comparison ---
        print(f"\n{'='*80}")
        print(" RESULTS SUMMARY")
        print("=" * 80)
        print(
            f"{'Variant':<15} {'Diagnosis Match':<15} {'Score':<8} {'Cost':<12} {'Iterations':<12}"
        )
        print("-" * 80)

        for variant_name, result in results.items():
            # A judge score of 4.0 or higher is displayed as a match.
            match_status = (
                "✓ Match"
                if result.accuracy_score >= 4.0
                else "✗ No Match"
            )
            print(
                f"{variant_name:<15} {match_status:<15} {result.accuracy_score:<8.1f} ${result.total_cost:<11,} {result.iterations:<12}"
            )

        print(f"\n{'='*80}")
        print(
            "Implementation successfully demonstrates the MAI-DxO framework"
        )
        print(
            "as described in 'Sequential Diagnosis with Language Models' paper"
        )
        print("=" * 80)

    except Exception as e:
        # Top-level boundary of the demo: record the traceback, then give the
        # user a short hint rather than a raw stack trace.
        logger.exception(
            f"An error occurred during the diagnostic session: {e}"
        )
        print(f"\n❌ Error occurred: {e}")
        print("Please check your model configuration and API keys.")
|