harshalmore31 committed on
Commit
40b5db7
·
1 Parent(s): 5eb85c3

Fix default model name to a valid GPT-4 Turbo identifier (gpt-4-1106-preview) and raise per-agent max_tokens limits for improved diagnostic capabilities

Browse files
Files changed (1) hide show
  1. mai_dx/main.py +241 -105
mai_dx/main.py CHANGED
@@ -261,7 +261,7 @@ class MaiDxOrchestrator:
261
 
262
  def __init__(
263
  self,
264
- model_name: str = "gpt-4.1", # Updated to GPT-4.1 as requested (GPT-4 Turbo)
265
  max_iterations: int = 10,
266
  initial_budget: int = 10000,
267
  mode: str = "no_budget", # "instant", "question_only", "budgeted", "no_budget", "ensemble"
@@ -332,18 +332,18 @@ class MaiDxOrchestrator:
332
  )
333
 
334
  def _get_agent_max_tokens(self, role: AgentRole) -> int:
335
- """Get max_tokens for each agent based on their role - addresses token optimization"""
336
  token_limits = {
337
- AgentRole.HYPOTHESIS: 800, # Needs space for differential diagnosis
338
- AgentRole.TEST_CHOOSER: 600, # Test recommendations
339
- AgentRole.CHALLENGER: 700, # Bias identification and alternatives
340
- AgentRole.STEWARDSHIP: 500, # Cost analysis
341
- AgentRole.CHECKLIST: 400, # Brief validation
342
- AgentRole.CONSENSUS: 300, # Just JSON output
343
- AgentRole.GATEKEEPER: 1000, # Detailed clinical findings
344
- AgentRole.JUDGE: 600, # Scoring and reasoning
345
  }
346
- return token_limits.get(role, 500)
347
 
348
  def _init_agents(self) -> None:
349
  """Initializes all required agents with their specific roles and prompts."""
@@ -409,10 +409,11 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
409
  You are Dr. Hypothesis, a specialist in maintaining differential diagnoses. Your role is critical to the diagnostic process.
410
 
411
  CORE RESPONSIBILITIES:
412
- - Maintain a probability-ranked differential diagnosis with the top 3 most likely conditions
413
  - Update probabilities using Bayesian reasoning after each new finding
414
  - Consider both common and rare diseases appropriate to the clinical context
415
  - Explicitly track how new evidence changes your diagnostic thinking
 
416
 
417
  APPROACH:
418
  1. Start with the most likely diagnoses based on presenting symptoms
@@ -421,19 +422,23 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
421
  - Whether it suggests new diagnoses to consider
422
  - How it changes the relative probabilities
423
  3. Always explain your Bayesian reasoning clearly
 
424
 
425
- OUTPUT FORMAT:
426
  Provide your updated differential diagnosis with:
427
- - Top 3 diagnoses with probability estimates (percentages)
428
- - Brief rationale for each
429
  - Key evidence supporting each hypothesis
430
  - Evidence that contradicts or challenges each hypothesis
 
 
431
 
432
- Remember: Your differential drives the entire diagnostic process. Be thorough, evidence-based, and adaptive.
433
  """,
434
 
435
- AgentRole.TEST_CHOOSER: (
436
- """
 
437
  You are Dr. Test-Chooser, a specialist in diagnostic test selection and information theory.
438
 
439
  CORE RESPONSIBILITIES:
@@ -441,31 +446,38 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
441
  - Optimize for information value, not just clinical reasonableness
442
  - Consider test characteristics: sensitivity, specificity, positive/negative predictive values
443
  - Balance diagnostic yield with patient burden and resource utilization
 
444
 
445
  SELECTION CRITERIA:
446
  1. Information Value: How much will this test change diagnostic probabilities?
447
  2. Discriminatory Power: How well does it distinguish between competing hypotheses?
448
  3. Clinical Impact: Will the result meaningfully alter management?
449
  4. Sequential Logic: What should we establish first before ordering more complex tests?
 
450
 
451
  APPROACH:
452
  - For each proposed test, explicitly state which hypotheses it will help confirm or exclude
453
  - Consider both positive and negative results and their implications
454
  - Think about test sequences (e.g., basic labs before advanced imaging)
455
  - Avoid redundant tests that won't add new information
 
456
 
457
- OUTPUT FORMAT:
458
  For each recommended test:
459
- - Test name (be specific)
460
  - Primary hypotheses it will help evaluate
461
- - Expected information gain
462
  - How results will change management decisions
 
 
 
463
 
464
- Focus on tests that will most efficiently narrow the differential diagnosis.
465
- """
466
- ),
467
- AgentRole.CHALLENGER: (
468
- """
 
469
  You are Dr. Challenger, the critical thinking specialist and devil's advocate.
470
 
471
  CORE RESPONSIBILITIES:
@@ -473,6 +485,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
473
  - Highlight contradictory evidence that might be overlooked
474
  - Propose alternative hypotheses and falsifying tests
475
  - Guard against premature diagnostic closure
 
476
 
477
  COGNITIVE BIASES TO WATCH FOR:
478
  1. Anchoring: Over-reliance on initial impressions
@@ -480,6 +493,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
480
  3. Availability bias: Overestimating probability of recently seen conditions
481
  4. Representativeness: Ignoring base rates and prevalence
482
  5. Search satisficing: Stopping at "good enough" explanations
 
483
 
484
  YOUR APPROACH:
485
  - Ask "What else could this be?" and "What doesn't fit?"
@@ -487,19 +501,23 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
487
  - Propose tests that could disprove the leading hypothesis
488
  - Consider rare diseases when common ones don't fully explain the picture
489
  - Advocate for considering multiple conditions simultaneously
 
490
 
491
- OUTPUT FORMAT:
492
  - Specific biases you've identified in the current reasoning
493
  - Evidence that contradicts the leading hypotheses
494
- - Alternative diagnoses to consider
495
  - Tests that could falsify current assumptions
496
  - Red flags or concerning patterns that need attention
 
 
497
 
498
- Be constructively critical - your role is to strengthen diagnostic accuracy through rigorous challenge.
499
- """
500
- ),
501
- AgentRole.STEWARDSHIP: (
502
- """
 
503
  You are Dr. Stewardship, the resource optimization and cost-effectiveness specialist.
504
 
505
  CORE RESPONSIBILITIES:
@@ -507,6 +525,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
507
  - Advocate for cheaper alternatives when diagnostically equivalent
508
  - Challenge low-yield, expensive tests
509
  - Balance diagnostic thoroughness with resource stewardship
 
510
 
511
  COST-VALUE FRAMEWORK:
512
  1. High-Value Tests: Low cost, high diagnostic yield, changes management
@@ -519,33 +538,39 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
519
  - Is there a less expensive test with similar diagnostic value?
520
  - Can we use a staged approach (cheap test first, expensive if needed)?
521
  - Does the test result actually change management?
 
522
 
523
  YOUR APPROACH:
524
  - Review all proposed tests for necessity and value
525
- - Suggest cost-effective alternatives
526
  - Question tests that don't clearly advance diagnosis
527
  - Advocate for asking questions before ordering expensive tests
528
- - Consider the cumulative cost burden
 
529
 
530
- OUTPUT FORMAT:
531
- - Assessment of proposed tests (high/moderate/low/no value)
532
- - Specific cost-effective alternatives
533
  - Questions that might obviate need for testing
534
  - Recommended modifications to testing strategy
535
- - Cumulative cost considerations
 
 
536
 
537
- Your goal: Maximum diagnostic accuracy at minimum necessary cost.
538
- """
539
- ),
540
- AgentRole.CHECKLIST: (
541
- """
 
542
  You are Dr. Checklist, the quality assurance and consistency specialist.
543
 
544
  CORE RESPONSIBILITIES:
545
- - Perform silent quality control on all panel deliberations
546
  - Ensure test names are valid and properly specified
547
  - Check internal consistency of reasoning across panel members
548
  - Flag logical errors or contradictions in the diagnostic approach
 
549
 
550
  QUALITY CHECKS:
551
  1. Test Validity: Are proposed tests real and properly named?
@@ -553,6 +578,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
553
  3. Evidence Integration: Are all findings being considered appropriately?
554
  4. Process Adherence: Is the panel following proper diagnostic methodology?
555
  5. Safety Checks: Are any critical possibilities being overlooked?
 
556
 
557
  SPECIFIC VALIDATIONS:
558
  - Test names match standard medical terminology
@@ -560,17 +586,20 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
560
  - No contradictions between different panel members' reasoning
561
  - All significant findings are being addressed
562
  - No gaps in the diagnostic logic
563
-
564
- OUTPUT FORMAT:
565
- - Brief validation summary (✓ Clear / Issues noted)
566
- - Any test name corrections needed
567
- - Logical inconsistencies identified
568
- - Missing considerations or gaps
569
- - Process improvement suggestions
570
-
571
- Keep your feedback concise but comprehensive. Flag any issues that could compromise diagnostic quality.
572
- """
573
- ),
 
 
 
574
  AgentRole.CONSENSUS: f"""
575
  {dynamic_context}
576
 
@@ -591,8 +620,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
591
  4. **Cost Optimization:** Before finalizing a test, check Dr. Stewardship's input. If a diagnostically equivalent but cheaper alternative is available, select it.
592
  5. **Default to Questions:** If no test meets the criteria or the budget is a major concern, select the most pertinent question to ask.
593
 
594
- OUTPUT REQUIREMENTS:
595
- Provide a JSON object with this exact structure:
596
  {{
597
  "action_type": "ask" | "test" | "diagnose",
598
  "content": "specific question(s), test name(s), or final diagnosis",
@@ -603,10 +631,12 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
603
  For action_type "test": content should be properly named diagnostic tests (up to 3)
604
  For action_type "diagnose": content should be the complete, specific final diagnosis
605
 
606
- Make the decision that best advances accurate, cost-effective diagnosis.
607
  """,
608
- AgentRole.GATEKEEPER: (
609
- """
 
 
610
  You are the Gatekeeper, the clinical information oracle with complete access to the patient case file.
611
 
612
  CORE RESPONSIBILITIES:
@@ -614,6 +644,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
614
  - Serve as the authoritative source for all patient information
615
  - Generate realistic synthetic findings for tests not in the original case
616
  - Maintain clinical realism while preventing information leakage
 
617
 
618
  RESPONSE PRINCIPLES:
619
  1. OBJECTIVITY: Provide only factual findings, never interpretations or impressions
@@ -621,6 +652,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
621
  3. REALISM: Ensure all responses reflect realistic clinical scenarios
622
  4. NO HINTS: Never provide diagnostic clues or suggestions
623
  5. CONSISTENCY: Maintain coherence across all provided information
 
624
 
625
  HANDLING REQUESTS:
626
  - Patient History Questions: Provide relevant history from case file or realistic details
@@ -635,18 +667,21 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
635
  - Use realistic reference ranges and values
636
  - Maintain clinical plausibility
637
  - Avoid pathognomonic findings unless specifically diagnostic
 
638
 
639
- RESPONSE FORMAT:
640
- - Direct, clinical language
641
  - Specific measurements with reference ranges when applicable
642
- - Clear organization of findings
643
- - Professional medical terminology
 
644
 
645
- Your role is crucial: provide complete, accurate clinical information while maintaining the challenge of the diagnostic process.
646
- """
647
- ),
648
- AgentRole.JUDGE: (
649
- """
 
650
  You are the Judge, the diagnostic accuracy evaluation specialist.
651
 
652
  CORE RESPONSIBILITIES:
@@ -654,6 +689,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
654
  - Provide fair, consistent scoring based on clinical management implications
655
  - Consider diagnostic substance over terminology differences
656
  - Account for acceptable medical synonyms and equivalent formulations
 
657
 
658
  EVALUATION RUBRIC (5-point Likert scale):
659
 
@@ -694,15 +730,16 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
694
  4. Consider diagnostic completeness
695
  5. Judge clinical management implications
696
 
697
- OUTPUT FORMAT:
698
- - Score (1-5) with clear label
699
- - Detailed justification referencing specific rubric criteria
700
- - Explanation of how diagnosis would affect clinical management
701
  - Note any acceptable medical synonyms or equivalent terminology
 
 
702
 
703
- Maintain high standards while recognizing legitimate diagnostic variability in medical practice.
704
- """
705
- ),
706
  }
707
 
708
  # Use existing prompts for other roles, just add dynamic context
@@ -1022,6 +1059,22 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
1022
  def _parse_json_response(self, response: str, retry_count: int = 0) -> Dict[str, Any]:
1023
  """Safely parses a JSON string with retry logic - addresses Category 3.2"""
1024
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1025
  # Extract the actual response content from the agent response
1026
  if isinstance(response, str):
1027
  # Handle markdown-formatted JSON
@@ -1070,15 +1123,16 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
1070
  # Try to extract JSON from text that might contain other content
1071
  import re
1072
 
1073
- # Look for JSON pattern in the text
1074
- json_pattern = r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}"
1075
- matches = re.findall(
1076
- json_pattern, response, re.DOTALL
1077
- )
1078
 
1079
  for match in matches:
1080
  try:
1081
- return json.loads(match)
 
 
 
1082
  except json.JSONDecodeError:
1083
  continue
1084
 
@@ -1098,7 +1152,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
1098
  # Return the error for potential retry instead of immediately falling back
1099
  raise e
1100
 
1101
- def _parse_json_with_retry(self, consensus_agent: Agent, consensus_prompt: str, max_retries: int = 2) -> Dict[str, Any]:
1102
  """Parse JSON with retry logic for robustness - addresses Category 3.2"""
1103
  for attempt in range(max_retries + 1):
1104
  try:
@@ -1106,28 +1160,44 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
1106
  response = consensus_agent.run(consensus_prompt)
1107
  else:
1108
  # Retry with error feedback
1109
- retry_prompt = consensus_prompt + f"""
1110
-
1111
- **RETRY REQUIRED - ATTEMPT {attempt + 1}**
1112
- Your previous response could not be parsed as JSON. Please ensure your response contains ONLY a valid JSON object in this exact format:
1113
- ```json
 
1114
  {{
1115
  "action_type": "ask" | "test" | "diagnose",
1116
  "content": "your content here",
1117
  "reasoning": "your reasoning here"
1118
  }}
1119
- ```
 
 
1120
  """
1121
  response = consensus_agent.run(retry_prompt)
1122
 
1123
- # Extract the actual text content from agent response
1124
- if hasattr(response, "content"):
 
1125
  response_text = response.content
 
 
 
 
 
 
 
 
1126
  elif isinstance(response, str):
1127
  response_text = response
1128
  else:
1129
  response_text = str(response)
1130
 
 
 
 
 
1131
  return self._parse_json_response(response_text, attempt)
1132
 
1133
  except Exception as e:
@@ -1492,19 +1562,82 @@ CURRENT STATE:
1492
  Please evaluate the following diagnosis.
1493
  Ground Truth: "{ground_truth}"
1494
  Candidate Diagnosis: "{candidate_diagnosis}"
 
 
 
 
1495
  """
1496
  response = judge.run(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1497
 
1498
- # Simple parsing for demonstration; a more robust solution would use structured output.
1499
  try:
1500
- score = float(
1501
- response.split("Score:")[1].split("/")[0].strip()
1502
- )
1503
- reasoning = response.split("Justification:")[1].strip()
1504
- except (IndexError, ValueError):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1505
  score = 0.0
1506
- reasoning = "Could not parse judge's response."
1507
 
 
1508
  return {"score": score, "reasoning": reasoning}
1509
 
1510
  def run(
@@ -1885,7 +2018,7 @@ CURRENT STATE:
1885
  "mode": "budgeted",
1886
  "max_iterations": 10,
1887
  "enable_budget_tracking": True,
1888
- "initial_budget": kwargs.get("budget", 5000),
1889
  },
1890
  "no_budget": {
1891
  "mode": "no_budget",
@@ -1963,11 +2096,14 @@ def run_mai_dxo_demo(
1963
  orchestrator = MaiDxOrchestrator.create_variant(
1964
  variant,
1965
  budget=3000,
1966
- model_name="gpt-4.1",
 
1967
  )
1968
  else:
1969
  orchestrator = MaiDxOrchestrator.create_variant(
1970
- variant, model_name="gpt-4.1"
 
 
1971
  )
1972
 
1973
  result = orchestrator.run(
@@ -2042,13 +2178,13 @@ if __name__ == "__main__":
2042
  orchestrator = MaiDxOrchestrator.create_variant(
2043
  variant_name,
2044
  budget=3000,
2045
- model_name="gpt-4.1",
2046
  max_iterations=5,
2047
  )
2048
  else:
2049
  orchestrator = MaiDxOrchestrator.create_variant(
2050
  variant_name,
2051
- model_name="gpt-4.1",
2052
  max_iterations=5,
2053
  )
2054
 
@@ -2080,7 +2216,7 @@ if __name__ == "__main__":
2080
 
2081
  ensemble_orchestrator = MaiDxOrchestrator.create_variant(
2082
  "ensemble",
2083
- model_name="gpt-4.1",
2084
  max_iterations=3, # Shorter iterations for ensemble
2085
  )
2086
 
 
261
 
262
  def __init__(
263
  self,
264
+ model_name: str = "gpt-4-1106-preview", # Fixed: Use valid GPT-4 Turbo model name
265
  max_iterations: int = 10,
266
  initial_budget: int = 10000,
267
  mode: str = "no_budget", # "instant", "question_only", "budgeted", "no_budget", "ensemble"
 
332
  )
333
 
334
  def _get_agent_max_tokens(self, role: AgentRole) -> int:
335
+ """Get max_tokens for each agent based on their role - significantly increased limits"""
336
  token_limits = {
337
+ AgentRole.HYPOTHESIS: 2000, # Increased for comprehensive differential analysis
338
+ AgentRole.TEST_CHOOSER: 1500, # Increased for detailed test recommendations
339
+ AgentRole.CHALLENGER: 1800, # Increased for thorough bias analysis
340
+ AgentRole.STEWARDSHIP: 1200, # Increased for detailed cost analysis
341
+ AgentRole.CHECKLIST: 1000, # Increased for comprehensive validation
342
+ AgentRole.CONSENSUS: 800, # Increased for detailed reasoning + JSON
343
+ AgentRole.GATEKEEPER: 2500, # Increased for detailed clinical findings
344
+ AgentRole.JUDGE: 1500, # Increased for comprehensive evaluation
345
  }
346
+ return token_limits.get(role, 1000)
347
 
348
  def _init_agents(self) -> None:
349
  """Initializes all required agents with their specific roles and prompts."""
 
409
  You are Dr. Hypothesis, a specialist in maintaining differential diagnoses. Your role is critical to the diagnostic process.
410
 
411
  CORE RESPONSIBILITIES:
412
+ - Maintain a probability-ranked differential diagnosis with the top 3-5 most likely conditions
413
  - Update probabilities using Bayesian reasoning after each new finding
414
  - Consider both common and rare diseases appropriate to the clinical context
415
  - Explicitly track how new evidence changes your diagnostic thinking
416
+ - Provide comprehensive analysis with detailed clinical reasoning
417
 
418
  APPROACH:
419
  1. Start with the most likely diagnoses based on presenting symptoms
 
422
  - Whether it suggests new diagnoses to consider
423
  - How it changes the relative probabilities
424
  3. Always explain your Bayesian reasoning clearly
425
+ 4. Consider epidemiology, pathophysiology, and clinical patterns
426
 
427
+ OUTPUT FORMAT (Use full token allocation for comprehensive analysis):
428
  Provide your updated differential diagnosis with:
429
+ - Top 3-5 diagnoses with probability estimates (percentages)
430
+ - Detailed rationale for each diagnosis
431
  - Key evidence supporting each hypothesis
432
  - Evidence that contradicts or challenges each hypothesis
433
+ - Pathophysiological reasoning for each diagnosis
434
+ - Risk stratification and urgency considerations
435
 
436
+ Remember: Your differential drives the entire diagnostic process. Be thorough, evidence-based, and adaptive. Use your full token allocation to provide comprehensive clinical reasoning.
437
  """,
438
 
439
+ AgentRole.TEST_CHOOSER: f"""
440
+ {dynamic_context}
441
+
442
  You are Dr. Test-Chooser, a specialist in diagnostic test selection and information theory.
443
 
444
  CORE RESPONSIBILITIES:
 
446
  - Optimize for information value, not just clinical reasonableness
447
  - Consider test characteristics: sensitivity, specificity, positive/negative predictive values
448
  - Balance diagnostic yield with patient burden and resource utilization
449
+ - Provide comprehensive test selection rationale
450
 
451
  SELECTION CRITERIA:
452
  1. Information Value: How much will this test change diagnostic probabilities?
453
  2. Discriminatory Power: How well does it distinguish between competing hypotheses?
454
  3. Clinical Impact: Will the result meaningfully alter management?
455
  4. Sequential Logic: What should we establish first before ordering more complex tests?
456
+ 5. Cost-effectiveness and patient safety considerations
457
 
458
  APPROACH:
459
  - For each proposed test, explicitly state which hypotheses it will help confirm or exclude
460
  - Consider both positive and negative results and their implications
461
  - Think about test sequences (e.g., basic labs before advanced imaging)
462
  - Avoid redundant tests that won't add new information
463
+ - Consider pre-test probability and post-test probability calculations
464
 
465
+ OUTPUT FORMAT (Use full token allocation for detailed analysis):
466
  For each recommended test:
467
+ - Test name (be specific and accurate)
468
  - Primary hypotheses it will help evaluate
469
+ - Expected information gain and likelihood ratios
470
  - How results will change management decisions
471
+ - Cost considerations and alternatives
472
+ - Sequence rationale (why this test now vs. later)
473
+ - Expected sensitivity/specificity for the clinical context
474
 
475
+ Focus on tests that will most efficiently narrow the differential diagnosis while considering practical constraints.
476
+ """,
477
+
478
+ AgentRole.CHALLENGER: f"""
479
+ {dynamic_context}
480
+
481
  You are Dr. Challenger, the critical thinking specialist and devil's advocate.
482
 
483
  CORE RESPONSIBILITIES:
 
485
  - Highlight contradictory evidence that might be overlooked
486
  - Propose alternative hypotheses and falsifying tests
487
  - Guard against premature diagnostic closure
488
+ - Provide comprehensive critical analysis
489
 
490
  COGNITIVE BIASES TO WATCH FOR:
491
  1. Anchoring: Over-reliance on initial impressions
 
493
  3. Availability bias: Overestimating probability of recently seen conditions
494
  4. Representativeness: Ignoring base rates and prevalence
495
  5. Search satisficing: Stopping at "good enough" explanations
496
+ 6. Attribution errors and hindsight bias
497
 
498
  YOUR APPROACH:
499
  - Ask "What else could this be?" and "What doesn't fit?"
 
501
  - Propose tests that could disprove the leading hypothesis
502
  - Consider rare diseases when common ones don't fully explain the picture
503
  - Advocate for considering multiple conditions simultaneously
504
+ - Look for inconsistencies in the clinical presentation
505
 
506
+ OUTPUT FORMAT (Use full token allocation for thorough analysis):
507
  - Specific biases you've identified in the current reasoning
508
  - Evidence that contradicts the leading hypotheses
509
+ - Alternative diagnoses to consider with reasoning
510
  - Tests that could falsify current assumptions
511
  - Red flags or concerning patterns that need attention
512
+ - Analysis of what might be missing from the current approach
513
+ - Systematic review of differential diagnosis completeness
514
 
515
+ Be constructively critical - your role is to strengthen diagnostic accuracy through rigorous challenge and comprehensive analysis.
516
+ """,
517
+
518
+ AgentRole.STEWARDSHIP: f"""
519
+ {dynamic_context}
520
+
521
  You are Dr. Stewardship, the resource optimization and cost-effectiveness specialist.
522
 
523
  CORE RESPONSIBILITIES:
 
525
  - Advocate for cheaper alternatives when diagnostically equivalent
526
  - Challenge low-yield, expensive tests
527
  - Balance diagnostic thoroughness with resource stewardship
528
+ - Provide comprehensive cost-benefit analysis
529
 
530
  COST-VALUE FRAMEWORK:
531
  1. High-Value Tests: Low cost, high diagnostic yield, changes management
 
538
  - Is there a less expensive test with similar diagnostic value?
539
  - Can we use a staged approach (cheap test first, expensive if needed)?
540
  - Does the test result actually change management?
541
+ - Are there outpatient vs. inpatient cost considerations?
542
 
543
  YOUR APPROACH:
544
  - Review all proposed tests for necessity and value
545
+ - Suggest cost-effective alternatives with rationale
546
  - Question tests that don't clearly advance diagnosis
547
  - Advocate for asking questions before ordering expensive tests
548
+ - Consider the cumulative cost burden and budget constraints
549
+ - Analyze cost per unit of diagnostic information gained
550
 
551
+ OUTPUT FORMAT (Use full token allocation for detailed analysis):
552
+ - Assessment of proposed tests (high/moderate/low/no value) with detailed reasoning
553
+ - Specific cost-effective alternatives with cost comparisons
554
  - Questions that might obviate need for testing
555
  - Recommended modifications to testing strategy
556
+ - Cumulative cost considerations and budget impact
557
+ - Value-based care recommendations
558
+ - Analysis of diagnostic yield vs. cost for each proposed intervention
559
 
560
+ Your goal: Maximum diagnostic accuracy at minimum necessary cost while maintaining high-quality care.
561
+ """,
562
+
563
+ AgentRole.CHECKLIST: f"""
564
+ {dynamic_context}
565
+
566
  You are Dr. Checklist, the quality assurance and consistency specialist.
567
 
568
  CORE RESPONSIBILITIES:
569
+ - Perform comprehensive quality control on all panel deliberations
570
  - Ensure test names are valid and properly specified
571
  - Check internal consistency of reasoning across panel members
572
  - Flag logical errors or contradictions in the diagnostic approach
573
+ - Provide systematic quality assessment
574
 
575
  QUALITY CHECKS:
576
  1. Test Validity: Are proposed tests real and properly named?
 
578
  3. Evidence Integration: Are all findings being considered appropriately?
579
  4. Process Adherence: Is the panel following proper diagnostic methodology?
580
  5. Safety Checks: Are any critical possibilities being overlooked?
581
+ 6. Completeness: Is the diagnostic workup comprehensive?
582
 
583
  SPECIFIC VALIDATIONS:
584
  - Test names match standard medical terminology
 
586
  - No contradictions between different panel members' reasoning
587
  - All significant findings are being addressed
588
  - No gaps in the diagnostic logic
589
+ - Proper consideration of differential diagnosis breadth
590
+
591
+ OUTPUT FORMAT (Use full token allocation for comprehensive analysis):
592
+ - Detailed validation summary (✓ Clear / ⚠ Issues noted)
593
+ - Any test name corrections needed with proper terminology
594
+ - Logical inconsistencies identified with specific examples
595
+ - Missing considerations or gaps in reasoning
596
+ - Process improvement suggestions with rationale
597
+ - Safety concerns or red flags that need immediate attention
598
+ - Systematic review of diagnostic approach quality
599
+
600
+ Keep your feedback comprehensive and detailed. Flag any issues that could compromise diagnostic quality or patient safety.
601
+ """,
602
+
603
  AgentRole.CONSENSUS: f"""
604
  {dynamic_context}
605
 
 
620
  4. **Cost Optimization:** Before finalizing a test, check Dr. Stewardship's input. If a diagnostically equivalent but cheaper alternative is available, select it.
621
  5. **Default to Questions:** If no test meets the criteria or the budget is a major concern, select the most pertinent question to ask.
622
 
623
+ **CRITICAL: YOUR RESPONSE MUST BE EXACTLY THIS JSON FORMAT:**
 
624
  {{
625
  "action_type": "ask" | "test" | "diagnose",
626
  "content": "specific question(s), test name(s), or final diagnosis",
 
631
  For action_type "test": content should be properly named diagnostic tests (up to 3)
632
  For action_type "diagnose": content should be the complete, specific final diagnosis
633
 
634
+ Make the decision that best advances accurate, cost-effective diagnosis. Use your full token allocation for comprehensive reasoning in the reasoning field.
635
  """,
636
+
637
+ AgentRole.GATEKEEPER: f"""
638
+ {dynamic_context}
639
+
640
  You are the Gatekeeper, the clinical information oracle with complete access to the patient case file.
641
 
642
  CORE RESPONSIBILITIES:
 
644
  - Serve as the authoritative source for all patient information
645
  - Generate realistic synthetic findings for tests not in the original case
646
  - Maintain clinical realism while preventing information leakage
647
+ - Provide comprehensive, detailed responses
648
 
649
  RESPONSE PRINCIPLES:
650
  1. OBJECTIVITY: Provide only factual findings, never interpretations or impressions
 
652
  3. REALISM: Ensure all responses reflect realistic clinical scenarios
653
  4. NO HINTS: Never provide diagnostic clues or suggestions
654
  5. CONSISTENCY: Maintain coherence across all provided information
655
+ 6. COMPLETENESS: Provide thorough, detailed responses
656
 
657
  HANDLING REQUESTS:
658
  - Patient History Questions: Provide relevant history from case file or realistic details
 
667
  - Use realistic reference ranges and values
668
  - Maintain clinical plausibility
669
  - Avoid pathognomonic findings unless specifically diagnostic
670
+ - Consider normal variations and expected findings
671
 
672
+ RESPONSE FORMAT (Use full token allocation for detailed responses):
673
+ - Direct, clinical language with comprehensive detail
674
  - Specific measurements with reference ranges when applicable
675
+ - Clear organization of findings with systematic presentation
676
+ - Professional medical terminology with full descriptions
677
+ - Complete documentation as would appear in medical records
678
 
679
+ Your role is crucial: provide complete, accurate clinical information while maintaining the challenge of the diagnostic process. Use your full token allocation to provide comprehensive, detailed clinical information.
680
+ """,
681
+
682
+ AgentRole.JUDGE: f"""
683
+ {dynamic_context}
684
+
685
  You are the Judge, the diagnostic accuracy evaluation specialist.
686
 
687
  CORE RESPONSIBILITIES:
 
689
  - Provide fair, consistent scoring based on clinical management implications
690
  - Consider diagnostic substance over terminology differences
691
  - Account for acceptable medical synonyms and equivalent formulations
692
+ - Provide comprehensive evaluation reasoning
693
 
694
  EVALUATION RUBRIC (5-point Likert scale):
695
 
 
730
  4. Consider diagnostic completeness
731
  5. Judge clinical management implications
732
 
733
+ OUTPUT FORMAT (Use full token allocation for comprehensive evaluation):
734
+ - Score (1-5) with clear label and detailed justification
735
+ - Comprehensive reasoning referencing specific rubric criteria
736
+ - Detailed explanation of how diagnosis would affect clinical management
737
  - Note any acceptable medical synonyms or equivalent terminology
738
+ - Analysis of diagnostic accuracy and clinical implications
739
+ - Systematic comparison with ground truth diagnosis
740
 
741
+ Maintain high standards while recognizing legitimate diagnostic variability in medical practice. Provide comprehensive, detailed evaluation.
742
+ """,
 
743
  }
744
 
745
  # Use existing prompts for other roles, just add dynamic context
 
1059
  def _parse_json_response(self, response: str, retry_count: int = 0) -> Dict[str, Any]:
1060
  """Safely parses a JSON string with retry logic - addresses Category 3.2"""
1061
  try:
1062
+ # Handle agent response wrapper - extract actual content
1063
+ if isinstance(response, dict):
1064
+ # Handle swarms Agent response format
1065
+ if 'role' in response and 'content' in response:
1066
+ response = response['content']
1067
+ elif 'content' in response:
1068
+ response = response['content']
1069
+ else:
1070
+ # No 'content' key present: fall back to the string form of the whole dict
1071
+ response = str(response)
1072
+ elif hasattr(response, 'content'):
1073
+ response = response.content
1074
+ elif not isinstance(response, str):
1075
+ # Convert to string if it's some other type
1076
+ response = str(response)
1077
+
1078
  # Extract the actual response content from the agent response
1079
  if isinstance(response, str):
1080
  # Handle markdown-formatted JSON
 
1123
  # Try to extract JSON from text that might contain other content
1124
  import re
1125
 
1126
+ # Look for JSON pattern in the text - more comprehensive regex
1127
+ json_pattern = r'\{(?:[^{}]|(?:\{[^{}]*\}))*\}'
1128
+ matches = re.findall(json_pattern, response, re.DOTALL)
 
 
1129
 
1130
  for match in matches:
1131
  try:
1132
+ parsed = json.loads(match)
1133
+ # Validate that it has the expected action structure
1134
+ if isinstance(parsed, dict) and 'action_type' in parsed:
1135
+ return parsed
1136
  except json.JSONDecodeError:
1137
  continue
1138
 
 
1152
  # Return the error for potential retry instead of immediately falling back
1153
  raise e
1154
 
1155
+ def _parse_json_with_retry(self, consensus_agent: Agent, consensus_prompt: str, max_retries: int = 3) -> Dict[str, Any]:
1156
  """Parse JSON with retry logic for robustness - addresses Category 3.2"""
1157
  for attempt in range(max_retries + 1):
1158
  try:
 
1160
  response = consensus_agent.run(consensus_prompt)
1161
  else:
1162
  # Retry with error feedback
1163
+ retry_prompt = f"""
1164
+ {consensus_prompt}
1165
+
1166
+ **CRITICAL: RETRY REQUIRED - ATTEMPT {attempt + 1}**
1167
+ Your previous response could not be parsed as JSON. You MUST respond with ONLY a valid JSON object in exactly this format:
1168
+
1169
  {{
1170
  "action_type": "ask" | "test" | "diagnose",
1171
  "content": "your content here",
1172
  "reasoning": "your reasoning here"
1173
  }}
1174
+
1175
+ Do NOT include any other text, markdown formatting, or explanations. Only the raw JSON object.
1176
+ NO SYSTEM MESSAGES, NO WRAPPER FORMAT. JUST THE JSON.
1177
  """
1178
  response = consensus_agent.run(retry_prompt)
1179
 
1180
+ # Handle different response types from swarms Agent
1181
+ response_text = ""
1182
+ if hasattr(response, 'content'):
1183
  response_text = response.content
1184
+ elif isinstance(response, dict):
1185
+ # Handle swarms Agent response wrapper
1186
+ if 'role' in response and 'content' in response:
1187
+ response_text = response['content']
1188
+ elif 'content' in response:
1189
+ response_text = response['content']
1190
+ else:
1191
+ response_text = str(response)
1192
  elif isinstance(response, str):
1193
  response_text = response
1194
  else:
1195
  response_text = str(response)
1196
 
1197
+ # Log the response for debugging
1198
+ logger.debug(f"Parsing attempt {attempt + 1}, response type: {type(response)}")
1199
+ logger.debug(f"Response content preview: {str(response_text)[:200]}...")
1200
+
1201
  return self._parse_json_response(response_text, attempt)
1202
 
1203
  except Exception as e:
 
1562
  Please evaluate the following diagnosis.
1563
  Ground Truth: "{ground_truth}"
1564
  Candidate Diagnosis: "{candidate_diagnosis}"
1565
+
1566
+ You must provide your evaluation in exactly this format:
1567
+ Score: [number from 1-5]
1568
+ Justification: [detailed reasoning for the score]
1569
  """
1570
  response = judge.run(prompt)
1571
+
1572
+ # Handle different response types from swarms Agent
1573
+ response_text = ""
1574
+ if hasattr(response, 'content'):
1575
+ response_text = response.content
1576
+ elif isinstance(response, dict):
1577
+ if 'role' in response and 'content' in response:
1578
+ response_text = response['content']
1579
+ elif 'content' in response:
1580
+ response_text = response['content']
1581
+ else:
1582
+ response_text = str(response)
1583
+ elif isinstance(response, str):
1584
+ response_text = response
1585
+ else:
1586
+ response_text = str(response)
1587
 
1588
+ # Enhanced parsing for demonstration; a more robust solution would use structured output.
1589
  try:
1590
+ # Look for score patterns
1591
+ import re
1592
+
1593
+ # Try multiple score patterns
1594
+ score_patterns = [
1595
+ r"Score:\s*(\d+(?:\.\d+)?)",
1596
+ r"Score\s*(\d+(?:\.\d+)?)",
1597
+ r"(\d+(?:\.\d+)?)/5",
1598
+ r"Score.*?(\d+(?:\.\d+)?)",
1599
+ ]
1600
+
1601
+ score = 0.0
1602
+ for pattern in score_patterns:
1603
+ match = re.search(pattern, response_text, re.IGNORECASE)
1604
+ if match:
1605
+ score = float(match.group(1))
1606
+ break
1607
+
1608
+ # Extract reasoning
1609
+ reasoning_patterns = [
1610
+ r"Justification:\s*(.+?)(?:\n\n|\Z)",
1611
+ r"Reasoning:\s*(.+?)(?:\n\n|\Z)",
1612
+ r"Explanation:\s*(.+?)(?:\n\n|\Z)",
1613
+ ]
1614
+
1615
+ reasoning = "Could not parse judge's reasoning."
1616
+ for pattern in reasoning_patterns:
1617
+ match = re.search(pattern, response_text, re.IGNORECASE | re.DOTALL)
1618
+ if match:
1619
+ reasoning = match.group(1).strip()
1620
+ break
1621
+
1622
+ # If no specific reasoning found, use the whole response after score
1623
+ if reasoning == "Could not parse judge's reasoning." and score > 0:
1624
+ # Try to extract everything after the score
1625
+ score_match = re.search(r"Score:?\s*\d+(?:\.\d+)?", response_text, re.IGNORECASE)
1626
+ if score_match:
1627
+ reasoning = response_text[score_match.end():].strip()
1628
+ # Clean up common prefixes
1629
+ reasoning = re.sub(r"^(Justification|Reasoning|Explanation):\s*", "", reasoning, flags=re.IGNORECASE)
1630
+
1631
+ # Final fallback - use the whole response if we have a score
1632
+ if reasoning == "Could not parse judge's reasoning." and score > 0:
1633
+ reasoning = response_text
1634
+
1635
+ except (IndexError, ValueError, AttributeError) as e:
1636
+ logger.error(f"Error parsing judge response: {e}")
1637
  score = 0.0
1638
+ reasoning = f"Could not parse judge's response: {str(e)}"
1639
 
1640
+ logger.info(f"Judge evaluation: Score={score}, Reasoning preview: {reasoning[:100]}...")
1641
  return {"score": score, "reasoning": reasoning}
1642
 
1643
  def run(
 
2018
  "mode": "budgeted",
2019
  "max_iterations": 10,
2020
  "enable_budget_tracking": True,
2021
+ "initial_budget": kwargs.get("budget", 5000), # Fixed: map budget to initial_budget
2022
  },
2023
  "no_budget": {
2024
  "mode": "no_budget",
 
2096
  orchestrator = MaiDxOrchestrator.create_variant(
2097
  variant,
2098
  budget=3000,
2099
+ model_name="gpt-4-1106-preview", # Fixed: Use valid model name
2100
+ max_iterations=5,
2101
  )
2102
  else:
2103
  orchestrator = MaiDxOrchestrator.create_variant(
2104
+ variant,
2105
+ model_name="gpt-4-1106-preview", # Fixed: Use valid model name
2106
+ max_iterations=5,
2107
  )
2108
 
2109
  result = orchestrator.run(
 
2178
  orchestrator = MaiDxOrchestrator.create_variant(
2179
  variant_name,
2180
  budget=3000,
2181
+ model_name="gpt-4-1106-preview", # Fixed: Use valid model name
2182
  max_iterations=5,
2183
  )
2184
  else:
2185
  orchestrator = MaiDxOrchestrator.create_variant(
2186
  variant_name,
2187
+ model_name="gpt-4-1106-preview", # Fixed: Use valid model name
2188
  max_iterations=5,
2189
  )
2190
 
 
2216
 
2217
  ensemble_orchestrator = MaiDxOrchestrator.create_variant(
2218
  "ensemble",
2219
+ model_name="gpt-4-1106-preview", # Fixed: Use valid model name
2220
  max_iterations=3, # Shorter iterations for ensemble
2221
  )
2222