Spaces:
Sleeping
Sleeping
Commit
·
40b5db7
1
Parent(s):
5eb85c3
Fix model name to a valid GPT-4 Turbo version (gpt-4-1106-preview) and raise agent token limits for improved diagnostic capabilities
Browse files
- mai_dx/main.py +241 -105
mai_dx/main.py
CHANGED
@@ -261,7 +261,7 @@ class MaiDxOrchestrator:
|
|
261 |
|
262 |
def __init__(
|
263 |
self,
|
264 |
-
model_name: str = "gpt-4
|
265 |
max_iterations: int = 10,
|
266 |
initial_budget: int = 10000,
|
267 |
mode: str = "no_budget", # "instant", "question_only", "budgeted", "no_budget", "ensemble"
|
@@ -332,18 +332,18 @@ class MaiDxOrchestrator:
|
|
332 |
)
|
333 |
|
334 |
def _get_agent_max_tokens(self, role: AgentRole) -> int:
|
335 |
-
"""Get max_tokens for each agent based on their role -
|
336 |
token_limits = {
|
337 |
-
AgentRole.HYPOTHESIS:
|
338 |
-
AgentRole.TEST_CHOOSER:
|
339 |
-
AgentRole.CHALLENGER:
|
340 |
-
AgentRole.STEWARDSHIP:
|
341 |
-
AgentRole.CHECKLIST:
|
342 |
-
AgentRole.CONSENSUS:
|
343 |
-
AgentRole.GATEKEEPER:
|
344 |
-
AgentRole.JUDGE:
|
345 |
}
|
346 |
-
return token_limits.get(role,
|
347 |
|
348 |
def _init_agents(self) -> None:
|
349 |
"""Initializes all required agents with their specific roles and prompts."""
|
@@ -409,10 +409,11 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
409 |
You are Dr. Hypothesis, a specialist in maintaining differential diagnoses. Your role is critical to the diagnostic process.
|
410 |
|
411 |
CORE RESPONSIBILITIES:
|
412 |
-
- Maintain a probability-ranked differential diagnosis with the top 3 most likely conditions
|
413 |
- Update probabilities using Bayesian reasoning after each new finding
|
414 |
- Consider both common and rare diseases appropriate to the clinical context
|
415 |
- Explicitly track how new evidence changes your diagnostic thinking
|
|
|
416 |
|
417 |
APPROACH:
|
418 |
1. Start with the most likely diagnoses based on presenting symptoms
|
@@ -421,19 +422,23 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
421 |
- Whether it suggests new diagnoses to consider
|
422 |
- How it changes the relative probabilities
|
423 |
3. Always explain your Bayesian reasoning clearly
|
|
|
424 |
|
425 |
-
OUTPUT FORMAT:
|
426 |
Provide your updated differential diagnosis with:
|
427 |
-
- Top 3 diagnoses with probability estimates (percentages)
|
428 |
-
-
|
429 |
- Key evidence supporting each hypothesis
|
430 |
- Evidence that contradicts or challenges each hypothesis
|
|
|
|
|
431 |
|
432 |
-
Remember: Your differential drives the entire diagnostic process. Be thorough, evidence-based, and adaptive.
|
433 |
""",
|
434 |
|
435 |
-
AgentRole.TEST_CHOOSER:
|
436 |
-
|
|
|
437 |
You are Dr. Test-Chooser, a specialist in diagnostic test selection and information theory.
|
438 |
|
439 |
CORE RESPONSIBILITIES:
|
@@ -441,31 +446,38 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
441 |
- Optimize for information value, not just clinical reasonableness
|
442 |
- Consider test characteristics: sensitivity, specificity, positive/negative predictive values
|
443 |
- Balance diagnostic yield with patient burden and resource utilization
|
|
|
444 |
|
445 |
SELECTION CRITERIA:
|
446 |
1. Information Value: How much will this test change diagnostic probabilities?
|
447 |
2. Discriminatory Power: How well does it distinguish between competing hypotheses?
|
448 |
3. Clinical Impact: Will the result meaningfully alter management?
|
449 |
4. Sequential Logic: What should we establish first before ordering more complex tests?
|
|
|
450 |
|
451 |
APPROACH:
|
452 |
- For each proposed test, explicitly state which hypotheses it will help confirm or exclude
|
453 |
- Consider both positive and negative results and their implications
|
454 |
- Think about test sequences (e.g., basic labs before advanced imaging)
|
455 |
- Avoid redundant tests that won't add new information
|
|
|
456 |
|
457 |
-
OUTPUT FORMAT:
|
458 |
For each recommended test:
|
459 |
-
- Test name (be specific)
|
460 |
- Primary hypotheses it will help evaluate
|
461 |
-
- Expected information gain
|
462 |
- How results will change management decisions
|
|
|
|
|
|
|
463 |
|
464 |
-
Focus on tests that will most efficiently narrow the differential diagnosis.
|
465 |
-
"""
|
466 |
-
|
467 |
-
AgentRole.CHALLENGER:
|
468 |
-
|
|
|
469 |
You are Dr. Challenger, the critical thinking specialist and devil's advocate.
|
470 |
|
471 |
CORE RESPONSIBILITIES:
|
@@ -473,6 +485,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
473 |
- Highlight contradictory evidence that might be overlooked
|
474 |
- Propose alternative hypotheses and falsifying tests
|
475 |
- Guard against premature diagnostic closure
|
|
|
476 |
|
477 |
COGNITIVE BIASES TO WATCH FOR:
|
478 |
1. Anchoring: Over-reliance on initial impressions
|
@@ -480,6 +493,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
480 |
3. Availability bias: Overestimating probability of recently seen conditions
|
481 |
4. Representativeness: Ignoring base rates and prevalence
|
482 |
5. Search satisficing: Stopping at "good enough" explanations
|
|
|
483 |
|
484 |
YOUR APPROACH:
|
485 |
- Ask "What else could this be?" and "What doesn't fit?"
|
@@ -487,19 +501,23 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
487 |
- Propose tests that could disprove the leading hypothesis
|
488 |
- Consider rare diseases when common ones don't fully explain the picture
|
489 |
- Advocate for considering multiple conditions simultaneously
|
|
|
490 |
|
491 |
-
OUTPUT FORMAT:
|
492 |
- Specific biases you've identified in the current reasoning
|
493 |
- Evidence that contradicts the leading hypotheses
|
494 |
-
- Alternative diagnoses to consider
|
495 |
- Tests that could falsify current assumptions
|
496 |
- Red flags or concerning patterns that need attention
|
|
|
|
|
497 |
|
498 |
-
Be constructively critical - your role is to strengthen diagnostic accuracy through rigorous challenge.
|
499 |
-
"""
|
500 |
-
|
501 |
-
AgentRole.STEWARDSHIP:
|
502 |
-
|
|
|
503 |
You are Dr. Stewardship, the resource optimization and cost-effectiveness specialist.
|
504 |
|
505 |
CORE RESPONSIBILITIES:
|
@@ -507,6 +525,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
507 |
- Advocate for cheaper alternatives when diagnostically equivalent
|
508 |
- Challenge low-yield, expensive tests
|
509 |
- Balance diagnostic thoroughness with resource stewardship
|
|
|
510 |
|
511 |
COST-VALUE FRAMEWORK:
|
512 |
1. High-Value Tests: Low cost, high diagnostic yield, changes management
|
@@ -519,33 +538,39 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
519 |
- Is there a less expensive test with similar diagnostic value?
|
520 |
- Can we use a staged approach (cheap test first, expensive if needed)?
|
521 |
- Does the test result actually change management?
|
|
|
522 |
|
523 |
YOUR APPROACH:
|
524 |
- Review all proposed tests for necessity and value
|
525 |
-
- Suggest cost-effective alternatives
|
526 |
- Question tests that don't clearly advance diagnosis
|
527 |
- Advocate for asking questions before ordering expensive tests
|
528 |
-
- Consider the cumulative cost burden
|
|
|
529 |
|
530 |
-
OUTPUT FORMAT:
|
531 |
-
- Assessment of proposed tests (high/moderate/low/no value)
|
532 |
-
- Specific cost-effective alternatives
|
533 |
- Questions that might obviate need for testing
|
534 |
- Recommended modifications to testing strategy
|
535 |
-
- Cumulative cost considerations
|
|
|
|
|
536 |
|
537 |
-
Your goal: Maximum diagnostic accuracy at minimum necessary cost.
|
538 |
-
"""
|
539 |
-
|
540 |
-
AgentRole.CHECKLIST:
|
541 |
-
|
|
|
542 |
You are Dr. Checklist, the quality assurance and consistency specialist.
|
543 |
|
544 |
CORE RESPONSIBILITIES:
|
545 |
-
- Perform
|
546 |
- Ensure test names are valid and properly specified
|
547 |
- Check internal consistency of reasoning across panel members
|
548 |
- Flag logical errors or contradictions in the diagnostic approach
|
|
|
549 |
|
550 |
QUALITY CHECKS:
|
551 |
1. Test Validity: Are proposed tests real and properly named?
|
@@ -553,6 +578,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
553 |
3. Evidence Integration: Are all findings being considered appropriately?
|
554 |
4. Process Adherence: Is the panel following proper diagnostic methodology?
|
555 |
5. Safety Checks: Are any critical possibilities being overlooked?
|
|
|
556 |
|
557 |
SPECIFIC VALIDATIONS:
|
558 |
- Test names match standard medical terminology
|
@@ -560,17 +586,20 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
560 |
- No contradictions between different panel members' reasoning
|
561 |
- All significant findings are being addressed
|
562 |
- No gaps in the diagnostic logic
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
-
|
567 |
-
-
|
568 |
-
-
|
569 |
-
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
|
|
|
|
|
|
|
574 |
AgentRole.CONSENSUS: f"""
|
575 |
{dynamic_context}
|
576 |
|
@@ -591,8 +620,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
591 |
4. **Cost Optimization:** Before finalizing a test, check Dr. Stewardship's input. If a diagnostically equivalent but cheaper alternative is available, select it.
|
592 |
5. **Default to Questions:** If no test meets the criteria or the budget is a major concern, select the most pertinent question to ask.
|
593 |
|
594 |
-
|
595 |
-
Provide a JSON object with this exact structure:
|
596 |
{{
|
597 |
"action_type": "ask" | "test" | "diagnose",
|
598 |
"content": "specific question(s), test name(s), or final diagnosis",
|
@@ -603,10 +631,12 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
603 |
For action_type "test": content should be properly named diagnostic tests (up to 3)
|
604 |
For action_type "diagnose": content should be the complete, specific final diagnosis
|
605 |
|
606 |
-
Make the decision that best advances accurate, cost-effective diagnosis.
|
607 |
""",
|
608 |
-
|
609 |
-
|
|
|
|
|
610 |
You are the Gatekeeper, the clinical information oracle with complete access to the patient case file.
|
611 |
|
612 |
CORE RESPONSIBILITIES:
|
@@ -614,6 +644,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
614 |
- Serve as the authoritative source for all patient information
|
615 |
- Generate realistic synthetic findings for tests not in the original case
|
616 |
- Maintain clinical realism while preventing information leakage
|
|
|
617 |
|
618 |
RESPONSE PRINCIPLES:
|
619 |
1. OBJECTIVITY: Provide only factual findings, never interpretations or impressions
|
@@ -621,6 +652,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
621 |
3. REALISM: Ensure all responses reflect realistic clinical scenarios
|
622 |
4. NO HINTS: Never provide diagnostic clues or suggestions
|
623 |
5. CONSISTENCY: Maintain coherence across all provided information
|
|
|
624 |
|
625 |
HANDLING REQUESTS:
|
626 |
- Patient History Questions: Provide relevant history from case file or realistic details
|
@@ -635,18 +667,21 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
635 |
- Use realistic reference ranges and values
|
636 |
- Maintain clinical plausibility
|
637 |
- Avoid pathognomonic findings unless specifically diagnostic
|
|
|
638 |
|
639 |
-
RESPONSE FORMAT:
|
640 |
-
- Direct, clinical language
|
641 |
- Specific measurements with reference ranges when applicable
|
642 |
-
- Clear organization of findings
|
643 |
-
- Professional medical terminology
|
|
|
644 |
|
645 |
-
Your role is crucial: provide complete, accurate clinical information while maintaining the challenge of the diagnostic process.
|
646 |
-
"""
|
647 |
-
|
648 |
-
AgentRole.JUDGE:
|
649 |
-
|
|
|
650 |
You are the Judge, the diagnostic accuracy evaluation specialist.
|
651 |
|
652 |
CORE RESPONSIBILITIES:
|
@@ -654,6 +689,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
654 |
- Provide fair, consistent scoring based on clinical management implications
|
655 |
- Consider diagnostic substance over terminology differences
|
656 |
- Account for acceptable medical synonyms and equivalent formulations
|
|
|
657 |
|
658 |
EVALUATION RUBRIC (5-point Likert scale):
|
659 |
|
@@ -694,15 +730,16 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
694 |
4. Consider diagnostic completeness
|
695 |
5. Judge clinical management implications
|
696 |
|
697 |
-
OUTPUT FORMAT:
|
698 |
-
- Score (1-5) with clear label
|
699 |
-
-
|
700 |
-
-
|
701 |
- Note any acceptable medical synonyms or equivalent terminology
|
|
|
|
|
702 |
|
703 |
-
Maintain high standards while recognizing legitimate diagnostic variability in medical practice.
|
704 |
-
"""
|
705 |
-
),
|
706 |
}
|
707 |
|
708 |
# Use existing prompts for other roles, just add dynamic context
|
@@ -1022,6 +1059,22 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
1022 |
def _parse_json_response(self, response: str, retry_count: int = 0) -> Dict[str, Any]:
|
1023 |
"""Safely parses a JSON string with retry logic - addresses Category 3.2"""
|
1024 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1025 |
# Extract the actual response content from the agent response
|
1026 |
if isinstance(response, str):
|
1027 |
# Handle markdown-formatted JSON
|
@@ -1070,15 +1123,16 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
1070 |
# Try to extract JSON from text that might contain other content
|
1071 |
import re
|
1072 |
|
1073 |
-
# Look for JSON pattern in the text
|
1074 |
-
json_pattern = r
|
1075 |
-
matches = re.findall(
|
1076 |
-
json_pattern, response, re.DOTALL
|
1077 |
-
)
|
1078 |
|
1079 |
for match in matches:
|
1080 |
try:
|
1081 |
-
|
|
|
|
|
|
|
1082 |
except json.JSONDecodeError:
|
1083 |
continue
|
1084 |
|
@@ -1098,7 +1152,7 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
1098 |
# Return the error for potential retry instead of immediately falling back
|
1099 |
raise e
|
1100 |
|
1101 |
-
def _parse_json_with_retry(self, consensus_agent: Agent, consensus_prompt: str, max_retries: int =
|
1102 |
"""Parse JSON with retry logic for robustness - addresses Category 3.2"""
|
1103 |
for attempt in range(max_retries + 1):
|
1104 |
try:
|
@@ -1106,28 +1160,44 @@ This case has gone through {case_state.iteration} iterations. Focus on decisive
|
|
1106 |
response = consensus_agent.run(consensus_prompt)
|
1107 |
else:
|
1108 |
# Retry with error feedback
|
1109 |
-
retry_prompt =
|
1110 |
-
|
1111 |
-
|
1112 |
-
|
1113 |
-
|
|
|
1114 |
{{
|
1115 |
"action_type": "ask" | "test" | "diagnose",
|
1116 |
"content": "your content here",
|
1117 |
"reasoning": "your reasoning here"
|
1118 |
}}
|
1119 |
-
|
|
|
|
|
1120 |
"""
|
1121 |
response = consensus_agent.run(retry_prompt)
|
1122 |
|
1123 |
-
#
|
1124 |
-
|
|
|
1125 |
response_text = response.content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1126 |
elif isinstance(response, str):
|
1127 |
response_text = response
|
1128 |
else:
|
1129 |
response_text = str(response)
|
1130 |
|
|
|
|
|
|
|
|
|
1131 |
return self._parse_json_response(response_text, attempt)
|
1132 |
|
1133 |
except Exception as e:
|
@@ -1492,19 +1562,82 @@ CURRENT STATE:
|
|
1492 |
Please evaluate the following diagnosis.
|
1493 |
Ground Truth: "{ground_truth}"
|
1494 |
Candidate Diagnosis: "{candidate_diagnosis}"
|
|
|
|
|
|
|
|
|
1495 |
"""
|
1496 |
response = judge.run(prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1497 |
|
1498 |
-
#
|
1499 |
try:
|
1500 |
-
score
|
1501 |
-
|
1502 |
-
|
1503 |
-
|
1504 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1505 |
score = 0.0
|
1506 |
-
reasoning = "Could not parse judge's response
|
1507 |
|
|
|
1508 |
return {"score": score, "reasoning": reasoning}
|
1509 |
|
1510 |
def run(
|
@@ -1885,7 +2018,7 @@ CURRENT STATE:
|
|
1885 |
"mode": "budgeted",
|
1886 |
"max_iterations": 10,
|
1887 |
"enable_budget_tracking": True,
|
1888 |
-
"initial_budget": kwargs.get("budget", 5000),
|
1889 |
},
|
1890 |
"no_budget": {
|
1891 |
"mode": "no_budget",
|
@@ -1963,11 +2096,14 @@ def run_mai_dxo_demo(
|
|
1963 |
orchestrator = MaiDxOrchestrator.create_variant(
|
1964 |
variant,
|
1965 |
budget=3000,
|
1966 |
-
model_name="gpt-4
|
|
|
1967 |
)
|
1968 |
else:
|
1969 |
orchestrator = MaiDxOrchestrator.create_variant(
|
1970 |
-
variant,
|
|
|
|
|
1971 |
)
|
1972 |
|
1973 |
result = orchestrator.run(
|
@@ -2042,13 +2178,13 @@ if __name__ == "__main__":
|
|
2042 |
orchestrator = MaiDxOrchestrator.create_variant(
|
2043 |
variant_name,
|
2044 |
budget=3000,
|
2045 |
-
model_name="gpt-4
|
2046 |
max_iterations=5,
|
2047 |
)
|
2048 |
else:
|
2049 |
orchestrator = MaiDxOrchestrator.create_variant(
|
2050 |
variant_name,
|
2051 |
-
model_name="gpt-4
|
2052 |
max_iterations=5,
|
2053 |
)
|
2054 |
|
@@ -2080,7 +2216,7 @@ if __name__ == "__main__":
|
|
2080 |
|
2081 |
ensemble_orchestrator = MaiDxOrchestrator.create_variant(
|
2082 |
"ensemble",
|
2083 |
-
model_name="gpt-4
|
2084 |
max_iterations=3, # Shorter iterations for ensemble
|
2085 |
)
|
2086 |
|
|
|
261 |
|
262 |
def __init__(
|
263 |
self,
|
264 |
+
model_name: str = "gpt-4-1106-preview", # Fixed: Use valid GPT-4 Turbo model name
|
265 |
max_iterations: int = 10,
|
266 |
initial_budget: int = 10000,
|
267 |
mode: str = "no_budget", # "instant", "question_only", "budgeted", "no_budget", "ensemble"
|
|
|
332 |
)
|
333 |
|
334 |
def _get_agent_max_tokens(self, role: AgentRole) -> int:
|
335 |
+
"""Get max_tokens for each agent based on their role - significantly increased limits"""
|
336 |
token_limits = {
|
337 |
+
AgentRole.HYPOTHESIS: 2000, # Increased for comprehensive differential analysis
|
338 |
+
AgentRole.TEST_CHOOSER: 1500, # Increased for detailed test recommendations
|
339 |
+
AgentRole.CHALLENGER: 1800, # Increased for thorough bias analysis
|
340 |
+
AgentRole.STEWARDSHIP: 1200, # Increased for detailed cost analysis
|
341 |
+
AgentRole.CHECKLIST: 1000, # Increased for comprehensive validation
|
342 |
+
AgentRole.CONSENSUS: 800, # Increased for detailed reasoning + JSON
|
343 |
+
AgentRole.GATEKEEPER: 2500, # Increased for detailed clinical findings
|
344 |
+
AgentRole.JUDGE: 1500, # Increased for comprehensive evaluation
|
345 |
}
|
346 |
+
return token_limits.get(role, 1000)
|
347 |
|
348 |
def _init_agents(self) -> None:
|
349 |
"""Initializes all required agents with their specific roles and prompts."""
|
|
|
409 |
You are Dr. Hypothesis, a specialist in maintaining differential diagnoses. Your role is critical to the diagnostic process.
|
410 |
|
411 |
CORE RESPONSIBILITIES:
|
412 |
+
- Maintain a probability-ranked differential diagnosis with the top 3-5 most likely conditions
|
413 |
- Update probabilities using Bayesian reasoning after each new finding
|
414 |
- Consider both common and rare diseases appropriate to the clinical context
|
415 |
- Explicitly track how new evidence changes your diagnostic thinking
|
416 |
+
- Provide comprehensive analysis with detailed clinical reasoning
|
417 |
|
418 |
APPROACH:
|
419 |
1. Start with the most likely diagnoses based on presenting symptoms
|
|
|
422 |
- Whether it suggests new diagnoses to consider
|
423 |
- How it changes the relative probabilities
|
424 |
3. Always explain your Bayesian reasoning clearly
|
425 |
+
4. Consider epidemiology, pathophysiology, and clinical patterns
|
426 |
|
427 |
+
OUTPUT FORMAT (Use full token allocation for comprehensive analysis):
|
428 |
Provide your updated differential diagnosis with:
|
429 |
+
- Top 3-5 diagnoses with probability estimates (percentages)
|
430 |
+
- Detailed rationale for each diagnosis
|
431 |
- Key evidence supporting each hypothesis
|
432 |
- Evidence that contradicts or challenges each hypothesis
|
433 |
+
- Pathophysiological reasoning for each diagnosis
|
434 |
+
- Risk stratification and urgency considerations
|
435 |
|
436 |
+
Remember: Your differential drives the entire diagnostic process. Be thorough, evidence-based, and adaptive. Use your full token allocation to provide comprehensive clinical reasoning.
|
437 |
""",
|
438 |
|
439 |
+
AgentRole.TEST_CHOOSER: f"""
|
440 |
+
{dynamic_context}
|
441 |
+
|
442 |
You are Dr. Test-Chooser, a specialist in diagnostic test selection and information theory.
|
443 |
|
444 |
CORE RESPONSIBILITIES:
|
|
|
446 |
- Optimize for information value, not just clinical reasonableness
|
447 |
- Consider test characteristics: sensitivity, specificity, positive/negative predictive values
|
448 |
- Balance diagnostic yield with patient burden and resource utilization
|
449 |
+
- Provide comprehensive test selection rationale
|
450 |
|
451 |
SELECTION CRITERIA:
|
452 |
1. Information Value: How much will this test change diagnostic probabilities?
|
453 |
2. Discriminatory Power: How well does it distinguish between competing hypotheses?
|
454 |
3. Clinical Impact: Will the result meaningfully alter management?
|
455 |
4. Sequential Logic: What should we establish first before ordering more complex tests?
|
456 |
+
5. Cost-effectiveness and patient safety considerations
|
457 |
|
458 |
APPROACH:
|
459 |
- For each proposed test, explicitly state which hypotheses it will help confirm or exclude
|
460 |
- Consider both positive and negative results and their implications
|
461 |
- Think about test sequences (e.g., basic labs before advanced imaging)
|
462 |
- Avoid redundant tests that won't add new information
|
463 |
+
- Consider pre-test probability and post-test probability calculations
|
464 |
|
465 |
+
OUTPUT FORMAT (Use full token allocation for detailed analysis):
|
466 |
For each recommended test:
|
467 |
+
- Test name (be specific and accurate)
|
468 |
- Primary hypotheses it will help evaluate
|
469 |
+
- Expected information gain and likelihood ratios
|
470 |
- How results will change management decisions
|
471 |
+
- Cost considerations and alternatives
|
472 |
+
- Sequence rationale (why this test now vs. later)
|
473 |
+
- Expected sensitivity/specificity for the clinical context
|
474 |
|
475 |
+
Focus on tests that will most efficiently narrow the differential diagnosis while considering practical constraints.
|
476 |
+
""",
|
477 |
+
|
478 |
+
AgentRole.CHALLENGER: f"""
|
479 |
+
{dynamic_context}
|
480 |
+
|
481 |
You are Dr. Challenger, the critical thinking specialist and devil's advocate.
|
482 |
|
483 |
CORE RESPONSIBILITIES:
|
|
|
485 |
- Highlight contradictory evidence that might be overlooked
|
486 |
- Propose alternative hypotheses and falsifying tests
|
487 |
- Guard against premature diagnostic closure
|
488 |
+
- Provide comprehensive critical analysis
|
489 |
|
490 |
COGNITIVE BIASES TO WATCH FOR:
|
491 |
1. Anchoring: Over-reliance on initial impressions
|
|
|
493 |
3. Availability bias: Overestimating probability of recently seen conditions
|
494 |
4. Representativeness: Ignoring base rates and prevalence
|
495 |
5. Search satisficing: Stopping at "good enough" explanations
|
496 |
+
6. Attribution errors and hindsight bias
|
497 |
|
498 |
YOUR APPROACH:
|
499 |
- Ask "What else could this be?" and "What doesn't fit?"
|
|
|
501 |
- Propose tests that could disprove the leading hypothesis
|
502 |
- Consider rare diseases when common ones don't fully explain the picture
|
503 |
- Advocate for considering multiple conditions simultaneously
|
504 |
+
- Look for inconsistencies in the clinical presentation
|
505 |
|
506 |
+
OUTPUT FORMAT (Use full token allocation for thorough analysis):
|
507 |
- Specific biases you've identified in the current reasoning
|
508 |
- Evidence that contradicts the leading hypotheses
|
509 |
+
- Alternative diagnoses to consider with reasoning
|
510 |
- Tests that could falsify current assumptions
|
511 |
- Red flags or concerning patterns that need attention
|
512 |
+
- Analysis of what might be missing from the current approach
|
513 |
+
- Systematic review of differential diagnosis completeness
|
514 |
|
515 |
+
Be constructively critical - your role is to strengthen diagnostic accuracy through rigorous challenge and comprehensive analysis.
|
516 |
+
""",
|
517 |
+
|
518 |
+
AgentRole.STEWARDSHIP: f"""
|
519 |
+
{dynamic_context}
|
520 |
+
|
521 |
You are Dr. Stewardship, the resource optimization and cost-effectiveness specialist.
|
522 |
|
523 |
CORE RESPONSIBILITIES:
|
|
|
525 |
- Advocate for cheaper alternatives when diagnostically equivalent
|
526 |
- Challenge low-yield, expensive tests
|
527 |
- Balance diagnostic thoroughness with resource stewardship
|
528 |
+
- Provide comprehensive cost-benefit analysis
|
529 |
|
530 |
COST-VALUE FRAMEWORK:
|
531 |
1. High-Value Tests: Low cost, high diagnostic yield, changes management
|
|
|
538 |
- Is there a less expensive test with similar diagnostic value?
|
539 |
- Can we use a staged approach (cheap test first, expensive if needed)?
|
540 |
- Does the test result actually change management?
|
541 |
+
- Are there outpatient vs. inpatient cost considerations?
|
542 |
|
543 |
YOUR APPROACH:
|
544 |
- Review all proposed tests for necessity and value
|
545 |
+
- Suggest cost-effective alternatives with rationale
|
546 |
- Question tests that don't clearly advance diagnosis
|
547 |
- Advocate for asking questions before ordering expensive tests
|
548 |
+
- Consider the cumulative cost burden and budget constraints
|
549 |
+
- Analyze cost per unit of diagnostic information gained
|
550 |
|
551 |
+
OUTPUT FORMAT (Use full token allocation for detailed analysis):
|
552 |
+
- Assessment of proposed tests (high/moderate/low/no value) with detailed reasoning
|
553 |
+
- Specific cost-effective alternatives with cost comparisons
|
554 |
- Questions that might obviate need for testing
|
555 |
- Recommended modifications to testing strategy
|
556 |
+
- Cumulative cost considerations and budget impact
|
557 |
+
- Value-based care recommendations
|
558 |
+
- Analysis of diagnostic yield vs. cost for each proposed intervention
|
559 |
|
560 |
+
Your goal: Maximum diagnostic accuracy at minimum necessary cost while maintaining high-quality care.
|
561 |
+
""",
|
562 |
+
|
563 |
+
AgentRole.CHECKLIST: f"""
|
564 |
+
{dynamic_context}
|
565 |
+
|
566 |
You are Dr. Checklist, the quality assurance and consistency specialist.
|
567 |
|
568 |
CORE RESPONSIBILITIES:
|
569 |
+
- Perform comprehensive quality control on all panel deliberations
|
570 |
- Ensure test names are valid and properly specified
|
571 |
- Check internal consistency of reasoning across panel members
|
572 |
- Flag logical errors or contradictions in the diagnostic approach
|
573 |
+
- Provide systematic quality assessment
|
574 |
|
575 |
QUALITY CHECKS:
|
576 |
1. Test Validity: Are proposed tests real and properly named?
|
|
|
578 |
3. Evidence Integration: Are all findings being considered appropriately?
|
579 |
4. Process Adherence: Is the panel following proper diagnostic methodology?
|
580 |
5. Safety Checks: Are any critical possibilities being overlooked?
|
581 |
+
6. Completeness: Is the diagnostic workup comprehensive?
|
582 |
|
583 |
SPECIFIC VALIDATIONS:
|
584 |
- Test names match standard medical terminology
|
|
|
586 |
- No contradictions between different panel members' reasoning
|
587 |
- All significant findings are being addressed
|
588 |
- No gaps in the diagnostic logic
|
589 |
+
- Proper consideration of differential diagnosis breadth
|
590 |
+
|
591 |
+
OUTPUT FORMAT (Use full token allocation for comprehensive analysis):
|
592 |
+
- Detailed validation summary (✓ Clear / ⚠ Issues noted)
|
593 |
+
- Any test name corrections needed with proper terminology
|
594 |
+
- Logical inconsistencies identified with specific examples
|
595 |
+
- Missing considerations or gaps in reasoning
|
596 |
+
- Process improvement suggestions with rationale
|
597 |
+
- Safety concerns or red flags that need immediate attention
|
598 |
+
- Systematic review of diagnostic approach quality
|
599 |
+
|
600 |
+
Keep your feedback comprehensive and detailed. Flag any issues that could compromise diagnostic quality or patient safety.
|
601 |
+
""",
|
602 |
+
|
603 |
AgentRole.CONSENSUS: f"""
|
604 |
{dynamic_context}
|
605 |
|
|
|
620 |
4. **Cost Optimization:** Before finalizing a test, check Dr. Stewardship's input. If a diagnostically equivalent but cheaper alternative is available, select it.
|
621 |
5. **Default to Questions:** If no test meets the criteria or the budget is a major concern, select the most pertinent question to ask.
|
622 |
|
623 |
+
**CRITICAL: YOUR RESPONSE MUST BE EXACTLY THIS JSON FORMAT:**
|
|
|
624 |
{{
|
625 |
"action_type": "ask" | "test" | "diagnose",
|
626 |
"content": "specific question(s), test name(s), or final diagnosis",
|
|
|
631 |
For action_type "test": content should be properly named diagnostic tests (up to 3)
|
632 |
For action_type "diagnose": content should be the complete, specific final diagnosis
|
633 |
|
634 |
+
Make the decision that best advances accurate, cost-effective diagnosis. Use your full token allocation for comprehensive reasoning in the reasoning field.
|
635 |
""",
|
636 |
+
|
637 |
+
AgentRole.GATEKEEPER: f"""
|
638 |
+
{dynamic_context}
|
639 |
+
|
640 |
You are the Gatekeeper, the clinical information oracle with complete access to the patient case file.
|
641 |
|
642 |
CORE RESPONSIBILITIES:
|
|
|
644 |
- Serve as the authoritative source for all patient information
|
645 |
- Generate realistic synthetic findings for tests not in the original case
|
646 |
- Maintain clinical realism while preventing information leakage
|
647 |
+
- Provide comprehensive, detailed responses
|
648 |
|
649 |
RESPONSE PRINCIPLES:
|
650 |
1. OBJECTIVITY: Provide only factual findings, never interpretations or impressions
|
|
|
652 |
3. REALISM: Ensure all responses reflect realistic clinical scenarios
|
653 |
4. NO HINTS: Never provide diagnostic clues or suggestions
|
654 |
5. CONSISTENCY: Maintain coherence across all provided information
|
655 |
+
6. COMPLETENESS: Provide thorough, detailed responses
|
656 |
|
657 |
HANDLING REQUESTS:
|
658 |
- Patient History Questions: Provide relevant history from case file or realistic details
|
|
|
667 |
- Use realistic reference ranges and values
|
668 |
- Maintain clinical plausibility
|
669 |
- Avoid pathognomonic findings unless specifically diagnostic
|
670 |
+
- Consider normal variations and expected findings
|
671 |
|
672 |
+
RESPONSE FORMAT (Use full token allocation for detailed responses):
|
673 |
+
- Direct, clinical language with comprehensive detail
|
674 |
- Specific measurements with reference ranges when applicable
|
675 |
+
- Clear organization of findings with systematic presentation
|
676 |
+
- Professional medical terminology with full descriptions
|
677 |
+
- Complete documentation as would appear in medical records
|
678 |
|
679 |
+
Your role is crucial: provide complete, accurate clinical information while maintaining the challenge of the diagnostic process. Use your full token allocation to provide comprehensive, detailed clinical information.
|
680 |
+
""",
|
681 |
+
|
682 |
+
AgentRole.JUDGE: f"""
|
683 |
+
{dynamic_context}
|
684 |
+
|
685 |
You are the Judge, the diagnostic accuracy evaluation specialist.
|
686 |
|
687 |
CORE RESPONSIBILITIES:
|
|
|
689 |
- Provide fair, consistent scoring based on clinical management implications
|
690 |
- Consider diagnostic substance over terminology differences
|
691 |
- Account for acceptable medical synonyms and equivalent formulations
|
692 |
+
- Provide comprehensive evaluation reasoning
|
693 |
|
694 |
EVALUATION RUBRIC (5-point Likert scale):
|
695 |
|
|
|
730 |
4. Consider diagnostic completeness
|
731 |
5. Judge clinical management implications
|
732 |
|
733 |
+
OUTPUT FORMAT (Use full token allocation for comprehensive evaluation):
|
734 |
+
- Score (1-5) with clear label and detailed justification
|
735 |
+
- Comprehensive reasoning referencing specific rubric criteria
|
736 |
+
- Detailed explanation of how diagnosis would affect clinical management
|
737 |
- Note any acceptable medical synonyms or equivalent terminology
|
738 |
+
- Analysis of diagnostic accuracy and clinical implications
|
739 |
+
- Systematic comparison with ground truth diagnosis
|
740 |
|
741 |
+
Maintain high standards while recognizing legitimate diagnostic variability in medical practice. Provide comprehensive, detailed evaluation.
|
742 |
+
""",
|
|
|
743 |
}
|
744 |
|
745 |
# Use existing prompts for other roles, just add dynamic context
|
|
|
1059 |
def _parse_json_response(self, response: str, retry_count: int = 0) -> Dict[str, Any]:
|
1060 |
"""Safely parses a JSON string with retry logic - addresses Category 3.2"""
|
1061 |
try:
|
1062 |
+
# Handle agent response wrapper - extract actual content
|
1063 |
+
if isinstance(response, dict):
|
1064 |
+
# Handle swarms Agent response format
|
1065 |
+
if 'role' in response and 'content' in response:
|
1066 |
+
response = response['content']
|
1067 |
+
elif 'content' in response:
|
1068 |
+
response = response['content']
|
1069 |
+
else:
|
1070 |
+
# Try to extract any string value from dict
|
1071 |
+
response = str(response)
|
1072 |
+
elif hasattr(response, 'content'):
|
1073 |
+
response = response.content
|
1074 |
+
elif not isinstance(response, str):
|
1075 |
+
# Convert to string if it's some other type
|
1076 |
+
response = str(response)
|
1077 |
+
|
1078 |
# Extract the actual response content from the agent response
|
1079 |
if isinstance(response, str):
|
1080 |
# Handle markdown-formatted JSON
|
|
|
1123 |
# Try to extract JSON from text that might contain other content
|
1124 |
import re
|
1125 |
|
1126 |
+
# Look for JSON pattern in the text - more comprehensive regex
|
1127 |
+
json_pattern = r'\{(?:[^{}]|(?:\{[^{}]*\}))*\}'
|
1128 |
+
matches = re.findall(json_pattern, response, re.DOTALL)
|
|
|
|
|
1129 |
|
1130 |
for match in matches:
|
1131 |
try:
|
1132 |
+
parsed = json.loads(match)
|
1133 |
+
# Validate that it has the expected action structure
|
1134 |
+
if isinstance(parsed, dict) and 'action_type' in parsed:
|
1135 |
+
return parsed
|
1136 |
except json.JSONDecodeError:
|
1137 |
continue
|
1138 |
|
|
|
1152 |
# Return the error for potential retry instead of immediately falling back
|
1153 |
raise e
|
1154 |
|
1155 |
+
def _parse_json_with_retry(self, consensus_agent: Agent, consensus_prompt: str, max_retries: int = 3) -> Dict[str, Any]:
|
1156 |
"""Parse JSON with retry logic for robustness - addresses Category 3.2"""
|
1157 |
for attempt in range(max_retries + 1):
|
1158 |
try:
|
|
|
1160 |
response = consensus_agent.run(consensus_prompt)
|
1161 |
else:
|
1162 |
# Retry with error feedback
|
1163 |
+
retry_prompt = f"""
|
1164 |
+
{consensus_prompt}
|
1165 |
+
|
1166 |
+
**CRITICAL: RETRY REQUIRED - ATTEMPT {attempt + 1}**
|
1167 |
+
Your previous response could not be parsed as JSON. You MUST respond with ONLY a valid JSON object in exactly this format:
|
1168 |
+
|
1169 |
{{
|
1170 |
"action_type": "ask" | "test" | "diagnose",
|
1171 |
"content": "your content here",
|
1172 |
"reasoning": "your reasoning here"
|
1173 |
}}
|
1174 |
+
|
1175 |
+
Do NOT include any other text, markdown formatting, or explanations. Only the raw JSON object.
|
1176 |
+
NO SYSTEM MESSAGES, NO WRAPPER FORMAT. JUST THE JSON.
|
1177 |
"""
|
1178 |
response = consensus_agent.run(retry_prompt)
|
1179 |
|
1180 |
+
# Handle different response types from swarms Agent
|
1181 |
+
response_text = ""
|
1182 |
+
if hasattr(response, 'content'):
|
1183 |
response_text = response.content
|
1184 |
+
elif isinstance(response, dict):
|
1185 |
+
# Handle swarms Agent response wrapper
|
1186 |
+
if 'role' in response and 'content' in response:
|
1187 |
+
response_text = response['content']
|
1188 |
+
elif 'content' in response:
|
1189 |
+
response_text = response['content']
|
1190 |
+
else:
|
1191 |
+
response_text = str(response)
|
1192 |
elif isinstance(response, str):
|
1193 |
response_text = response
|
1194 |
else:
|
1195 |
response_text = str(response)
|
1196 |
|
1197 |
+
# Log the response for debugging
|
1198 |
+
logger.debug(f"Parsing attempt {attempt + 1}, response type: {type(response)}")
|
1199 |
+
logger.debug(f"Response content preview: {str(response_text)[:200]}...")
|
1200 |
+
|
1201 |
return self._parse_json_response(response_text, attempt)
|
1202 |
|
1203 |
except Exception as e:
|
|
|
1562 |
Please evaluate the following diagnosis.
|
1563 |
Ground Truth: "{ground_truth}"
|
1564 |
Candidate Diagnosis: "{candidate_diagnosis}"
|
1565 |
+
|
1566 |
+
You must provide your evaluation in exactly this format:
|
1567 |
+
Score: [number from 1-5]
|
1568 |
+
Justification: [detailed reasoning for the score]
|
1569 |
"""
|
1570 |
response = judge.run(prompt)
|
1571 |
+
|
1572 |
+
# Handle different response types from swarms Agent
|
1573 |
+
response_text = ""
|
1574 |
+
if hasattr(response, 'content'):
|
1575 |
+
response_text = response.content
|
1576 |
+
elif isinstance(response, dict):
|
1577 |
+
if 'role' in response and 'content' in response:
|
1578 |
+
response_text = response['content']
|
1579 |
+
elif 'content' in response:
|
1580 |
+
response_text = response['content']
|
1581 |
+
else:
|
1582 |
+
response_text = str(response)
|
1583 |
+
elif isinstance(response, str):
|
1584 |
+
response_text = response
|
1585 |
+
else:
|
1586 |
+
response_text = str(response)
|
1587 |
|
1588 |
+
# Enhanced parsing for demonstration; a more robust solution would use structured output.
|
1589 |
try:
|
1590 |
+
# Look for score patterns
|
1591 |
+
import re
|
1592 |
+
|
1593 |
+
# Try multiple score patterns
|
1594 |
+
score_patterns = [
|
1595 |
+
r"Score:\s*(\d+(?:\.\d+)?)",
|
1596 |
+
r"Score\s*(\d+(?:\.\d+)?)",
|
1597 |
+
r"(\d+(?:\.\d+)?)/5",
|
1598 |
+
r"Score.*?(\d+(?:\.\d+)?)",
|
1599 |
+
]
|
1600 |
+
|
1601 |
+
score = 0.0
|
1602 |
+
for pattern in score_patterns:
|
1603 |
+
match = re.search(pattern, response_text, re.IGNORECASE)
|
1604 |
+
if match:
|
1605 |
+
score = float(match.group(1))
|
1606 |
+
break
|
1607 |
+
|
1608 |
+
# Extract reasoning
|
1609 |
+
reasoning_patterns = [
|
1610 |
+
r"Justification:\s*(.+?)(?:\n\n|\Z)",
|
1611 |
+
r"Reasoning:\s*(.+?)(?:\n\n|\Z)",
|
1612 |
+
r"Explanation:\s*(.+?)(?:\n\n|\Z)",
|
1613 |
+
]
|
1614 |
+
|
1615 |
+
reasoning = "Could not parse judge's reasoning."
|
1616 |
+
for pattern in reasoning_patterns:
|
1617 |
+
match = re.search(pattern, response_text, re.IGNORECASE | re.DOTALL)
|
1618 |
+
if match:
|
1619 |
+
reasoning = match.group(1).strip()
|
1620 |
+
break
|
1621 |
+
|
1622 |
+
# If no specific reasoning found, use the whole response after score
|
1623 |
+
if reasoning == "Could not parse judge's reasoning." and score > 0:
|
1624 |
+
# Try to extract everything after the score
|
1625 |
+
score_match = re.search(r"Score:?\s*\d+(?:\.\d+)?", response_text, re.IGNORECASE)
|
1626 |
+
if score_match:
|
1627 |
+
reasoning = response_text[score_match.end():].strip()
|
1628 |
+
# Clean up common prefixes
|
1629 |
+
reasoning = re.sub(r"^(Justification|Reasoning|Explanation):\s*", "", reasoning, flags=re.IGNORECASE)
|
1630 |
+
|
1631 |
+
# Final fallback - use the whole response if we have a score
|
1632 |
+
if reasoning == "Could not parse judge's reasoning." and score > 0:
|
1633 |
+
reasoning = response_text
|
1634 |
+
|
1635 |
+
except (IndexError, ValueError, AttributeError) as e:
|
1636 |
+
logger.error(f"Error parsing judge response: {e}")
|
1637 |
score = 0.0
|
1638 |
+
reasoning = f"Could not parse judge's response: {str(e)}"
|
1639 |
|
1640 |
+
logger.info(f"Judge evaluation: Score={score}, Reasoning preview: {reasoning[:100]}...")
|
1641 |
return {"score": score, "reasoning": reasoning}
|
1642 |
|
1643 |
def run(
|
|
|
2018 |
"mode": "budgeted",
|
2019 |
"max_iterations": 10,
|
2020 |
"enable_budget_tracking": True,
|
2021 |
+
"initial_budget": kwargs.get("budget", 5000), # Fixed: map budget to initial_budget
|
2022 |
},
|
2023 |
"no_budget": {
|
2024 |
"mode": "no_budget",
|
|
|
2096 |
orchestrator = MaiDxOrchestrator.create_variant(
|
2097 |
variant,
|
2098 |
budget=3000,
|
2099 |
+
model_name="gpt-4-1106-preview", # Fixed: Use valid model name
|
2100 |
+
max_iterations=5,
|
2101 |
)
|
2102 |
else:
|
2103 |
orchestrator = MaiDxOrchestrator.create_variant(
|
2104 |
+
variant,
|
2105 |
+
model_name="gpt-4-1106-preview", # Fixed: Use valid model name
|
2106 |
+
max_iterations=5,
|
2107 |
)
|
2108 |
|
2109 |
result = orchestrator.run(
|
|
|
2178 |
orchestrator = MaiDxOrchestrator.create_variant(
|
2179 |
variant_name,
|
2180 |
budget=3000,
|
2181 |
+
model_name="gpt-4-1106-preview", # Fixed: Use valid model name
|
2182 |
max_iterations=5,
|
2183 |
)
|
2184 |
else:
|
2185 |
orchestrator = MaiDxOrchestrator.create_variant(
|
2186 |
variant_name,
|
2187 |
+
model_name="gpt-4-1106-preview", # Fixed: Use valid model name
|
2188 |
max_iterations=5,
|
2189 |
)
|
2190 |
|
|
|
2216 |
|
2217 |
ensemble_orchestrator = MaiDxOrchestrator.create_variant(
|
2218 |
"ensemble",
|
2219 |
+
model_name="gpt-4-1106-preview", # Fixed: Use valid model name
|
2220 |
max_iterations=3, # Shorter iterations for ensemble
|
2221 |
)
|
2222 |
|