YanBoChen committed · Commit 40d39ed · 1 Parent(s): 16a2990
Enhance Direct LLM Evaluator and Judge Evaluator:
- Update the LLM generation method for improved response handling and timeout settings.
- Adjust the target compliance metric from 30s to 60s for more consistent evaluation.
- Add validation in the Judge Evaluator for query consistency and model-type differences across systems.
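For context, the target-compliance figure computed in the statistics code below is simply the fraction of queries whose total latency stays within the target (now 60 s instead of 30 s). A minimal standalone sketch of that calculation, with a hypothetical helper name:

from typing import List

def target_compliance(latencies: List[float], target_seconds: float = 60.0) -> float:
    """Fraction of queries whose end-to-end latency meets the target (hypothetical helper)."""
    if not latencies:
        return 0.0
    return sum(1 for lat in latencies if lat <= target_seconds) / len(latencies)

# Example: three queries, two of which finish within the 60 s target -> 0.67
print(f"{target_compliance([42.3, 58.9, 71.4]):.2f}")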
evaluation/direct_llm_evaluator.py  CHANGED

@@ -87,13 +87,26 @@ Please provide comprehensive medical advice including:
 Provide evidence-based, actionable medical guidance.
 """
 
-        # Direct LLM generation
-        response = self.llm_client.
-
+        # Direct LLM generation (same parameters as RAG system for fair comparison)
+        response = self.llm_client.analyze_medical_query(
+            query=direct_prompt,
+            max_tokens=1600,  # Same as RAG system primary setting
+            timeout=60.0  # Increased timeout for stable evaluation
+        )
+        # Extract medical advice from response (Med42 client returns dict with 'raw_response')
+        if isinstance(response, dict):
+            medical_advice = response.get('raw_response', '') or response.get('content', '')
+        else:
+            medical_advice = str(response)
 
         llm_time = time.time() - llm_start
         total_time = time.time() - overall_start
 
+        # Check if response is valid (not empty) - focus on content, not timeout
+        if not medical_advice or len(medical_advice.strip()) == 0:
+            print(f"❌ Direct LLM returned empty response after {total_time:.2f}s")
+            raise ValueError("Empty response from LLM - no content generated")
+
         # Create result
         result = {
             "query": query,

@@ -103,7 +116,7 @@ Provide evidence-based, actionable medical guidance.
             "latency_metrics": {
                 "total_latency": total_time,
                 "llm_generation_time": llm_time,
-                "meets_target": total_time <= 30.0
+                "meets_target": total_time <= 60.0
             },
 
             # Metrics 2-4: Not applicable for direct LLM

@@ -167,6 +180,10 @@ Provide evidence-based, actionable medical guidance.
             }
 
             self.direct_results.append(error_result)
+
+            # Do NOT add failed queries to medical_outputs for judge evaluation
+            # Only successful queries with valid medical advice should be evaluated
+
             return error_result
 
     def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:

@@ -238,7 +255,7 @@ Provide evidence-based, actionable medical guidance.
                 category_stats[category] = {
                     "average_latency": sum(cat_latencies) / len(cat_latencies),
                     "query_count": len(cat_latencies),
-                    "target_compliance": sum(1 for lat in cat_latencies if lat <= 30.0) / len(cat_latencies)
+                    "target_compliance": sum(1 for lat in cat_latencies if lat <= 60.0) / len(cat_latencies)
                 }
             else:
                 category_stats[category] = {

@@ -255,7 +272,7 @@ Provide evidence-based, actionable medical guidance.
                 "successful_queries": len(successful_results),
                 "total_queries": len(self.direct_results),
                 "success_rate": len(successful_results) / len(self.direct_results),
-                "target_compliance": sum(1 for lat in latencies if lat <= 30.0) / len(latencies)
+                "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies)
             }
         else:
             category_stats = {cat: {"average_latency": 0.0, "query_count": 0, "target_compliance": 0.0}

@@ -386,7 +403,7 @@ if __name__ == "__main__":
     print(f"Overall Performance:")
     print(f"   Average Latency: {overall_results['average_latency']:.2f}s")
     print(f"   Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
-    print(f"
+    print(f"   60s Target Compliance: {overall_results['target_compliance']:.1%}")
 
     print(f"\nApplicable Metrics:")
     print(f"   ✅ Metric 1 (Latency): Measured")

@@ -399,4 +416,4 @@ if __name__ == "__main__":
     print(f"\n✅ Direct LLM evaluation complete!")
     print(f"📊 Statistics: {stats_path}")
     print(f"📋 Medical Outputs: {outputs_path}")
-    print(f"\n💡 Next step: Run
+    print(f"\n💡 Next step: Run python metric5_6_llm_judge_evaluator.py rag,direct for metrics 5-6")
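The updated generation block accepts either a dict-shaped response (the Med42 client case noted in the comment) or a plain string. A self-contained sketch of that fallback plus the new empty-response guard, with the client call mocked rather than going through the real self.llm_client:

from typing import Any

def extract_medical_advice(response: Any) -> str:
    """Mirrors the extraction above: prefer 'raw_response', then 'content', else str()."""
    if isinstance(response, dict):
        return response.get('raw_response', '') or response.get('content', '')
    return str(response)

# Dict-shaped response, as the Med42 client is described to return
print(extract_medical_advice({'raw_response': 'Give aspirin 300 mg unless contraindicated.'}))

# Empty dict -> empty advice; the evaluator now treats this as a failure instead of recording it
advice = extract_medical_advice({})
if not advice or len(advice.strip()) == 0:
    print("Empty response - the evaluator raises ValueError and keeps it out of the judge inputs")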
evaluation/metric5_6_llm_judge_evaluator.py  CHANGED

@@ -137,14 +137,17 @@ class LLMJudgeEvaluator:
 
         for system in systems:
             if system == "rag":
-                pattern
+                # Use more specific pattern to exclude direct files
+                pattern = str(results_dir / "medical_outputs_[0-9]*.json")
             elif system == "direct":
                 pattern = str(results_dir / "medical_outputs_direct_*.json")
             else:
                 # Future extension: support other systems
                 pattern = str(results_dir / f"medical_outputs_{system}_*.json")
 
+            print(f"🔍 Searching for {system} with pattern: {pattern}")
             output_files = glob.glob(pattern)
+            print(f"📁 Found files for {system}: {output_files}")
 
             if not output_files:
                 raise FileNotFoundError(f"No medical outputs files found for {system} system")

@@ -547,6 +550,38 @@ if __name__ == "__main__":
     if len(set(query_counts)) > 1:
         print(f"⚠️ Warning: Systems have different query counts: {dict(zip(systems, query_counts))}")
 
+    # Validate systems processed same queries (for scientific comparison)
+    print(f"🔍 Validating query consistency across systems...")
+    if len(systems) > 1:
+        first_system_queries = [q['query'] for q in systems_outputs[systems[0]]]
+        for i, system in enumerate(systems[1:], 1):
+            system_queries = [q['query'] for q in systems_outputs[system]]
+
+            if first_system_queries != system_queries:
+                print(f"⚠️ Warning: {systems[0]} and {system} processed different queries!")
+                # Show first difference
+                for j, (q1, q2) in enumerate(zip(first_system_queries, system_queries)):
+                    if q1 != q2:
+                        print(f"   Query {j+1} differs:")
+                        print(f"     {systems[0]}: {q1[:50]}...")
+                        print(f"     {system}: {q2[:50]}...")
+                        break
+            else:
+                print(f"✅ {systems[0]} and {system} processed identical queries")
+
+    # Validate systems have different model types
+    model_types = set()
+    for system, outputs in systems_outputs.items():
+        if outputs:
+            model_type = outputs[0].get('model_type', 'unknown')
+            model_types.add(model_type)
+            print(f"🏷️ {system.upper()} system model_type: {model_type}")
+
+    if len(model_types) == 1:
+        print(f"⚠️ Warning: All systems have same model_type - this may not be a valid comparison!")
+    else:
+        print(f"✅ Systems have different model_types: {model_types}")
+
     print(f"📊 Comparing {len(systems)} systems with {min(query_counts)} queries each")
     print(f"🎯 Metrics: 5 (Actionability) + 6 (Evidence Quality)")
     print(f"⚡ Strategy: Single comparison call for maximum consistency")
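The tightened RAG pattern works because glob honours shell-style character classes: medical_outputs_[0-9]*.json only matches names whose character after the underscore is a digit (e.g. a timestamp), so medical_outputs_direct_*.json files are skipped. A quick check with fnmatch, which implements the same matching rules glob uses (filenames invented for illustration):

from fnmatch import fnmatch

# Hypothetical filenames following the two naming schemes seen in the diff
candidates = [
    "medical_outputs_20250809_1200.json",    # timestamped RAG output (assumed naming)
    "medical_outputs_direct_20250809.json",  # direct-LLM output
]

for name in candidates:
    print(name, fnmatch(name, "medical_outputs_[0-9]*.json"))
# -> True for the timestamped file, False for the direct file: 'd' fails the [0-9] class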