Update app.py
app.py
CHANGED
@@ -154,11 +154,10 @@ available_metrics = [
 default_metrics = ["BLEU", "ROUGE", "BERTScore"]
 
 
-with gr.Blocks(title="RadEval …
+with gr.Blocks(title="RadEval Evaluation", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-        #
-        [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval/) | [Video](https://justin13601.github.io/files/radeval.mp4) |[arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()
+        # RadEval Evaluation
 
         **RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.
 
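For context between the two hunks: the evaluation tab above wraps the RadEval scorer behind the UI. A minimal sketch of the call pattern this app relies on; the `do_bleu` flag, the `(refs, hyps)` call order, and the `'bleu'` result key are taken from the code added in the second hunk, and other RadEval versions may expose different keys:

```python
# Minimal scoring sketch mirroring how this app instantiates and calls RadEval.
# Flags and result keys are assumptions copied from the code added below.
from RadEval import RadEval

refs = ["No acute cardiopulmonary process."]
hyps = ["No acute findings."]

bleu_evaluator = RadEval(do_bleu=True)   # the app builds one evaluator per metric
scores = bleu_evaluator(refs, hyps)      # returns a dict, e.g. {'bleu': ...}
print(scores["bleu"])
```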
@@ -256,5 +255,410 @@ with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme
         outputs=[analysis_output, table_output]
     )
 
+# =============================================================================
+# 🧪 Hypothesis Testing Section
+# =============================================================================
+
+def run_hypothesis_testing(systems_data, selected_test_metrics, n_samples, significance_level):
+    """
+    Run statistical significance testing between multiple systems
+    """
+    try:
+        from RadEval import RadEval, compare_systems
+
+        # Parse systems data (expecting JSON format)
+        import json
+        systems_dict = json.loads(systems_data)
+
+        # Extract references and systems
+        if 'references' not in systems_dict or 'systems' not in systems_dict:
+            return "Error: Please provide both 'references' and 'systems' in the JSON data.", ""
+
+        references = systems_dict['references']
+        systems = systems_dict['systems']
+
+        # Validate data integrity
+        if not references or not systems:
+            return "Error: References and systems cannot be empty.", ""
+
+        if not isinstance(references, list) or not isinstance(systems, dict):
+            return "Error: References must be a list and systems must be a dictionary.", ""
+
+        # Check that all systems have the same number of outputs as references
+        ref_count = len(references)
+        for system_name, system_outputs in systems.items():
+            if not isinstance(system_outputs, list):
+                return f"Error: System '{system_name}' outputs must be a list.", ""
+            if len(system_outputs) != ref_count:
+                return f"Error: System '{system_name}' has {len(system_outputs)} outputs but {ref_count} references provided.", ""
+
+        # Validate that all texts are non-empty strings
+        for i, ref in enumerate(references):
+            if not isinstance(ref, str) or not ref.strip():
+                return f"Error: Reference {i+1} is empty or not a string.", ""
+
+        for system_name, system_outputs in systems.items():
+            for i, output in enumerate(system_outputs):
+                if not isinstance(output, str) or not output.strip():
+                    return f"Error: System '{system_name}' output {i+1} is empty or not a string.", ""
+
+        # Initialize evaluators based on selected metrics (fast metrics only)
+        evaluators = {}
+        if 'BLEU' in selected_test_metrics:
+            evaluators['bleu'] = RadEval(do_bleu=True)
+        if 'ROUGE' in selected_test_metrics:
+            evaluators['rouge'] = RadEval(do_rouge=True)
+        if 'BERTScore' in selected_test_metrics:
+            evaluators['bertscore'] = RadEval(do_bertscore=True)
+
+        # Custom metric: average word count
+        def word_count_metric(hyps, refs):
+            return sum(len(report.split()) for report in hyps) / len(hyps)
+
+        # Build metrics dictionary (following the example structure)
+        metrics = {}
+        if 'BLEU' in selected_test_metrics:
+            # Test the evaluator first
+            try:
+                test_result = evaluators['bleu'](references[:1], [systems[list(systems.keys())[0]][0]])
+                if 'bleu' not in test_result:
+                    return "Error: BLEU evaluator doesn't return 'bleu' key. Available keys: " + str(list(test_result.keys())), ""
+                metrics['bleu'] = lambda hyps, refs: evaluators['bleu'](refs, hyps)['bleu']
+            except Exception as bleu_error:
+                return f"Error testing BLEU evaluator: {str(bleu_error)}", ""
+
+        if 'ROUGE' in selected_test_metrics:
+            try:
+                test_result = evaluators['rouge'](references[:1], [systems[list(systems.keys())[0]][0]])
+                for rouge_key in ['rouge1', 'rouge2', 'rougeL']:
+                    if rouge_key not in test_result:
+                        return f"Error: ROUGE evaluator doesn't return '{rouge_key}' key. Available keys: " + str(list(test_result.keys())), ""
+                metrics['rouge1'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge1']
+                metrics['rouge2'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge2']
+                metrics['rougeL'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rougeL']
+            except Exception as rouge_error:
+                return f"Error testing ROUGE evaluator: {str(rouge_error)}", ""
+
+        if 'BERTScore' in selected_test_metrics:
+            try:
+                test_result = evaluators['bertscore'](references[:1], [systems[list(systems.keys())[0]][0]])
+                if 'bertscore' not in test_result:
+                    return "Error: BERTScore evaluator doesn't return 'bertscore' key. Available keys: " + str(list(test_result.keys())), ""
+                metrics['bertscore'] = lambda hyps, refs: evaluators['bertscore'](refs, hyps)['bertscore']
+            except Exception as bert_error:
+                return f"Error testing BERTScore evaluator: {str(bert_error)}", ""
+
+        if 'Word Count' in selected_test_metrics:
+            metrics['word_count'] = word_count_metric  # example of a simple custom-defined metric
+
+        if not metrics:
+            return "Error: Please select at least one metric for testing.", ""
+
+        # Run significance tests
+        try:
+            signatures, scores = compare_systems(
+                systems=systems,
+                metrics=metrics,
+                references=references,
+                n_samples=int(n_samples),
+                significance_level=float(significance_level),
+                print_results=False  # We don't need print output for online demo
+            )
+
+        except Exception as compare_error:
+            return f"Error during significance testing: {str(compare_error)}\n\nThis might be due to:\n1. Empty or invalid text content\n2. Incompatible metric configurations\n3. RadEval library issues", str(compare_error)
+
+        # Format results
+        results_text = "## 🧪 Hypothesis Testing Results\n\n"
+        results_text += f"**Parameters:**\n"
+        results_text += f"- Randomization samples: {n_samples}\n"
+        results_text += f"- Significance level: {significance_level}\n"
+        results_text += f"- Number of systems: {len(systems)}\n"
+        results_text += f"- Number of references: {len(references)}\n\n"
+
+        # Significant differences summary
+        results_text += "### Significant Differences Summary\n\n"
+        baseline_name = list(systems.keys())[0]  # Assume first one is the baseline
+        results_text += f"**Baseline system:** {baseline_name}\n\n"
+
+        has_significant_differences = False
+        for system_name in systems.keys():
+            if system_name == baseline_name:
+                continue
+
+            significant_metrics = []
+            for metric_name in metrics.keys():
+                pvalue_key = f"{metric_name}_pvalue"
+                if pvalue_key in scores[system_name]:
+                    p_val = scores[system_name][pvalue_key]
+                    if p_val < float(significance_level):
+                        significant_metrics.append(metric_name)
+
+            if significant_metrics:
+                results_text += f"**{system_name} vs {baseline_name}:** {', '.join(significant_metrics)} (p < {significance_level})\n\n"
+                has_significant_differences = True
+            else:
+                results_text += f"**{system_name} vs {baseline_name}:** No significant differences\n\n"
+
+        if not has_significant_differences:
+            results_text += "*No statistically significant differences found between systems.*\n\n"
+
+        # Add mean scores in table format
+        results_text += "### Mean Scores by System\n\n"
+        try:
+            baseline_name = list(systems.keys())[0]
+
+            # Display each system's results in a clean format
+            for system_name in systems.keys():
+                results_text += f"**{system_name.upper()}:**\n\n"
+
+                # Create table header
+                results_text += "| Metric | Score | P-value |\n"
+                results_text += "|--------|-------|----------|\n"
+
+                # Get system data from scores
+                system_scores = scores.get(system_name, {})
+
+                # Add rows for each metric
+                for metric_name in metrics.keys():
+                    if metric_name in system_scores:
+                        score = system_scores[metric_name]
+                        pvalue_key = f"{metric_name}_pvalue"
+
+                        # Format score
+                        score_str = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)
+
+                        # Format p-value (only for non-baseline systems)
+                        if system_name != baseline_name and pvalue_key in system_scores:
+                            pvalue = system_scores[pvalue_key]
+                            pvalue_str = f"{pvalue:.4f}" if isinstance(pvalue, (int, float)) else str(pvalue)
+                            # Mark significant p-values
+                            if isinstance(pvalue, (int, float)) and pvalue < float(significance_level):
+                                pvalue_str += " *"
+                        else:
+                            pvalue_str = "-" if system_name == baseline_name else "N/A"
+
+                        results_text += f"| {metric_name} | {score_str} | {pvalue_str} |\n"
+
+                results_text += "\n"
+
+            results_text += "*Note: The baseline system shows scores only. Other systems show scores and p-values compared to the baseline.*\n"
+            results_text += f"*P-values marked with * are significant (p < {significance_level}).*\n\n"
+
+        except Exception as score_error:
+            results_text += f"Error formatting scores: {str(score_error)}\n\n"
+
+        return results_text
+
+    except ImportError as e:
+        return f"Import Error: {str(e)}. Please ensure RadEval with compare_systems is installed."
+    except json.JSONDecodeError:
+        return "Error: Invalid JSON format in systems data."
+    except Exception as e:
+        return f"Testing Error: {str(e)}"
+
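`run_hypothesis_testing` is essentially a validation-and-formatting wrapper around `compare_systems`. Stripped of the UI plumbing, the core call looks roughly like this; the keyword arguments, the `(signatures, scores)` return value, and the `"<metric>_pvalue"` keys all mirror the function above and should be read as assumptions about the RadEval API rather than documented behavior:

```python
# Standalone sketch of the compare_systems call wrapped by run_hypothesis_testing.
from RadEval import RadEval, compare_systems

references = [
    "No acute cardiopulmonary process.",
    "Mild cardiomegaly with clear lung fields.",
]
systems = {
    "baseline": ["No acute findings.", "Mild cardiomegaly, clear lungs."],
    "improved": ["No acute cardiopulmonary process.", "Mild cardiomegaly with clear lung fields bilaterally."],
}

bleu = RadEval(do_bleu=True)

def word_count(hyps, refs):
    # any callable with the signature (hyps, refs) -> float can serve as a custom metric
    return sum(len(h.split()) for h in hyps) / len(hyps)

metrics = {
    "bleu": lambda hyps, refs: bleu(refs, hyps)["bleu"],
    "word_count": word_count,
}

signatures, scores = compare_systems(
    systems=systems,
    metrics=metrics,
    references=references,
    n_samples=50,
    significance_level=0.05,
    print_results=False,
)

# Mean scores live at scores[system][metric]; non-baseline systems also carry a
# scores[system]["<metric>_pvalue"] entry for the comparison against the first system.
print(scores["improved"]["bleu"], scores["improved"].get("bleu_pvalue"))
```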
+# Create Hypothesis Testing UI
+with gr.Blocks(title="Null Hypothesis Testing", theme=gr.themes.Soft()) as hypothesis_demo:
+    gr.Markdown(
+        """
+        # 🖥️ Null Hypothesis Testing
+
+        **Statistical significance testing** for comparing multiple radiology report generation systems.
+        This tool uses **randomization-based significance testing** to determine if differences between systems are statistically meaningful.
+
+        **⚠️ Performance Warning ⚠️**
+
+        Hypothesis testing with multiple metrics may take some time, especially with larger sample sizes. Please be patient during computation.
+        """
+    )
+
+    with gr.Row():
+        with gr.Column(scale=1.5):
+            systems_input = gr.Textbox(
+                label="Systems Data (JSON Format)",
+                lines=18,
+                placeholder="""Enter systems data in JSON format, e.g.:
+{
+    "references": [
+        "No acute cardiopulmonary process.",
+        "Mild cardiomegaly with clear lung fields."
+    ],
+    "systems": {
+        "baseline": [
+            "No acute findings.",
+            "Mild cardiomegaly, clear lungs."
+        ],
+        "improved": [
+            "No acute cardiopulmonary process.",
+            "Mild cardiomegaly with clear lung fields bilaterally."
+        ]
+    }
+}""",
+                info="Provide reference reports and multiple systems to compare"
+            )
+
+        with gr.Column(scale=1):
+            test_metrics_selection = gr.CheckboxGroup(
+                label="🎯 Select Metrics for Testing",
+                choices=["BLEU", "ROUGE", "BERTScore", "Word Count"],
+                value=["BLEU", "ROUGE", "BERTScore"],
+                interactive=True,
+                info="Only fast metrics are shown to ensure quick evaluation (slow ones are excluded)"
+            )
+
+            n_samples_input = gr.Number(
+                label="Randomization Samples",
+                value=50,
+                minimum=10,
+                maximum=1000,
+                step=10,
+                info="Number of randomization samples (higher = more confidence, but slower)"
+            )
+
+            significance_level_input = gr.Number(
+                label="Significance Level (α)",
+                value=0.05,
+                minimum=0.01,
+                maximum=0.10,
+                step=0.01,
+                info="Alpha level for significance testing"
+            )
+
+            example_button = gr.Button("Load Example Data", variant="secondary")
+            clear_button = gr.Button("Clear Data", variant="secondary")
+
+
+    with gr.Row():
+        test_button = gr.Button("🧪 Run Hypothesis Testing", variant="primary", size="lg")
+
+    with gr.Row():
+        test_results = gr.Markdown(
+            value="**Test results will appear here...**\n\nClick 'Load Example Data' to see sample input, then click 'Run Hypothesis Testing' to see results."
+        )
+
+    # Example data button
+    def load_example_data():
+        example_data = {
+            "references": [
+                "No acute cardiopulmonary process.",
+                "No radiographic findings to suggest pneumonia.",
+                "Mild cardiomegaly with clear lung fields.",
+                "Small pleural effusion on the right side.",
+                "Status post cardiac surgery with stable appearance."
+            ],
+            "systems": {
+                "baseline": [
+                    "No acute findings.",
+                    "No pneumonia.",
+                    "Mild cardiomegaly, clear lungs.",
+                    "Small right pleural effusion.",
+                    "Post-cardiac surgery, stable."
+                ],
+                "improved": [
+                    "No acute cardiopulmonary process.",
+                    "No radiographic findings suggesting pneumonia.",
+                    "Mild cardiomegaly with clear lung fields bilaterally.",
+                    "Small pleural effusion present on the right side.",
+                    "Status post cardiac surgery with stable appearance."
+                ],
+                "poor": [
+                    "Normal.",
+                    "OK.",
+                    "Heart big.",
+                    "Some fluid.",
+                    "Surgery done."
+                ]
+            }
+        }
+        import json
+        return json.dumps(example_data, indent=2)
+
+    example_button.click(
+        load_example_data,
+        outputs=systems_input
+    )
+
+    clear_button.click(
+        lambda: "",
+        outputs=systems_input
+    )
+
+    test_button.click(
+        run_hypothesis_testing,
+        inputs=[systems_input, test_metrics_selection, n_samples_input, significance_level_input],
+        outputs=[test_results]
+    )
+
+    with gr.Accordion("💡 Hypothesis Testing Information", open=False):
+        gr.Markdown(
+            """
+            ### 🔬 How it Works:
+
+            This tool performs **randomization-based significance testing** to compare multiple systems:
+
+            1. **Null Hypothesis**: No difference between systems
+            2. **Randomization**: Randomly permute system outputs multiple times
+            3. **P-value Calculation**: Proportion of permutations where the random difference ≥ the observed difference
+            4. **Significance**: If p-value < α, reject the null hypothesis (systems are significantly different)
+
+            ### Input Format:
+            - **References**: Ground truth reports
+            - **Systems**: Multiple systems to compare (each with the same number of outputs as references)
+            - **Metrics**: Evaluation metrics to use for comparison
+
+            ### Output:
+            - **Significance Matrix**: P-values for all pairwise system comparisons
+            - **Mean Scores**: Average performance of each system on each metric
+            - **Bold p-values**: Indicate statistically significant differences
+
+            ### ⚡ Performance:
+            - **Fast Metrics Only**: This tool only includes BLEU, ROUGE, BERTScore, and Word Count for optimal performance
+            - **Excluded Slow Metrics**: RadGraph F1 and CheXbert F1 are excluded to ensure reasonable computation time
+            - More randomization samples = more accurate p-values but slower computation
+            - Recommended: 50-100 samples for quick testing, 1000+ for publication
+            """
+        )
+
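The accordion's numbered steps describe a paired randomization (permutation) test. Purely as an illustration of that procedure, and not RadEval's actual implementation inside `compare_systems`, a self-contained sketch:

```python
# Toy paired randomization test: for each resample, the two systems' outputs are
# randomly swapped per report; the p-value is the fraction of resamples whose
# score difference is at least as large as the observed one.
import random

def randomization_test(metric, sys_a, sys_b, refs, n_samples=1000, seed=0):
    rng = random.Random(seed)
    observed = abs(metric(sys_a, refs) - metric(sys_b, refs))
    at_least_as_large = 0
    for _ in range(n_samples):
        perm_a, perm_b = [], []
        for a, b in zip(sys_a, sys_b):
            if rng.random() < 0.5:   # randomly swap the paired outputs
                a, b = b, a
            perm_a.append(a)
            perm_b.append(b)
        if abs(metric(perm_a, refs) - metric(perm_b, refs)) >= observed:
            at_least_as_large += 1
    return at_least_as_large / n_samples   # reject the null hypothesis when below alpha

def word_count(hyps, refs):
    # same toy metric used elsewhere in this app
    return sum(len(h.split()) for h in hyps) / len(hyps)

refs = ["No acute cardiopulmonary process.", "Mild cardiomegaly with clear lung fields."]
baseline = ["No acute findings.", "Heart big."]
improved = ["No acute cardiopulmonary process.", "Mild cardiomegaly with clear lung fields bilaterally."]

# With only two reports the p-value is very coarse; real comparisons need more data.
print(randomization_test(word_count, baseline, improved, refs, n_samples=500))
```

With a real metric plugged in, the returned fraction is compared against α: small values mean the observed gap is unlikely to arise from randomly swapping the two systems' outputs.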
+# Combine both demos using gr.Blocks to add a header
+with gr.Blocks(
+    title="RadEval: A framework for radiology text evaluation",
+    theme=gr.themes.Soft(),
+    css="""
+    .tab-nav button {
+        font-weight: bold !important;
+        border: 2px solid #e0e7ff !important;
+        border-radius: 10px !important;
+        margin: 0 5px !important;
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+        color: white !important;
+        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2) !important;
+        transition: all 0.3s ease !important;
+    }
+    .tab-nav button:hover {
+        transform: translateY(-2px) !important;
+        box-shadow: 0 6px 20px rgba(0, 0, 0, 0.3) !important;
+        background: linear-gradient(135deg, #764ba2 0%, #667eea 100%) !important;
+    }
+    .tab-nav button.selected {
+        background: linear-gradient(135deg, #ff6b6b 0%, #ee5a24 100%) !important;
+        border-color: #ff6b6b !important;
+        transform: translateY(-1px) !important;
+        box-shadow: 0 8px 25px rgba(255, 107, 107, 0.4) !important;
+    }
+    """
+) as combined_demo:
+    gr.Markdown(
+        """
+        # 🩺 RadEval: A framework for radiology text evaluation
+        ### [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEval_ModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()
+
+        """
+    )
+
+    tabs = gr.TabbedInterface(
+        [demo, hypothesis_demo],
+        ["RadEval Evaluation", "🖥️ Null Hypothesis Testing"]
+    )
+
 if __name__ == "__main__":
-
+    combined_demo.launch()