X-iZhang committed on
Commit
ca5d05c
·
verified ·
1 Parent(s): a44510e

Update app.py

Files changed (1)
  1. app.py +408 -4
app.py CHANGED
@@ -154,11 +154,10 @@ available_metrics = [
 default_metrics = ["BLEU", "ROUGE", "BERTScore"]
 
 
-with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme=gr.themes.Soft()) as demo:
+with gr.Blocks(title="RadEval Evaluation", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-        # 🩺 RadEval: A framework for radiology text evaluation
-        [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval/) | [Video](https://justin13601.github.io/files/radeval.mp4) |[arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()
+        # 🏎️ RadEval Evaluation
 
         **RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.
 
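For readers new to the library, the evaluation API this app wraps can be exercised directly. A minimal usage sketch, inferred from the constructor flags and the `(refs, hyps)` call order used later in this diff (the exact set of returned keys may differ):

    # Minimal RadEval usage sketch, based on how app.py calls the library below.
    from RadEval import RadEval

    refs = ["No acute cardiopulmonary process."]
    hyps = ["No acute findings."]

    # Each do_* flag enables one metric family; the app builds one evaluator per metric,
    # and combining several flags in a single evaluator is assumed to work the same way.
    evaluator = RadEval(do_bleu=True, do_rouge=True, do_bertscore=True)
    scores = evaluator(refs, hyps)  # dict mapping metric names (e.g. "bleu", "rouge1") to scores
    print(scores)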
@@ -256,5 +255,410 @@ with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme
         outputs=[analysis_output, table_output]
     )
 
+# =============================================================================
+# 🧪 Hypothesis Testing Section
+# =============================================================================
+
+def run_hypothesis_testing(systems_data, selected_test_metrics, n_samples, significance_level):
+    """
+    Run statistical significance testing between multiple systems
+    """
+    try:
+        from RadEval import RadEval, compare_systems
+
+        # Parse systems data (expecting JSON format)
+        import json
+        systems_dict = json.loads(systems_data)
+
+        # Extract references and systems
+        if 'references' not in systems_dict or 'systems' not in systems_dict:
+            return "Error: Please provide both 'references' and 'systems' in the JSON data."
+
+        references = systems_dict['references']
+        systems = systems_dict['systems']
+
+        # Validate data integrity
+        if not references or not systems:
+            return "Error: References and systems cannot be empty."
+
+        if not isinstance(references, list) or not isinstance(systems, dict):
+            return "Error: References must be a list and systems must be a dictionary."
+
+        # Check that all systems have the same number of outputs as references
+        ref_count = len(references)
+        for system_name, system_outputs in systems.items():
+            if not isinstance(system_outputs, list):
+                return f"Error: System '{system_name}' outputs must be a list."
+            if len(system_outputs) != ref_count:
+                return f"Error: System '{system_name}' has {len(system_outputs)} outputs but {ref_count} references provided."
+
+        # Validate that all texts are non-empty strings
+        for i, ref in enumerate(references):
+            if not isinstance(ref, str) or not ref.strip():
+                return f"Error: Reference {i+1} is empty or not a string."
+
+        for system_name, system_outputs in systems.items():
+            for i, output in enumerate(system_outputs):
+                if not isinstance(output, str) or not output.strip():
+                    return f"Error: System '{system_name}' output {i+1} is empty or not a string."
+
+        # Initialize evaluators based on selected metrics (fast metrics only)
+        evaluators = {}
+        if 'BLEU' in selected_test_metrics:
+            evaluators['bleu'] = RadEval(do_bleu=True)
+        if 'ROUGE' in selected_test_metrics:
+            evaluators['rouge'] = RadEval(do_rouge=True)
+        if 'BERTScore' in selected_test_metrics:
+            evaluators['bertscore'] = RadEval(do_bertscore=True)
+
+        # Custom metric: average word count
+        def word_count_metric(hyps, refs):
+            return sum(len(report.split()) for report in hyps) / len(hyps)
+
+        # Build metrics dictionary (following the example structure)
+        metrics = {}
+        if 'BLEU' in selected_test_metrics:
+            # Test the evaluator first
+            try:
+                test_result = evaluators['bleu'](references[:1], [systems[list(systems.keys())[0]][0]])
+                if 'bleu' not in test_result:
+                    return "Error: BLEU evaluator doesn't return 'bleu' key. Available keys: " + str(list(test_result.keys()))
+                metrics['bleu'] = lambda hyps, refs: evaluators['bleu'](refs, hyps)['bleu']
+            except Exception as bleu_error:
+                return f"Error testing BLEU evaluator: {str(bleu_error)}"
+
+        if 'ROUGE' in selected_test_metrics:
+            try:
+                test_result = evaluators['rouge'](references[:1], [systems[list(systems.keys())[0]][0]])
+                for rouge_key in ['rouge1', 'rouge2', 'rougeL']:
+                    if rouge_key not in test_result:
+                        return f"Error: ROUGE evaluator doesn't return '{rouge_key}' key. Available keys: " + str(list(test_result.keys()))
+                metrics['rouge1'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge1']
+                metrics['rouge2'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge2']
+                metrics['rougeL'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rougeL']
+            except Exception as rouge_error:
+                return f"Error testing ROUGE evaluator: {str(rouge_error)}"
+
+        if 'BERTScore' in selected_test_metrics:
+            try:
+                test_result = evaluators['bertscore'](references[:1], [systems[list(systems.keys())[0]][0]])
+                if 'bertscore' not in test_result:
+                    return "Error: BERTScore evaluator doesn't return 'bertscore' key. Available keys: " + str(list(test_result.keys()))
+                metrics['bertscore'] = lambda hyps, refs: evaluators['bertscore'](refs, hyps)['bertscore']
+            except Exception as bert_error:
+                return f"Error testing BERTScore evaluator: {str(bert_error)}"
+
+        if 'Word Count' in selected_test_metrics:
+            metrics['word_count'] = word_count_metric  # ← example of a simple custom-defined metric
+
+        if not metrics:
+            return "Error: Please select at least one metric for testing."
+
+        # Run significance tests
+        try:
+            signatures, scores = compare_systems(
+                systems=systems,
+                metrics=metrics,
+                references=references,
+                n_samples=int(n_samples),
+                significance_level=float(significance_level),
+                print_results=False  # We don't need print output for online demo
+            )
+
+        except Exception as compare_error:
+            return f"Error during significance testing: {str(compare_error)}\n\nThis might be due to:\n1. Empty or invalid text content\n2. Incompatible metric configurations\n3. RadEval library issues"
+
+        # Format results
+        results_text = "## 🧪 Hypothesis Testing Results\n\n"
+        results_text += f"**Parameters:**\n"
+        results_text += f"- Randomization samples: {n_samples}\n"
+        results_text += f"- Significance level: {significance_level}\n"
+        results_text += f"- Number of systems: {len(systems)}\n"
+        results_text += f"- Number of references: {len(references)}\n\n"
+
+        # Significant differences summary
+        results_text += "### 📊 Significant Differences Summary\n\n"
+        baseline_name = list(systems.keys())[0]  # Assume first one is the baseline
+        results_text += f"**Baseline system:** {baseline_name}\n\n"
+
+        has_significant_differences = False
+        for system_name in systems.keys():
+            if system_name == baseline_name:
+                continue
+
+            significant_metrics = []
+            for metric_name in metrics.keys():
+                pvalue_key = f"{metric_name}_pvalue"
+                if pvalue_key in scores[system_name]:
+                    p_val = scores[system_name][pvalue_key]
+                    if p_val < float(significance_level):
+                        significant_metrics.append(metric_name)
+
+            if significant_metrics:
+                results_text += f"**{system_name} vs {baseline_name}:** {', '.join(significant_metrics)} (p < {significance_level})\n\n"
+                has_significant_differences = True
+            else:
+                results_text += f"**{system_name} vs {baseline_name}:** No significant differences\n\n"
+
+        if not has_significant_differences:
+            results_text += "*No statistically significant differences found between systems.*\n\n"
+
+        # Add mean scores in table format
+        results_text += "### 📈 Mean Scores by System\n\n"
+        try:
+            baseline_name = list(systems.keys())[0]
+
+            # Display each system's results in a clean format
+            for system_name in systems.keys():
+                results_text += f"**{system_name.upper()}:**\n\n"
+
+                # Create table header
+                results_text += "| Metric | Score | P-value |\n"
+                results_text += "|--------|-------|----------|\n"
+
+                # Get system data from scores
+                system_scores = scores.get(system_name, {})
+
+                # Add rows for each metric
+                for metric_name in metrics.keys():
+                    if metric_name in system_scores:
+                        score = system_scores[metric_name]
+                        pvalue_key = f"{metric_name}_pvalue"
+
+                        # Format score
+                        score_str = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)
+
+                        # Format p-value (only for non-baseline systems)
+                        if system_name != baseline_name and pvalue_key in system_scores:
+                            pvalue = system_scores[pvalue_key]
+                            pvalue_str = f"{pvalue:.4f}" if isinstance(pvalue, (int, float)) else str(pvalue)
+                            # Mark significant p-values
+                            if isinstance(pvalue, (int, float)) and pvalue < float(significance_level):
+                                pvalue_str += " *"
+                        else:
+                            pvalue_str = "-" if system_name == baseline_name else "N/A"
+
+                        results_text += f"| {metric_name} | {score_str} | {pvalue_str} |\n"
+
+                results_text += "\n"
+
+            results_text += "*Note: Baseline system shows scores only. Other systems show scores and p-values compared to the baseline.*\n"
+            results_text += f"*P-values marked with * are significant (p < {significance_level}).*\n\n"
+
+        except Exception as score_error:
+            results_text += f"Error formatting scores: {str(score_error)}\n\n"
+
+        return results_text
+
+    except ImportError as e:
+        return f"Import Error: {str(e)}. Please ensure RadEval with compare_systems is installed."
+    except json.JSONDecodeError:
+        return "Error: Invalid JSON format in systems data."
+    except Exception as e:
+        return f"Testing Error: {str(e)}"
+
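The significance test wired into `run_hypothesis_testing` above (and the UI that follows) can also be driven as a plain script. A minimal sketch, mirroring the `compare_systems` call and the `{metric}_pvalue` keys read above (the first system in the dict is treated as the baseline):

    # Standalone sketch of the randomization test used by run_hypothesis_testing.
    from RadEval import RadEval, compare_systems

    references = [
        "No acute cardiopulmonary process.",
        "Mild cardiomegaly with clear lung fields.",
    ]
    systems = {
        "baseline": ["No acute findings.", "Mild cardiomegaly, clear lungs."],
        "improved": [
            "No acute cardiopulmonary process.",
            "Mild cardiomegaly with clear lung fields bilaterally.",
        ],
    }

    bleu = RadEval(do_bleu=True)
    metrics = {
        # Each metric is a callable (hyps, refs) -> float.
        "bleu": lambda hyps, refs: bleu(refs, hyps)["bleu"],
        "word_count": lambda hyps, refs: sum(len(h.split()) for h in hyps) / len(hyps),
    }

    signatures, scores = compare_systems(
        systems=systems,
        metrics=metrics,
        references=references,
        n_samples=50,
        significance_level=0.05,
        print_results=True,
    )
    # scores["improved"]["bleu"] is the mean score; scores["improved"]["bleu_pvalue"]
    # is the randomization-test p-value against the baseline system.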
+# Create Hypothesis Testing UI
+with gr.Blocks(title="Null Hypothesis Testing", theme=gr.themes.Soft()) as hypothesis_demo:
+    gr.Markdown(
+        """
+        # 🖥️ Null Hypothesis Testing
+
+        **Statistical significance testing** for comparing multiple radiology report generation systems.
+        This tool uses **randomization-based significance testing** to determine if differences between systems are statistically meaningful.
+
+        **⚠️ Performance Warning ⚠️**
+
+        Hypothesis testing with multiple metrics may take some time, especially with larger sample sizes. Please be patient during computation.
+        """
+    )
+
+    with gr.Row():
+        with gr.Column(scale=1.5):
+            systems_input = gr.Textbox(
+                label="📊 Systems Data (JSON Format)",
+                lines=18,
+                placeholder="""Enter systems data in JSON format, e.g.:
+{
+    "references": [
+        "No acute cardiopulmonary process.",
+        "Mild cardiomegaly with clear lung fields."
+    ],
+    "systems": {
+        "baseline": [
+            "No acute findings.",
+            "Mild cardiomegaly, clear lungs."
+        ],
+        "improved": [
+            "No acute cardiopulmonary process.",
+            "Mild cardiomegaly with clear lung fields bilaterally."
+        ]
+    }
+}""",
+                info="Provide reference reports and multiple systems to compare"
+            )
+
+        with gr.Column(scale=1):
+            test_metrics_selection = gr.CheckboxGroup(
+                label="🎯 Select Metrics for Testing",
+                choices=["BLEU", "ROUGE", "BERTScore", "Word Count"],
+                value=["BLEU", "ROUGE", "BERTScore"],
+                interactive=True,
+                info="Only fast metrics are shown to ensure quick evaluation (slow ones are excluded)"
+            )
+
+            n_samples_input = gr.Number(
+                label="🔄 Randomization Samples",
+                value=50,
+                minimum=10,
+                maximum=1000,
+                step=10,
+                info="Number of randomization samples (higher = more confidence, but slower)"
+            )
+
+            significance_level_input = gr.Number(
+                label="📈 Significance Level (α)",
+                value=0.05,
+                minimum=0.01,
+                maximum=0.10,
+                step=0.01,
+                info="Alpha level for significance testing"
+            )
+
+            example_button = gr.Button("📝 Load Example Data", variant="secondary")
+            clear_button = gr.Button("🗑️ Clear Data", variant="secondary")
+
+
+    with gr.Row():
+        test_button = gr.Button("🧪 Run Hypothesis Testing", variant="primary", size="lg")
+
+    with gr.Row():
+        test_results = gr.Markdown(
+            value="📊 **Test results will appear here...**\n\nClick 'Load Example Data' to see sample input, then click 'Run Hypothesis Testing' to see results."
+        )
+
+    # Example data button
+    def load_example_data():
+        example_data = {
+            "references": [
+                "No acute cardiopulmonary process.",
+                "No radiographic findings to suggest pneumonia.",
+                "Mild cardiomegaly with clear lung fields.",
+                "Small pleural effusion on the right side.",
+                "Status post cardiac surgery with stable appearance."
+            ],
+            "systems": {
+                "baseline": [
+                    "No acute findings.",
+                    "No pneumonia.",
+                    "Mild cardiomegaly, clear lungs.",
+                    "Small right pleural effusion.",
+                    "Post-cardiac surgery, stable."
+                ],
+                "improved": [
+                    "No acute cardiopulmonary process.",
+                    "No radiographic findings suggesting pneumonia.",
+                    "Mild cardiomegaly with clear lung fields bilaterally.",
+                    "Small pleural effusion present on the right side.",
+                    "Status post cardiac surgery with stable appearance."
+                ],
+                "poor": [
+                    "Normal.",
+                    "OK.",
+                    "Heart big.",
+                    "Some fluid.",
+                    "Surgery done."
+                ]
+            }
+        }
+        import json
+        return json.dumps(example_data, indent=2)
+
+    example_button.click(
+        load_example_data,
+        outputs=systems_input
+    )
+
+    clear_button.click(
+        lambda: "",
+        outputs=systems_input
+    )
+
+    test_button.click(
+        run_hypothesis_testing,
+        inputs=[systems_input, test_metrics_selection, n_samples_input, significance_level_input],
+        outputs=[test_results]
+    )
+
+    with gr.Accordion("💡 Hypothesis Testing Information", open=False):
+        gr.Markdown(
+            """
+            ### 🔬 How it Works:
+
+            This tool performs **randomization-based significance testing** to compare multiple systems:
+
+            1. **Null Hypothesis**: No difference between systems
+            2. **Randomization**: Randomly permute system outputs multiple times
+            3. **P-value Calculation**: Proportion of permutations where random difference ≥ observed difference
+            4. **Significance**: If p-value < α, reject null hypothesis (systems are significantly different)
+
+            ### 📊 Input Format:
+            - **References**: Ground truth reports
+            - **Systems**: Multiple systems to compare (each with same number of outputs as references)
+            - **Metrics**: Evaluation metrics to use for comparison
+
+            ### 📈 Output:
+            - **Significance Matrix**: P-values for all pairwise system comparisons
+            - **Mean Scores**: Average performance of each system on each metric
+            - **Bold p-values**: Indicate statistically significant differences
+
+            ### ⚡ Performance:
+            - **Fast Metrics Only**: This tool only includes BLEU, ROUGE, BERTScore, and Word Count for optimal performance
+            - **Excluded Slow Metrics**: RadGraph F1, CheXbert F1 are excluded to ensure reasonable computation time
+            - More randomization samples = more accurate p-values but slower computation
+            - Recommended: 50-100 samples for quick testing, 1000+ for publication
+            """
+        )
+
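The randomization test summarized in the accordion above can be sketched in a few lines of plain Python. This is an illustration of the procedure, not RadEval's internal implementation; `metric` stands for any callable mapping (hyps, refs) to a float:

    import random

    def randomization_pvalue(metric, sys_a, sys_b, refs, n_samples=1000, seed=0):
        """Paired randomization test for the score difference between two systems."""
        rng = random.Random(seed)
        observed = abs(metric(sys_a, refs) - metric(sys_b, refs))
        exceed = 0
        for _ in range(n_samples):
            # Randomly swap the two systems' outputs per report, keeping reference alignment.
            a, b = [], []
            for out_a, out_b in zip(sys_a, sys_b):
                swap = rng.random() < 0.5
                a.append(out_b if swap else out_a)
                b.append(out_a if swap else out_b)
            if abs(metric(a, refs) - metric(b, refs)) >= observed:
                exceed += 1
        return exceed / n_samples

With α = 0.05, a p-value below 0.05 rejects the null hypothesis of no difference between the two systems.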
+# Combine both demos using gr.Blocks to add a header
+with gr.Blocks(
+    title="RadEval: A framework for radiology text evaluation",
+    theme=gr.themes.Soft(),
+    css="""
+    .tab-nav button {
+        font-weight: bold !important;
+        border: 2px solid #e0e7ff !important;
+        border-radius: 10px !important;
+        margin: 0 5px !important;
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+        color: white !important;
+        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2) !important;
+        transition: all 0.3s ease !important;
+    }
+    .tab-nav button:hover {
+        transform: translateY(-2px) !important;
+        box-shadow: 0 6px 20px rgba(0, 0, 0, 0.3) !important;
+        background: linear-gradient(135deg, #764ba2 0%, #667eea 100%) !important;
+    }
+    .tab-nav button.selected {
+        background: linear-gradient(135deg, #ff6b6b 0%, #ee5a24 100%) !important;
+        border-color: #ff6b6b !important;
+        transform: translateY(-1px) !important;
+        box-shadow: 0 8px 25px rgba(255, 107, 107, 0.4) !important;
+    }
+    """
+) as combined_demo:
+    gr.Markdown(
+        """
+        # 🩺 RadEval: A framework for radiology text evaluation
+        ### [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEval_ModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()
+
+        """
+    )
+
+    tabs = gr.TabbedInterface(
+        [demo, hypothesis_demo],
+        ["🏎️ RadEval Evaluation", "🖥️ Null Hypothesis Testing"]
+    )
+
 if __name__ == "__main__":
-    demo.launch()
+    combined_demo.launch()