X-iZhang committed on
Commit
ca5d05c
·
verified ·
1 Parent(s): a44510e

Update app.py

Files changed (1)
  1. app.py +408 -4
app.py CHANGED
@@ -154,11 +154,10 @@ available_metrics = [
 default_metrics = ["BLEU", "ROUGE", "BERTScore"]
 
 
-with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme=gr.themes.Soft()) as demo:
+with gr.Blocks(title="RadEval Evaluation", theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-        # 🩺 RadEval: A framework for radiology text evaluation
-        [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval/) | [Video](https://justin13601.github.io/files/radeval.mp4) |[arXiv]() | [RadEvalModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()
+        # 🏎️ RadEval Evaluation
 
         **RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.
 
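For readers new to the library, the evaluation API this app wraps can be exercised directly. A minimal usage sketch, inferred from the constructor flags and the `(refs, hyps)` call order used later in this diff (the exact set of returned keys may differ):

    # Minimal RadEval usage sketch, based on how app.py calls the library below.
    from RadEval import RadEval

    refs = ["No acute cardiopulmonary process."]
    hyps = ["No acute findings."]

    # Each do_* flag enables one metric family; the app builds one evaluator per metric,
    # and combining several flags in a single evaluator is assumed to work the same way.
    evaluator = RadEval(do_bleu=True, do_rouge=True, do_bertscore=True)
    scores = evaluator(refs, hyps)  # dict mapping metric names (e.g. "bleu", "rouge1") to scores
    print(scores)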
@@ -256,5 +255,410 @@ with gr.Blocks(title="RadEval: A framework for radiology text evaluation", theme
         outputs=[analysis_output, table_output]
     )
 
+# =============================================================================
+# 🧪 Hypothesis Testing Section
+# =============================================================================
+
+def run_hypothesis_testing(systems_data, selected_test_metrics, n_samples, significance_level):
+    """
+    Run statistical significance testing between multiple systems
+    """
+    try:
+        from RadEval import RadEval, compare_systems
+
+        # Parse systems data (expecting JSON format)
+        import json
+        systems_dict = json.loads(systems_data)
+
+        # Extract references and systems
+        if 'references' not in systems_dict or 'systems' not in systems_dict:
+            return "Error: Please provide both 'references' and 'systems' in the JSON data."
+
+        references = systems_dict['references']
+        systems = systems_dict['systems']
+
+        # Validate data integrity
+        if not references or not systems:
+            return "Error: References and systems cannot be empty."
+
+        if not isinstance(references, list) or not isinstance(systems, dict):
+            return "Error: References must be a list and systems must be a dictionary."
+
+        # Check that all systems have the same number of outputs as references
+        ref_count = len(references)
+        for system_name, system_outputs in systems.items():
+            if not isinstance(system_outputs, list):
+                return f"Error: System '{system_name}' outputs must be a list."
+            if len(system_outputs) != ref_count:
+                return f"Error: System '{system_name}' has {len(system_outputs)} outputs but {ref_count} references provided."
+
+        # Validate that all texts are non-empty strings
+        for i, ref in enumerate(references):
+            if not isinstance(ref, str) or not ref.strip():
+                return f"Error: Reference {i+1} is empty or not a string."
+
+        for system_name, system_outputs in systems.items():
+            for i, output in enumerate(system_outputs):
+                if not isinstance(output, str) or not output.strip():
+                    return f"Error: System '{system_name}' output {i+1} is empty or not a string."
+
+        # Initialize evaluators based on selected metrics (fast metrics only)
+        evaluators = {}
+        if 'BLEU' in selected_test_metrics:
+            evaluators['bleu'] = RadEval(do_bleu=True)
+        if 'ROUGE' in selected_test_metrics:
+            evaluators['rouge'] = RadEval(do_rouge=True)
+        if 'BERTScore' in selected_test_metrics:
+            evaluators['bertscore'] = RadEval(do_bertscore=True)
+
+        # Custom metric: average word count
+        def word_count_metric(hyps, refs):
+            return sum(len(report.split()) for report in hyps) / len(hyps)
+
+        # Build metrics dictionary (following the example structure)
+        metrics = {}
+        if 'BLEU' in selected_test_metrics:
+            # Test the evaluator first
+            try:
+                test_result = evaluators['bleu'](references[:1], [systems[list(systems.keys())[0]][0]])
+                if 'bleu' not in test_result:
+                    return "Error: BLEU evaluator doesn't return 'bleu' key. Available keys: " + str(list(test_result.keys()))
+                metrics['bleu'] = lambda hyps, refs: evaluators['bleu'](refs, hyps)['bleu']
+            except Exception as bleu_error:
+                return f"Error testing BLEU evaluator: {str(bleu_error)}"
+
+        if 'ROUGE' in selected_test_metrics:
+            try:
+                test_result = evaluators['rouge'](references[:1], [systems[list(systems.keys())[0]][0]])
+                for rouge_key in ['rouge1', 'rouge2', 'rougeL']:
+                    if rouge_key not in test_result:
+                        return f"Error: ROUGE evaluator doesn't return '{rouge_key}' key. Available keys: " + str(list(test_result.keys()))
+                metrics['rouge1'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge1']
+                metrics['rouge2'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge2']
+                metrics['rougeL'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rougeL']
+            except Exception as rouge_error:
+                return f"Error testing ROUGE evaluator: {str(rouge_error)}"
+
+        if 'BERTScore' in selected_test_metrics:
+            try:
+                test_result = evaluators['bertscore'](references[:1], [systems[list(systems.keys())[0]][0]])
+                if 'bertscore' not in test_result:
+                    return "Error: BERTScore evaluator doesn't return 'bertscore' key. Available keys: " + str(list(test_result.keys()))
+                metrics['bertscore'] = lambda hyps, refs: evaluators['bertscore'](refs, hyps)['bertscore']
+            except Exception as bert_error:
+                return f"Error testing BERTScore evaluator: {str(bert_error)}"
+
+        if 'Word Count' in selected_test_metrics:
+            metrics['word_count'] = word_count_metric  # ← example of a simple custom-defined metric
+
+        if not metrics:
+            return "Error: Please select at least one metric for testing."
+
+        # Run significance tests
+        try:
+            signatures, scores = compare_systems(
+                systems=systems,
+                metrics=metrics,
+                references=references,
+                n_samples=int(n_samples),
+                significance_level=float(significance_level),
+                print_results=False  # We don't need print output for online demo
+            )
+
+        except Exception as compare_error:
+            return f"Error during significance testing: {str(compare_error)}\n\nThis might be due to:\n1. Empty or invalid text content\n2. Incompatible metric configurations\n3. RadEval library issues"
+
+        # Format results
+        results_text = "## 🧪 Hypothesis Testing Results\n\n"
+        results_text += f"**Parameters:**\n"
+        results_text += f"- Randomization samples: {n_samples}\n"
+        results_text += f"- Significance level: {significance_level}\n"
+        results_text += f"- Number of systems: {len(systems)}\n"
+        results_text += f"- Number of references: {len(references)}\n\n"
+
+        # Significant differences summary
+        results_text += "### 📊 Significant Differences Summary\n\n"
+        baseline_name = list(systems.keys())[0]  # Assume first one is the baseline
+        results_text += f"**Baseline system:** {baseline_name}\n\n"
+
+        has_significant_differences = False
+        for system_name in systems.keys():
+            if system_name == baseline_name:
+                continue
+
+            significant_metrics = []
+            for metric_name in metrics.keys():
+                pvalue_key = f"{metric_name}_pvalue"
+                if pvalue_key in scores[system_name]:
+                    p_val = scores[system_name][pvalue_key]
+                    if p_val < float(significance_level):
+                        significant_metrics.append(metric_name)
+
+            if significant_metrics:
+                results_text += f"**{system_name} vs {baseline_name}:** {', '.join(significant_metrics)} (p < {significance_level})\n\n"
+                has_significant_differences = True
+            else:
+                results_text += f"**{system_name} vs {baseline_name}:** No significant differences\n\n"
+
+        if not has_significant_differences:
+            results_text += "*No statistically significant differences found between systems.*\n\n"
+
+        # Add mean scores in table format
+        results_text += "### 📈 Mean Scores by System\n\n"
+        try:
+            baseline_name = list(systems.keys())[0]
+
+            # Display each system's results in a clean format
+            for system_name in systems.keys():
+                results_text += f"**{system_name.upper()}:**\n\n"
+
+                # Create table header
+                results_text += "| Metric | Score | P-value |\n"
+                results_text += "|--------|-------|----------|\n"
+
+                # Get system data from scores
+                system_scores = scores.get(system_name, {})
+
+                # Add rows for each metric
+                for metric_name in metrics.keys():
+                    if metric_name in system_scores:
+                        score = system_scores[metric_name]
+                        pvalue_key = f"{metric_name}_pvalue"
+
+                        # Format score
+                        score_str = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)
+
+                        # Format p-value (only for non-baseline systems)
+                        if system_name != baseline_name and pvalue_key in system_scores:
+                            pvalue = system_scores[pvalue_key]
+                            pvalue_str = f"{pvalue:.4f}" if isinstance(pvalue, (int, float)) else str(pvalue)
+                            # Mark significant p-values
+                            if isinstance(pvalue, (int, float)) and pvalue < float(significance_level):
+                                pvalue_str += " *"
+                        else:
+                            pvalue_str = "-" if system_name == baseline_name else "N/A"
+
+                        results_text += f"| {metric_name} | {score_str} | {pvalue_str} |\n"
+
+                results_text += "\n"
+
+            results_text += "*Note: Baseline system shows scores only. Other systems show scores and p-values compared to the baseline.*\n"
+            results_text += f"*P-values marked with * are significant (p < {significance_level}).*\n\n"
+
+        except Exception as score_error:
+            results_text += f"Error formatting scores: {str(score_error)}\n\n"
+
+        return results_text
+
+    except ImportError as e:
+        return f"Import Error: {str(e)}. Please ensure RadEval with compare_systems is installed."
+    except json.JSONDecodeError:
+        return "Error: Invalid JSON format in systems data."
+    except Exception as e:
+        return f"Testing Error: {str(e)}"
+
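The significance test wired into `run_hypothesis_testing` above (and the UI that follows) can also be driven as a plain script. A minimal sketch, mirroring the `compare_systems` call and the `{metric}_pvalue` keys read above (the first system in the dict is treated as the baseline):

    # Standalone sketch of the randomization test used by run_hypothesis_testing.
    from RadEval import RadEval, compare_systems

    references = [
        "No acute cardiopulmonary process.",
        "Mild cardiomegaly with clear lung fields.",
    ]
    systems = {
        "baseline": ["No acute findings.", "Mild cardiomegaly, clear lungs."],
        "improved": [
            "No acute cardiopulmonary process.",
            "Mild cardiomegaly with clear lung fields bilaterally.",
        ],
    }

    bleu = RadEval(do_bleu=True)
    metrics = {
        # Each metric is a callable (hyps, refs) -> float.
        "bleu": lambda hyps, refs: bleu(refs, hyps)["bleu"],
        "word_count": lambda hyps, refs: sum(len(h.split()) for h in hyps) / len(hyps),
    }

    signatures, scores = compare_systems(
        systems=systems,
        metrics=metrics,
        references=references,
        n_samples=50,
        significance_level=0.05,
        print_results=True,
    )
    # scores["improved"]["bleu"] is the mean score; scores["improved"]["bleu_pvalue"]
    # is the randomization-test p-value against the baseline system.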
+# Create Hypothesis Testing UI
+with gr.Blocks(title="Null Hypothesis Testing", theme=gr.themes.Soft()) as hypothesis_demo:
+    gr.Markdown(
+        """
+        # 🖥️ Null Hypothesis Testing
+
+        **Statistical significance testing** for comparing multiple radiology report generation systems.
+        This tool uses **randomization-based significance testing** to determine if differences between systems are statistically meaningful.
+
+        **⚠️ Performance Warning ⚠️**
+
+        Hypothesis testing with multiple metrics may take some time, especially with larger sample sizes. Please be patient during computation.
+        """
+    )
+
+    with gr.Row():
+        with gr.Column(scale=1.5):
+            systems_input = gr.Textbox(
+                label="📊 Systems Data (JSON Format)",
+                lines=18,
+                placeholder="""Enter systems data in JSON format, e.g.:
+{
+    "references": [
+        "No acute cardiopulmonary process.",
+        "Mild cardiomegaly with clear lung fields."
+    ],
+    "systems": {
+        "baseline": [
+            "No acute findings.",
+            "Mild cardiomegaly, clear lungs."
+        ],
+        "improved": [
+            "No acute cardiopulmonary process.",
+            "Mild cardiomegaly with clear lung fields bilaterally."
+        ]
+    }
+}""",
+                info="Provide reference reports and multiple systems to compare"
+            )
+
+        with gr.Column(scale=1):
+            test_metrics_selection = gr.CheckboxGroup(
+                label="🎯 Select Metrics for Testing",
+                choices=["BLEU", "ROUGE", "BERTScore", "Word Count"],
+                value=["BLEU", "ROUGE", "BERTScore"],
+                interactive=True,
+                info="Only fast metrics are shown to ensure quick evaluation (slow ones are excluded)"
+            )
+
+            n_samples_input = gr.Number(
+                label="🔄 Randomization Samples",
+                value=50,
+                minimum=10,
+                maximum=1000,
+                step=10,
+                info="Number of randomization samples (higher = more confidence, but slower)"
+            )
+
+            significance_level_input = gr.Number(
+                label="📈 Significance Level (α)",
+                value=0.05,
+                minimum=0.01,
+                maximum=0.10,
+                step=0.01,
+                info="Alpha level for significance testing"
+            )
+
+            example_button = gr.Button("📝 Load Example Data", variant="secondary")
+            clear_button = gr.Button("🗑️ Clear Data", variant="secondary")
+
+
+    with gr.Row():
+        test_button = gr.Button("🧪 Run Hypothesis Testing", variant="primary", size="lg")
+
+    with gr.Row():
+        test_results = gr.Markdown(
+            value="📊 **Test results will appear here...**\n\nClick 'Load Example Data' to see sample input, then click 'Run Hypothesis Testing' to see results."
+        )
+
+    # Example data button
+    def load_example_data():
+        example_data = {
+            "references": [
+                "No acute cardiopulmonary process.",
+                "No radiographic findings to suggest pneumonia.",
+                "Mild cardiomegaly with clear lung fields.",
+                "Small pleural effusion on the right side.",
+                "Status post cardiac surgery with stable appearance."
+            ],
+            "systems": {
+                "baseline": [
+                    "No acute findings.",
+                    "No pneumonia.",
+                    "Mild cardiomegaly, clear lungs.",
+                    "Small right pleural effusion.",
+                    "Post-cardiac surgery, stable."
+                ],
+                "improved": [
+                    "No acute cardiopulmonary process.",
+                    "No radiographic findings suggesting pneumonia.",
+                    "Mild cardiomegaly with clear lung fields bilaterally.",
+                    "Small pleural effusion present on the right side.",
+                    "Status post cardiac surgery with stable appearance."
+                ],
+                "poor": [
+                    "Normal.",
+                    "OK.",
+                    "Heart big.",
+                    "Some fluid.",
+                    "Surgery done."
+                ]
+            }
+        }
+        import json
+        return json.dumps(example_data, indent=2)
+
+    example_button.click(
+        load_example_data,
+        outputs=systems_input
+    )
+
+    clear_button.click(
+        lambda: "",
+        outputs=systems_input
+    )
+
+    test_button.click(
+        run_hypothesis_testing,
+        inputs=[systems_input, test_metrics_selection, n_samples_input, significance_level_input],
+        outputs=[test_results]
+    )
+
+    with gr.Accordion("💡 Hypothesis Testing Information", open=False):
+        gr.Markdown(
+            """
+            ### 🔬 How it Works:
+
+            This tool performs **randomization-based significance testing** to compare multiple systems:
+
+            1. **Null Hypothesis**: No difference between systems
+            2. **Randomization**: Randomly permute system outputs multiple times
+            3. **P-value Calculation**: Proportion of permutations where random difference ≥ observed difference
+            4. **Significance**: If p-value < α, reject null hypothesis (systems are significantly different)
+
+            ### 📊 Input Format:
+            - **References**: Ground truth reports
+            - **Systems**: Multiple systems to compare (each with same number of outputs as references)
+            - **Metrics**: Evaluation metrics to use for comparison
+
+            ### 📈 Output:
+            - **Significance Matrix**: P-values for all pairwise system comparisons
+            - **Mean Scores**: Average performance of each system on each metric
+            - **Bold p-values**: Indicate statistically significant differences
+
+            ### ⚡ Performance:
+            - **Fast Metrics Only**: This tool only includes BLEU, ROUGE, BERTScore, and Word Count for optimal performance
+            - **Excluded Slow Metrics**: RadGraph F1, CheXbert F1 are excluded to ensure reasonable computation time
+            - More randomization samples = more accurate p-values but slower computation
+            - Recommended: 50-100 samples for quick testing, 1000+ for publication
+            """
+        )
+
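The randomization test summarized in the accordion above can be sketched in a few lines of plain Python. This is an illustration of the procedure, not RadEval's internal implementation; `metric` stands for any callable mapping (hyps, refs) to a float:

    import random

    def randomization_pvalue(metric, sys_a, sys_b, refs, n_samples=1000, seed=0):
        """Paired randomization test for the score difference between two systems."""
        rng = random.Random(seed)
        observed = abs(metric(sys_a, refs) - metric(sys_b, refs))
        exceed = 0
        for _ in range(n_samples):
            # Randomly swap the two systems' outputs per report, keeping reference alignment.
            a, b = [], []
            for out_a, out_b in zip(sys_a, sys_b):
                swap = rng.random() < 0.5
                a.append(out_b if swap else out_a)
                b.append(out_a if swap else out_b)
            if abs(metric(a, refs) - metric(b, refs)) >= observed:
                exceed += 1
        return exceed / n_samples

With α = 0.05, a p-value below 0.05 rejects the null hypothesis of no difference between the two systems.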
+# Combine both demos using gr.Blocks to add a header
+with gr.Blocks(
+    title="RadEval: A framework for radiology text evaluation",
+    theme=gr.themes.Soft(),
+    css="""
+    .tab-nav button {
+        font-weight: bold !important;
+        border: 2px solid #e0e7ff !important;
+        border-radius: 10px !important;
+        margin: 0 5px !important;
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+        color: white !important;
+        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2) !important;
+        transition: all 0.3s ease !important;
+    }
+    .tab-nav button:hover {
+        transform: translateY(-2px) !important;
+        box-shadow: 0 6px 20px rgba(0, 0, 0, 0.3) !important;
+        background: linear-gradient(135deg, #764ba2 0%, #667eea 100%) !important;
+    }
+    .tab-nav button.selected {
+        background: linear-gradient(135deg, #ff6b6b 0%, #ee5a24 100%) !important;
+        border-color: #ff6b6b !important;
+        transform: translateY(-1px) !important;
+        box-shadow: 0 8px 25px rgba(255, 107, 107, 0.4) !important;
+    }
+    """
+) as combined_demo:
+    gr.Markdown(
+        """
+        # 🩺 RadEval: A framework for radiology text evaluation
+        ### [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEval_ModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()
+
+        """
+    )
+
+    tabs = gr.TabbedInterface(
+        [demo, hypothesis_demo],
+        ["🏎️ RadEval Evaluation", "🖥️ Null Hypothesis Testing"]
+    )
+
 if __name__ == "__main__":
-    demo.launch()
+    combined_demo.launch()