# Spaces: X-iZhang / RadEval (Running)
# File size: 29,200 Bytes

import gradio as gr
import sys
import os
import torch
sys.path.append(".")

def setup_cpu_environment():
    """Configure the process for CPU-only execution.

    Hides every CUDA device, caps torch at four threads, turns off tokenizer
    parallelism, and points the transformers cache at ``./cache``.
    """
    # The thread cap is independent of the environment variables below.
    torch.set_num_threads(4)

    os.environ.update(
        {
            'CUDA_VISIBLE_DEVICES': '',
            'TOKENIZERS_PARALLELISM': 'false',
            'TRANSFORMERS_CACHE': './cache',
        }
    )


# Applied at import time, ahead of the RadEval import that follows.
setup_cpu_environment()

from RadEval import RadEval, compare_systems

def _format_score(value):
    """Render a numeric score: floats with four decimals, anything else via ``str``."""
    return f"{value:.4f}" if isinstance(value, float) else str(value)


def _flatten_scores(results):
    """Yield ``(name, formatted_score)`` for every numeric entry in *results*.

    Nested dicts are flattened one level deep as ``<metric>_<sub_metric>``;
    non-numeric values are skipped.
    """
    for metric, score in results.items():
        if isinstance(score, (int, float)):
            yield metric, _format_score(score)
        elif isinstance(score, dict):
            for sub_metric, sub_score in score.items():
                if isinstance(sub_score, (int, float)):
                    yield f"{metric}_{sub_metric}", _format_score(sub_score)


def run_radeval_simple(ref_text, hyp_text, selected_metrics):
    """
    Run RadEval with selected metrics on a pair of reference and hypothesis texts.

    Args:
        ref_text: Reference (ground-truth) report text.
        hyp_text: Hypothesis (generated) report text.
        selected_metrics: Collection of metric display names chosen in the UI.

    Returns:
        A ``(markdown_text, table_rows)`` tuple. ``table_rows`` is a list of
        ``[metric, score]`` string pairs. Problems are reported through the
        same tuple shape (the message plus a one-row table) rather than by
        raising.
    """
    try:
        # Reject missing/blank texts up front so the user gets a clear message
        # instead of an AttributeError or a metric run on empty strings.
        if not (ref_text and ref_text.strip()) or not (hyp_text and hyp_text.strip()):
            error_msg = "Evaluation Error: Please provide both a reference and a hypothesis report."
            return error_msg, [["Error", error_msg]]

        selected = selected_metrics or []

        # Configure RadEval based on selected metrics
        config = {
            'do_radgraph': 'RadGraph F1' in selected,
            'do_bleu': 'BLEU' in selected,
            'do_rouge': 'ROUGE' in selected,
            'do_bertscore': 'BERTScore' in selected,
            'do_chexbert': 'CheXbert F1' in selected,
            'do_ratescore': 'RaTEScore' in selected,
            'do_radcliq': 'RadCliQ' in selected,
            'do_temporal': 'Temporal F1' in selected,
            'do_radeval_bertsore': 'RadEval BERTScore' in selected,
            'do_green': 'GREEN' in selected,
            'do_srr_bert': 'SRR-BERT' in selected
        }

        # Nothing enabled: skip building and running the evaluator entirely.
        if not any(config.values()):
            return "No metrics were computed. Please select at least one metric.", [["No results", ""]]

        evaluator = RadEval(**config)
        results = evaluator(refs=[ref_text.strip()], hyps=[hyp_text.strip()])

        table_data = [[name, value] for name, value in _flatten_scores(results)]
        if not table_data:
            return "No metrics were computed. Please select at least one metric.", [["No results", ""]]

        def _preview(text):
            # Show at most the first 100 characters of the raw input.
            return f"{text[:100]}{'...' if len(text) > 100 else ''}"

        parts = [
            "## 🚀 RadEval Results\n\n",
            f"**Reference:** {_preview(ref_text)}\n\n",
            f"**Hypothesis:** {_preview(hyp_text)}\n\n",
            "### Evaluation Scores:\n\n",
        ]
        parts.extend(f"- **{name}**: {value}\n" for name, value in table_data)
        return "".join(parts), table_data

    except ImportError as e:
        error_msg = f"Import Error: {str(e)}. Please ensure RadEval dependencies are installed."
        return error_msg, [["Error", error_msg]]
    except Exception as e:
        # UI boundary: surface any evaluator failure as text instead of crashing the app.
        error_msg = f"Evaluation Error: {str(e)}"
        return error_msg, [["Error", error_msg]]


# Built-in demo cases as (title, reference report, hypothesis report) triples.
_EXAMPLE_PAIRS = (
    (
        "Normal vs Normal",
        "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.",
        "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.",
    ),
    (
        "Pneumonia Case",
        "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.",
        "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.",
    ),
    (
        "Temporal Comparison",
        "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.",
        "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.",
    ),
    (
        "Discordant Reports",
        "No acute cardiopulmonary process. Normal heart size and lung fields.",
        "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.",
    ),
    (
        "Ambiguous Language",
        "There is a small left-sided pleural effusion with adjacent atelectasis.",
        "Possible small effusion on the left. Atelectasis cannot be excluded.",
    ),
    (
        "Surgical Follow-up",
        "Status post coronary artery bypass grafting. No evidence of acute complication.",
        "Post-operative changes from CABG are present. No signs of surgical complication.",
    ),
    (
        "False Positive",
        "No focal consolidation, pleural effusion, or pneumothorax identified.",
        "Right lower lobe consolidation concerning for pneumonia.",
    ),
    (
        "Textual Hallucination",
        "Heart and mediastinum are normal. Lungs are clear.",
        "Large left pleural effusion with mediastinal shift to the right.",
    ),
    (
        "Negation Challenge",
        "No evidence of pneumothorax or pleural effusion.",
        "Evidence of small pneumothorax on the right.",
    ),
    (
        "Fine-grained Difference",
        "Mild interstitial markings at the lung bases, likely chronic.",
        "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.",
    ),
)

# Title -> {"ref": ..., "hyp": ...}; insertion order is the display order.
examples = {
    title: {"ref": reference, "hyp": hypothesis}
    for title, reference, hypothesis in _EXAMPLE_PAIRS
}

def update_fields(choice):
    """Build Gradio updates for the reference and hypothesis textboxes.

    ``"Custom"`` clears both boxes and leaves them editable; any other choice
    loads the matching entry from ``examples`` and makes the boxes read-only.
    """
    is_custom = choice == "Custom"
    if is_custom:
        ref_value = hyp_value = ""
    else:
        pair = examples[choice]
        ref_value, hyp_value = pair["ref"], pair["hyp"]

    return (
        gr.update(value=ref_value, interactive=is_custom),
        gr.update(value=hyp_value, interactive=is_custom),
    )


# Fast metrics for default selection
default_metrics = ["BLEU", "ROUGE", "BERTScore"]

# Available metrics (ordered by computational complexity); the fast defaults
# come first, followed by the progressively heavier ones.
available_metrics = [
    *default_metrics,
    "Temporal F1",
    "RadEval BERTScore",
    "RaTEScore",
    "RadCliQ",
    "SRR-BERT",
    "CheXbert F1",
    "RadGraph F1",
    "GREEN",
]


# Single-pair evaluation UI: pick or type a reference/hypothesis pair, choose
# metrics, and display the scores returned by run_radeval_simple.
with gr.Blocks(title="RadEval Evaluation", theme=gr.themes.Soft()) as demo:
    # Intro banner, including the CPU performance warning.
    gr.Markdown(
        """
        # 🏎️ RadEval Evaluation

        **RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.

        **⚠️ Performance Warning ⚠️**

        The demo is currently running on **CPU**. When using some slower metrics (like RadGraph, CheXbert, GREEN), it may take a while to complete evaluation. Please be patient.
        """
    )

    # Example selector: "Custom" plus one entry per key of `examples`.
    with gr.Row():
        choice = gr.Radio(
            label="📋 Choose Example or Custom Input",
            choices=["Custom"] + list(examples.keys()),
            value="Custom",
            interactive=True
        )

    # Side-by-side text inputs for the reference and the hypothesis report.
    with gr.Row():
        with gr.Column(scale=1):
            ref_input = gr.Textbox(
                label="📄 Reference Report (Ground Truth)",
                lines=5,
                placeholder="Enter the reference radiology report here...",
                info="The ground truth or expert-written report"
            )
        with gr.Column(scale=1):
            hyp_input = gr.Textbox(
                label="🤖 Hypothesis Report (Generated)",
                lines=5,
                placeholder="Enter the generated/predicted radiology report here...",
                info="The AI-generated or system-produced report"
            )

    # Changing the example selection refills (or clears) both textboxes.
    choice.change(
        update_fields,
        inputs=choice,
        outputs=[ref_input, hyp_input],
    )

    # Metric picker; starts with `default_metrics` ticked.
    with gr.Row():
        metrics_selection = gr.CheckboxGroup(
            label="🎯 Select Evaluation Metrics",
            choices=available_metrics,
            value=default_metrics,
            interactive=True,
            info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)."
        )

    with gr.Row():
        run_button = gr.Button("🚀 Run RadEval", variant="primary", size="lg")
        
    # Output area: Markdown summary on the left, score table on the right.
    with gr.Row():
        with gr.Column(scale=2):
            analysis_output = gr.Markdown(
                value="📊 **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'."
            )
        with gr.Column(scale=1):
            table_output = gr.DataFrame(
                label="📈 Detailed Scores",
                headers=["Metric", "Score"],
                wrap=True
            )

    # Information section
    with gr.Accordion("💡 Metric Information", open=False):
        gr.Markdown(
            """
            ### 📊 Available Metrics:
            
            **Traditional NLG Metrics:**
            - **BLEU**: N-gram overlap between reference and hypothesis
            - **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L)
            - **BERTScore**: Semantic similarity using BERT embeddings
            
            **Radiology-Specific Metrics:**
            - **RadGraph F1**: Entity and relation extraction for radiology
            - **CheXbert F1**: Chest X-ray finding classification performance
            - **RaTEScore**: Radiology-aware text evaluation score
            - **RadCliQ**: Composite metric for radiology reports
            - **Temporal F1**: Temporal entity and relationship evaluation
            - **RadEval BERTScore**: Specialized BERT for radiology text
            - **GREEN**: Generative evaluation with natural language explanations
            - **SRR-BERT**: Structured radiology reasoning evaluation
            
            ### ⚡ Performance Notes:
            - **Fast**: BLEU, ROUGE, BERTScore, Temporal F1
            - **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT
            - **Slow**: CheXbert F1, RadGraph F1, GREEN (requires model downloads)
            """
        )

    # run_radeval_simple returns (markdown_text, table_rows), matching the two outputs.
    run_button.click(
        run_radeval_simple,
        inputs=[ref_input, hyp_input, metrics_selection],
        outputs=[analysis_output, table_output]
    )

# =============================================================================
# 🧪 Hypothesis Testing Section
# =============================================================================

# (UI label, RadEval constructor flags, result keys exposed as test metrics)
_RADEVAL_TEST_METRICS = (
    ('BLEU', {'do_bleu': True}, ('bleu',)),
    ('ROUGE', {'do_rouge': True}, ('rouge1', 'rouge2', 'rougeL')),
    ('BERTScore', {'do_bertscore': True}, ('bertscore',)),
)


def _validate_systems_payload(payload):
    """Return an error message explaining why *payload* is unusable, else ``None``.

    A valid payload is a dict with a non-empty ``references`` list and a
    non-empty ``systems`` dict whose values are lists of the same length as
    ``references``; every text must be a non-blank string.
    """
    if (
        not isinstance(payload, dict)
        or 'references' not in payload
        or 'systems' not in payload
    ):
        return "Error: Please provide both 'references' and 'systems' in the JSON data."

    references = payload['references']
    systems = payload['systems']

    if not references or not systems:
        return "Error: References and systems cannot be empty."

    if not isinstance(references, list) or not isinstance(systems, dict):
        return "Error: References must be a list and systems must be a dictionary."

    # Every system needs exactly one output per reference.
    ref_count = len(references)
    for system_name, system_outputs in systems.items():
        if not isinstance(system_outputs, list):
            return f"Error: System '{system_name}' outputs must be a list."
        if len(system_outputs) != ref_count:
            return f"Error: System '{system_name}' has {len(system_outputs)} outputs but {ref_count} references provided."

    for position, ref in enumerate(references, start=1):
        if not isinstance(ref, str) or not ref.strip():
            return f"Error: Reference {position} is empty or not a string."

    for system_name, system_outputs in systems.items():
        for position, output in enumerate(system_outputs, start=1):
            if not isinstance(output, str) or not output.strip():
                return f"Error: System '{system_name}' output {position} is empty or not a string."

    return None


def _word_count_metric(hyps, refs):
    """Custom metric: mean number of whitespace-separated words per hypothesis."""
    return sum(len(report.split()) for report in hyps) / len(hyps)


def _make_radeval_metric(evaluator, key):
    """Wrap *evaluator* as a ``metric(hyps, refs)`` callable returning ``result[key]``."""
    def metric(hyps, refs):
        # The evaluator takes references first; the metric signature is hyps first.
        return evaluator(refs, hyps)[key]
    return metric


def _format_significance_report(systems, references, metrics, scores, n_samples, significance_level):
    """Render the Markdown report for the scores returned by ``compare_systems``."""
    alpha = float(significance_level)
    baseline_name = next(iter(systems))  # Assume first one is the baseline

    out = [
        "## 🧪 Hypothesis Testing Results\n\n",
        "**Parameters:**\n",
        f"- Randomization samples: {n_samples}\n",
        f"- Significance level: {significance_level}\n",
        f"- Number of systems: {len(systems)}\n",
        f"- Number of references: {len(references)}\n\n",
        "### 📊 Significant Differences Summary\n\n",
        f"**Baseline system:** {baseline_name}\n\n",
    ]

    has_significant_differences = False
    for system_name in systems:
        if system_name == baseline_name:
            continue

        system_scores = scores[system_name]
        significant_metrics = [
            metric_name
            for metric_name in metrics
            if f"{metric_name}_pvalue" in system_scores
            and system_scores[f"{metric_name}_pvalue"] < alpha
        ]

        if significant_metrics:
            out.append(f"**{system_name} vs {baseline_name}:** {', '.join(significant_metrics)} (p < {significance_level})\n\n")
            has_significant_differences = True
        else:
            out.append(f"**{system_name} vs {baseline_name}:** No significant differences\n\n")

    if not has_significant_differences:
        out.append("*No statistically significant differences found between systems.*\n\n")

    out.append("### 📈 Mean Scores by System\n\n")
    try:
        for system_name in systems:
            out.append(f"**{system_name.upper()}:**\n\n")
            out.append("| Metric | Score | P-value |\n")
            out.append("|--------|-------|----------|\n")

            system_scores = scores.get(system_name, {})
            is_baseline = system_name == baseline_name

            for metric_name in metrics:
                if metric_name not in system_scores:
                    continue

                score = system_scores[metric_name]
                score_str = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)

                pvalue_key = f"{metric_name}_pvalue"
                if not is_baseline and pvalue_key in system_scores:
                    pvalue = system_scores[pvalue_key]
                    if isinstance(pvalue, (int, float)):
                        pvalue_str = f"{pvalue:.4f}"
                        if pvalue < alpha:
                            pvalue_str += " *"  # flag significant p-values
                    else:
                        pvalue_str = str(pvalue)
                else:
                    # The baseline is never compared against itself.
                    pvalue_str = "-" if is_baseline else "N/A"

                out.append(f"| {metric_name} | {score_str} | {pvalue_str} |\n")

            out.append("\n")

        out.append("*Note: Baseline system shows scores only. Other systems show scores and p-values comparing to baseline.*\n")
        out.append(f"*P-values marked with * are significant (p < {significance_level}).*\n\n")

    except Exception as score_error:
        # Keep whatever was rendered so far and append the formatting problem.
        out.append(f"Error formatting scores: {str(score_error)}\n\n")

    return "".join(out)


def run_hypothesis_testing(systems_data, selected_test_metrics, n_samples, significance_level):
    """
    Run statistical significance testing between multiple systems.

    Args:
        systems_data: JSON text with a ``references`` list and a ``systems``
            dict mapping system names to lists of outputs. The first system
            listed is treated as the baseline.
        selected_test_metrics: Metric labels chosen in the UI
            (``BLEU``, ``ROUGE``, ``BERTScore``, ``custom: Word Count``).
        n_samples: Number of randomization samples (converted with ``int``).
        significance_level: Alpha threshold (converted with ``float``).

    Returns:
        A single Markdown string in every case: the formatted report on
        success, or an error message. The click handler binds this function
        to exactly one output component, so it never returns a tuple.
    """
    import json

    try:
        # Parse systems data (expecting JSON format)
        payload = json.loads(systems_data)

        validation_error = _validate_systems_payload(payload)
        if validation_error is not None:
            return validation_error

        references = payload['references']
        systems = payload['systems']
        selected = selected_test_metrics or []

        # One reference plus the baseline's first output is enough to probe
        # each evaluator's result keys before the full (slow) comparison.
        baseline_name = next(iter(systems))
        probe_refs = references[:1]
        probe_hyps = [systems[baseline_name][0]]

        metrics = {}
        for label, radeval_flags, result_keys in _RADEVAL_TEST_METRICS:
            if label not in selected:
                continue
            evaluator = RadEval(**radeval_flags)
            try:
                probe = evaluator(probe_refs, probe_hyps)
                missing = next((key for key in result_keys if key not in probe), None)
                if missing is not None:
                    return (
                        f"Error: {label} evaluator doesn't return '{missing}' key. "
                        "Available keys: " + str(list(probe.keys()))
                    )
            except Exception as probe_error:
                return f"Error testing {label} evaluator: {str(probe_error)}"
            for key in result_keys:
                metrics[key] = _make_radeval_metric(evaluator, key)

        if 'custom: Word Count' in selected:
            metrics['word_count'] = _word_count_metric  # example of a simple custom-defined metric

        if not metrics:
            return "Error: Please select at least one metric for testing."

        # Run significance tests
        try:
            _signatures, scores = compare_systems(
                systems=systems,
                metrics=metrics,
                references=references,
                n_samples=int(n_samples),
                significance_level=float(significance_level),
                print_results=False  # We don't need print output for online demo
            )
        except Exception as compare_error:
            return f"Error during significance testing: {str(compare_error)}\n\nThis might be due to:\n1. Empty or invalid text content\n2. Incompatible metric configurations\n3. RadEval library issues"

        return _format_significance_report(
            systems, references, metrics, scores, n_samples, significance_level
        )

    except ImportError as e:
        return f"Import Error: {str(e)}. Please ensure RadEval with compare_systems is installed."
    except json.JSONDecodeError:
        return "Error: Invalid JSON format in systems data."
    except Exception as e:
        # UI boundary: report unexpected failures as text instead of crashing the app.
        return f"Testing Error: {str(e)}"

# Hypothesis-testing UI: paste systems/references as JSON, choose metrics and
# test parameters, and render the Markdown report from run_hypothesis_testing.
with gr.Blocks(title="Null Hypothesis Testing", theme=gr.themes.Soft()) as hypothesis_demo:
    gr.Markdown(
        """
        # 🖥️ Null Hypothesis Testing
        
        **Statistical significance testing** for comparing multiple radiology report generation systems.
        This tool uses **randomization-based significance testing** to determine if differences between systems are statistically meaningful.
        
        **⚠️ Performance Warning ⚠️**
        
        Hypothesis testing with multiple metrics may take some time, especially with larger sample sizes. Please be patient during computation.
        """
    )
    
    with gr.Row():
        # Gradio expects integer `scale` values; 3:2 keeps the intended
        # 1.5:1 width ratio between the JSON editor and the controls.
        with gr.Column(scale=3):
            systems_input = gr.Textbox(
                label="📊 Systems Data (JSON Format)",
                lines=18,
                placeholder="""Enter systems data in JSON format, e.g.:
{
  "references": [
    "No acute cardiopulmonary process.",
    "Mild cardiomegaly with clear lung fields."
  ],
  "systems": {
    "baseline": [
      "No acute findings.",
      "Mild cardiomegaly, clear lungs."
    ],
    "improved": [
      "No acute cardiopulmonary process.",
      "Mild cardiomegaly with clear lung fields bilaterally."
    ]
  }
}""",
                info="Provide reference reports and multiple systems to compare"
            )
        
        with gr.Column(scale=2):
            test_metrics_selection = gr.CheckboxGroup(
                label="🎯 Select Metrics for Testing",
                choices=["BLEU", "ROUGE", "BERTScore", "custom: Word Count"],
                value=["BLEU", "ROUGE", "BERTScore"],
                interactive=True,
                info="Only fast metrics are shown to ensure quick evaluation (slow ones are excluded)"
            )
            
            n_samples_input = gr.Number(
                label="🔄 Randomization Samples",
                value=50,
                minimum=10,
                maximum=1000,
                step=10,
                info="Number of randomisation samples (higher = more confidence, but slower)"
            )
            
            significance_level_input = gr.Number(
                label="📈 Significance Level (α)",
                value=0.05,
                minimum=0.01,
                maximum=0.10,
                step=0.01,
                info="Alpha level for significance testing"
            )

            example_button = gr.Button("📝 Load Example Data", variant="secondary")
            clear_button = gr.Button("🗑️ Clear Data", variant="secondary")
            
            
    with gr.Row():
        test_button = gr.Button("🧪 Run Hypothesis Testing", variant="primary", size="lg")
    
    with gr.Row():
        test_results = gr.Markdown(
            value="📊 **Test results will appear here...**\n\nClick 'Load Example Data' to see sample input, then click 'Run Hypothesis Testing' to see results."
        )
    
    # Example data button
    def load_example_data():
        """Return a sample payload (5 references, 3 systems) as pretty-printed JSON."""
        example_data = {
            "references": [
                "No acute cardiopulmonary process.",
                "No radiographic findings to suggest pneumonia.",
                "Mild cardiomegaly with clear lung fields.",
                "Small pleural effusion on the right side.",
                "Status post cardiac surgery with stable appearance."
            ],
            "systems": {
                "baseline": [
                    "No acute findings.",
                    "No pneumonia.",
                    "Mild cardiomegaly, clear lungs.",
                    "Small right pleural effusion.",
                    "Post-cardiac surgery, stable."
                ],
                "improved": [
                    "No acute cardiopulmonary process.",
                    "No radiographic findings suggesting pneumonia.",
                    "Mild cardiomegaly with clear lung fields bilaterally.",
                    "Small pleural effusion present on the right side.",
                    "Status post cardiac surgery with stable appearance."
                ],
                "poor": [
                    "Normal.",
                    "OK.",
                    "Heart big.",
                    "Some fluid.",
                    "Surgery done."
                ]
            }
        }
        import json
        return json.dumps(example_data, indent=2)
    
    # Fill the JSON editor with the sample payload.
    example_button.click(
        load_example_data,
        outputs=systems_input
    )
    
    # Empty the JSON editor.
    clear_button.click(
        lambda: "",
        outputs=systems_input
    )
    
    # Single output component: run_hypothesis_testing's result is rendered as Markdown.
    test_button.click(
        run_hypothesis_testing,
        inputs=[systems_input, test_metrics_selection, n_samples_input, significance_level_input],
        outputs=[test_results]
    )
    
    with gr.Accordion("💡 Hypothesis Testing Information", open=False):
        gr.Markdown(
            """
            ### 🔬 How it Works:
            
            This tool performs **randomization-based significance testing** to compare multiple systems:
            
            1. **Null Hypothesis**: No difference between systems
            2. **Randomization**: Randomly permute system outputs multiple times
            3. **P-value Calculation**: Proportion of permutations where random difference ≥ observed difference
            4. **Significance**: If p-value < α, reject null hypothesis (systems are significantly different)
            
            ### 📊 Input Format:
            - **References**: Ground truth reports
            - **Systems**: Multiple systems to compare (each with same number of outputs as references)
            - **Metrics**: Evaluation metrics to use for comparison
            
            ### 📈 Output:
            - **Significant Differences Summary**: Metrics on which each system differs significantly from the baseline (the first system listed)
            - **Mean Scores**: Average performance of each system on each metric
            - **Starred p-values**: P-values followed by `*` indicate statistically significant differences
            
            ### ⚡ Performance:
            - **Fast Metrics Only**: This tool only includes BLEU, ROUGE, BERTScore, and Word Count for optimal performance
            - **Excluded Slow Metrics**: RadGraph F1, CheXbert F1 are excluded to ensure reasonable computation time
            - More randomization samples = more accurate p-values but slower computation
            - Recommended: 50-100 samples for quick testing, 1000+ for publication
            """
        )

# Combine both demos using gr.Blocks to add a header
# Top-level app: a header with project links above a two-tab interface that
# hosts the `demo` and `hypothesis_demo` Blocks. The custom CSS styles the
# tab buttons (normal, hover, and selected states).
with gr.Blocks(
    title="RadEval: A framework for radiology text evaluation", 
    theme=gr.themes.Soft(),
    css="""
    .tab-nav button {
        font-weight: bold !important;
        border: 2px solid #e0e7ff !important;
        border-radius: 10px !important;
        margin: 0 5px !important;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
        color: white !important;
        box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2) !important;
        transition: all 0.3s ease !important;
    }
    .tab-nav button:hover {
        transform: translateY(-2px) !important;
        box-shadow: 0 6px 20px rgba(0, 0, 0, 0.3) !important;
        background: linear-gradient(135deg, #764ba2 0%, #667eea 100%) !important;
    }
    .tab-nav button.selected {
        background: linear-gradient(135deg, #ff6b6b 0%, #ee5a24 100%) !important;
        border-color: #ff6b6b !important;
        transform: translateY(-1px) !important;
        box-shadow: 0 8px 25px rgba(255, 107, 107, 0.4) !important;
    }
    """
) as combined_demo:
    # NOTE(review): the arXiv and Expert Dataset links below have empty targets.
    gr.Markdown(
        """
        # 🩺 RadEval: A framework for radiology text evaluation
        ### [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEval_ModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()

        """
    )
    
    # One tab per previously built Blocks app; titles are matched by position.
    tabs = gr.TabbedInterface(
        [demo, hypothesis_demo],
        ["🏎️ RadEval Evaluation", "🖥️ Null Hypothesis Testing"]
    )

# Start the combined app only when this file is executed as a script.
if __name__ == "__main__":
    combined_demo.launch()