|
"""Gradio demo for RadEval: per-report metric evaluation and
randomization-based significance testing for radiology report generation."""
import json
import os
import sys

import gradio as gr
import torch

sys.path.append(".")
|
|
|
def setup_cpu_environment():
    # Hide GPUs so every model runs on CPU.
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    # Keep CPU usage modest on shared hosts.
    torch.set_num_threads(4)
    # Silence the tokenizers fork/parallelism warning.
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    # Cache downloaded model weights locally.
    os.environ['TRANSFORMERS_CACHE'] = './cache'
|
|
|
setup_cpu_environment() |
|
|
|
# Import RadEval only after the environment is configured, so the cache and
# CPU settings above take effect before any models are loaded.
from RadEval import RadEval, compare_systems
|
|
|
def run_radeval_simple(ref_text, hyp_text, selected_metrics): |
|
""" |
|
Run RadEval with selected metrics on a pair of reference and hypothesis texts |
|
""" |
|
try: |
|
|
|
        if not ref_text.strip() or not hyp_text.strip():
            return "Please provide both a reference and a hypothesis report.", [["No input", ""]]

        refs = [ref_text.strip()]
        hyps = [hyp_text.strip()]
|
|
|
|
|
        # Map UI checkbox labels to RadEval constructor flags.
        config = {
            'do_radgraph': 'RadGraph F1' in selected_metrics,
            'do_bleu': 'BLEU' in selected_metrics,
            'do_rouge': 'ROUGE' in selected_metrics,
            'do_bertscore': 'BERTScore' in selected_metrics,
            'do_chexbert': 'CheXbert F1' in selected_metrics,
            'do_ratescore': 'RaTEScore' in selected_metrics,
            'do_radcliq': 'RadCliQ' in selected_metrics,
            'do_temporal': 'Temporal F1' in selected_metrics,
            # Spelling below follows the RadEval API's parameter name.
            'do_radeval_bertsore': 'RadEval BERTScore' in selected_metrics,
            'do_green': 'GREEN' in selected_metrics,
            'do_srr_bert': 'SRR-BERT' in selected_metrics,
        }
|
|
|
|
|
evaluator = RadEval(**config) |
|
|
|
|
|
results = evaluator(refs=refs, hyps=hyps) |
|
|
|
|
|
table_data = [] |
|
analysis_text = "## 🚀 RadEval Results\n\n" |
|
analysis_text += f"**Reference:** {ref_text[:100]}{'...' if len(ref_text) > 100 else ''}\n\n" |
|
analysis_text += f"**Hypothesis:** {hyp_text[:100]}{'...' if len(hyp_text) > 100 else ''}\n\n" |
|
analysis_text += "### Evaluation Scores:\n\n" |
|
|
|
for metric, score in results.items(): |
|
if isinstance(score, (int, float)): |
|
formatted_score = f"{score:.4f}" if isinstance(score, float) else str(score) |
|
table_data.append([metric, formatted_score]) |
|
analysis_text += f"- **{metric}**: {formatted_score}\n" |
|
elif isinstance(score, dict): |
|
|
|
for sub_metric, sub_score in score.items(): |
|
if isinstance(sub_score, (int, float)): |
|
formatted_score = f"{sub_score:.4f}" if isinstance(sub_score, float) else str(sub_score) |
|
metric_name = f"{metric}_{sub_metric}" |
|
table_data.append([metric_name, formatted_score]) |
|
analysis_text += f"- **{metric_name}**: {formatted_score}\n" |
|
|
|
if not table_data: |
|
return "No metrics were computed. Please select at least one metric.", [["No results", ""]] |
|
|
|
return analysis_text, table_data |
|
|
|
except ImportError as e: |
|
error_msg = f"Import Error: {str(e)}. Please ensure RadEval dependencies are installed." |
|
return error_msg, [["Error", error_msg]] |
|
except Exception as e: |
|
error_msg = f"Evaluation Error: {str(e)}" |
|
return error_msg, [["Error", error_msg]] |
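

# For reference, RadEval can also be called directly outside the UI.
# A minimal sketch (not exercised by the app; assumes the package's model
# weights can be loaded):
def _radeval_direct_usage_example():
    evaluator = RadEval(do_bleu=True, do_rouge=True)
    scores = evaluator(refs=["No acute findings."],
                       hyps=["No acute disease."])
    return scores  # e.g. {'bleu': ..., 'rouge1': ..., 'rouge2': ..., 'rougeL': ...}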
|
|
|
|
|
|
|
# Curated reference/hypothesis pairs covering common evaluation scenarios
# (paraphrase, hallucination, negation, temporal comparison, etc.).
examples = {
|
"Normal vs Normal": { |
|
"ref": "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.", |
|
"hyp": "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.", |
|
}, |
|
"Pneumonia Case": { |
|
"ref": "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.", |
|
"hyp": "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.", |
|
}, |
|
"Temporal Comparison": { |
|
"ref": "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.", |
|
"hyp": "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.", |
|
}, |
|
"Discordant Reports": { |
|
"ref": "No acute cardiopulmonary process. Normal heart size and lung fields.", |
|
"hyp": "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.", |
|
}, |
|
"Ambiguous Language": { |
|
"ref": "There is a small left-sided pleural effusion with adjacent atelectasis.", |
|
"hyp": "Possible small effusion on the left. Atelectasis cannot be excluded.", |
|
}, |
|
"Surgical Follow-up": { |
|
"ref": "Status post coronary artery bypass grafting. No evidence of acute complication.", |
|
"hyp": "Post-operative changes from CABG are present. No signs of surgical complication.", |
|
}, |
|
"False Positive": { |
|
"ref": "No focal consolidation, pleural effusion, or pneumothorax identified.", |
|
"hyp": "Right lower lobe consolidation concerning for pneumonia.", |
|
}, |
|
"Textual Hallucination": { |
|
"ref": "Heart and mediastinum are normal. Lungs are clear.", |
|
"hyp": "Large left pleural effusion with mediastinal shift to the right.", |
|
}, |
|
"Negation Challenge": { |
|
"ref": "No evidence of pneumothorax or pleural effusion.", |
|
"hyp": "Evidence of small pneumothorax on the right.", |
|
}, |
|
"Fine-grained Difference": { |
|
"ref": "Mild interstitial markings at the lung bases, likely chronic.", |
|
"hyp": "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.", |
|
} |
|
} |
|
|
|
def update_fields(choice): |
|
"""Update text fields based on example selection""" |
|
if choice == "Custom": |
|
return gr.update(value="", interactive=True), gr.update(value="", interactive=True) |
|
else: |
|
return ( |
|
gr.update(value=examples[choice]["ref"], interactive=False), |
|
gr.update(value=examples[choice]["hyp"], interactive=False) |
|
) |
|
|
|
|
|
|
|
available_metrics = [ |
|
"BLEU", |
|
"ROUGE", |
|
"BERTScore", |
|
"Temporal F1", |
|
"RadEval BERTScore", |
|
"RaTEScore", |
|
"RadCliQ", |
|
"SRR-BERT", |
|
"CheXbert F1", |
|
"RadGraph F1", |
|
"GREEN" |
|
] |
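# Note: these labels must match the strings checked when building the
# RadEval config in run_radeval_simple above.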
|
|
|
|
|
default_metrics = ["BLEU", "ROUGE", "BERTScore"] |
|
|
|
|
|
with gr.Blocks(title="RadEval Evaluation", theme=gr.themes.Soft()) as demo: |
|
gr.Markdown( |
|
""" |
|
# 🏎️ RadEval Evaluation |
|
|
|
**RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box. |
|
|
|
**⚠️ Performance Warning ⚠️** |
|
|
|
    The demo currently runs on **CPU**. Slower metrics (such as RadGraph, CheXbert, and GREEN) may take a while to complete. Please be patient.
|
""" |
|
) |
|
|
|
with gr.Row(): |
|
choice = gr.Radio( |
|
label="📋 Choose Example or Custom Input", |
|
choices=["Custom"] + list(examples.keys()), |
|
value="Custom", |
|
interactive=True |
|
) |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
ref_input = gr.Textbox( |
|
label="📄 Reference Report (Ground Truth)", |
|
lines=5, |
|
placeholder="Enter the reference radiology report here...", |
|
info="The ground truth or expert-written report" |
|
) |
|
with gr.Column(scale=1): |
|
hyp_input = gr.Textbox( |
|
label="🤖 Hypothesis Report (Generated)", |
|
lines=5, |
|
placeholder="Enter the generated/predicted radiology report here...", |
|
info="The AI-generated or system-produced report" |
|
) |
|
|
|
choice.change( |
|
update_fields, |
|
inputs=choice, |
|
outputs=[ref_input, hyp_input], |
|
) |
|
|
|
with gr.Row(): |
|
metrics_selection = gr.CheckboxGroup( |
|
label="🎯 Select Evaluation Metrics", |
|
choices=available_metrics, |
|
value=default_metrics, |
|
interactive=True, |
|
info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)." |
|
) |
|
|
|
with gr.Row(): |
|
run_button = gr.Button("🚀 Run RadEval", variant="primary", size="lg") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=2): |
|
analysis_output = gr.Markdown( |
|
value="📊 **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'." |
|
) |
|
with gr.Column(scale=1): |
|
table_output = gr.DataFrame( |
|
label="📈 Detailed Scores", |
|
headers=["Metric", "Score"], |
|
wrap=True |
|
) |
|
|
|
|
|
with gr.Accordion("💡 Metric Information", open=False): |
|
gr.Markdown( |
|
""" |
|
### 📊 Available Metrics: |
|
|
|
**Traditional NLG Metrics:** |
|
- **BLEU**: N-gram overlap between reference and hypothesis |
|
- **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L) |
|
- **BERTScore**: Semantic similarity using BERT embeddings |
|
|
|
**Radiology-Specific Metrics:** |
|
- **RadGraph F1**: Entity and relation extraction for radiology |
|
- **CheXbert F1**: Chest X-ray finding classification performance |
|
- **RaTEScore**: Radiology-aware text evaluation score |
|
- **RadCliQ**: Composite metric for radiology reports |
|
- **Temporal F1**: Temporal entity and relationship evaluation |
|
- **RadEval BERTScore**: Specialized BERT for radiology text |
|
- **GREEN**: Generative evaluation with natural language explanations |
|
- **SRR-BERT**: Structured radiology reasoning evaluation |
|
|
|
### ⚡ Performance Notes: |
|
- **Fast**: BLEU, ROUGE, BERTScore, Temporal F1 |
|
- **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT |
|
- **Slow**: CheXbert F1, RadGraph F1, GREEN (requires model downloads) |
|
""" |
|
) |
|
|
|
run_button.click( |
|
run_radeval_simple, |
|
inputs=[ref_input, hyp_input, metrics_selection], |
|
outputs=[analysis_output, table_output] |
|
) |
|
|
|
|
|
|
|
|
|
|
|
def run_hypothesis_testing(systems_data, selected_test_metrics, n_samples, significance_level): |
|
""" |
|
Run statistical significance testing between multiple systems |
|
""" |
|
try: |
|
|
|
        systems_dict = json.loads(systems_data)
|
|
|
|
|
        if 'references' not in systems_dict or 'systems' not in systems_dict:
            return "Error: Please provide both 'references' and 'systems' in the JSON data."

        references = systems_dict['references']
        systems = systems_dict['systems']

        if not references or not systems:
            return "Error: References and systems cannot be empty."

        if not isinstance(references, list) or not isinstance(systems, dict):
            return "Error: References must be a list and systems must be a dictionary."

        # Every system must provide exactly one output per reference.
        ref_count = len(references)
        for system_name, system_outputs in systems.items():
            if not isinstance(system_outputs, list):
                return f"Error: System '{system_name}' outputs must be a list."
            if len(system_outputs) != ref_count:
                return f"Error: System '{system_name}' has {len(system_outputs)} outputs but {ref_count} references provided."

        for i, ref in enumerate(references):
            if not isinstance(ref, str) or not ref.strip():
                return f"Error: Reference {i+1} is empty or not a string."

        for system_name, system_outputs in systems.items():
            for i, output in enumerate(system_outputs):
                if not isinstance(output, str) or not output.strip():
                    return f"Error: System '{system_name}' output {i+1} is empty or not a string."
|
|
|
|
|
evaluators = {} |
|
if 'BLEU' in selected_test_metrics: |
|
evaluators['bleu'] = RadEval(do_bleu=True) |
|
if 'ROUGE' in selected_test_metrics: |
|
evaluators['rouge'] = RadEval(do_rouge=True) |
|
if 'BERTScore' in selected_test_metrics: |
|
evaluators['bertscore'] = RadEval(do_bertscore=True) |
|
|
|
|
|
        # Example custom metric: compare_systems accepts any callable with the
        # signature (hyps, refs) -> float; this one returns the mean word count.
        def word_count_metric(hyps, refs):
            return sum(len(report.split()) for report in hyps) / len(hyps)
|
|
|
|
|
        # Build the metric callables expected by compare_systems, probing each
        # evaluator once on a single pair so failures surface with a clear message.
        metrics = {}
        first_system = list(systems.keys())[0]
        probe_refs = references[:1]
        probe_hyps = [systems[first_system][0]]

        if 'BLEU' in selected_test_metrics:
            try:
                test_result = evaluators['bleu'](probe_refs, probe_hyps)
                if 'bleu' not in test_result:
                    return "Error: BLEU evaluator doesn't return a 'bleu' key. Available keys: " + str(list(test_result.keys()))
                metrics['bleu'] = lambda hyps, refs: evaluators['bleu'](refs, hyps)['bleu']
            except Exception as bleu_error:
                return f"Error testing BLEU evaluator: {str(bleu_error)}"

        if 'ROUGE' in selected_test_metrics:
            try:
                test_result = evaluators['rouge'](probe_refs, probe_hyps)
                for rouge_key in ['rouge1', 'rouge2', 'rougeL']:
                    if rouge_key not in test_result:
                        return f"Error: ROUGE evaluator doesn't return a '{rouge_key}' key. Available keys: " + str(list(test_result.keys()))
                metrics['rouge1'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge1']
                metrics['rouge2'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge2']
                metrics['rougeL'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rougeL']
            except Exception as rouge_error:
                return f"Error testing ROUGE evaluator: {str(rouge_error)}"

        if 'BERTScore' in selected_test_metrics:
            try:
                test_result = evaluators['bertscore'](probe_refs, probe_hyps)
                if 'bertscore' not in test_result:
                    return "Error: BERTScore evaluator doesn't return a 'bertscore' key. Available keys: " + str(list(test_result.keys()))
                metrics['bertscore'] = lambda hyps, refs: evaluators['bertscore'](refs, hyps)['bertscore']
            except Exception as bert_error:
                return f"Error testing BERTScore evaluator: {str(bert_error)}"

        if 'custom: Word Count' in selected_test_metrics:
            metrics['word_count'] = word_count_metric

        if not metrics:
            return "Error: Please select at least one metric for testing."
|
|
|
|
|
try: |
|
signatures, scores = compare_systems( |
|
systems=systems, |
|
metrics=metrics, |
|
references=references, |
|
n_samples=int(n_samples), |
|
significance_level=float(significance_level), |
|
print_results=False |
|
) |
|
|
|
        except Exception as compare_error:
            return (
                f"Error during significance testing: {str(compare_error)}\n\n"
                "This might be due to:\n"
                "1. Empty or invalid text content\n"
                "2. Incompatible metric configurations\n"
                "3. RadEval library issues"
            )
|
|
|
|
|
results_text = "## 🧪 Hypothesis Testing Results\n\n" |
|
results_text += f"**Parameters:**\n" |
|
results_text += f"- Randomization samples: {n_samples}\n" |
|
results_text += f"- Significance level: {significance_level}\n" |
|
results_text += f"- Number of systems: {len(systems)}\n" |
|
results_text += f"- Number of references: {len(references)}\n\n" |
|
|
|
|
|
results_text += "### 📊 Significant Differences Summary\n\n" |
|
baseline_name = list(systems.keys())[0] |
|
results_text += f"**Baseline system:** {baseline_name}\n\n" |
|
|
|
has_significant_differences = False |
|
for system_name in systems.keys(): |
|
if system_name == baseline_name: |
|
continue |
|
|
|
significant_metrics = [] |
|
for metric_name in metrics.keys(): |
|
pvalue_key = f"{metric_name}_pvalue" |
|
if pvalue_key in scores[system_name]: |
|
p_val = scores[system_name][pvalue_key] |
|
if p_val < float(significance_level): |
|
significant_metrics.append(metric_name) |
|
|
|
if significant_metrics: |
|
results_text += f"**{system_name} vs {baseline_name}:** {', '.join(significant_metrics)} (p < {significance_level})\n\n" |
|
has_significant_differences = True |
|
else: |
|
results_text += f"**{system_name} vs {baseline_name}:** No significant differences\n\n" |
|
|
|
if not has_significant_differences: |
|
results_text += "*No statistically significant differences found between systems.*\n\n" |
|
|
|
|
|
results_text += "### 📈 Mean Scores by System\n\n" |
|
        try:
|
|
|
|
|
for system_name in systems.keys(): |
|
results_text += f"**{system_name.upper()}:**\n\n" |
|
|
|
|
|
results_text += "| Metric | Score | P-value |\n" |
|
results_text += "|--------|-------|----------|\n" |
|
|
|
|
|
system_scores = scores.get(system_name, {}) |
|
|
|
|
|
for metric_name in metrics.keys(): |
|
if metric_name in system_scores: |
|
score = system_scores[metric_name] |
|
pvalue_key = f"{metric_name}_pvalue" |
|
|
|
|
|
score_str = f"{score:.4f}" if isinstance(score, (int, float)) else str(score) |
|
|
|
|
|
if system_name != baseline_name and pvalue_key in system_scores: |
|
pvalue = system_scores[pvalue_key] |
|
pvalue_str = f"{pvalue:.4f}" if isinstance(pvalue, (int, float)) else str(pvalue) |
|
|
|
if isinstance(pvalue, (int, float)) and pvalue < float(significance_level): |
|
pvalue_str += " *" |
|
else: |
|
pvalue_str = "-" if system_name == baseline_name else "N/A" |
|
|
|
results_text += f"| {metric_name} | {score_str} | {pvalue_str} |\n" |
|
|
|
results_text += "\n" |
|
|
|
results_text += "*Note: Baseline system shows scores only. Other systems show scores and p-values comparing to baseline.*\n" |
|
results_text += f"*P-values marked with * are significant (p < {significance_level}).*\n\n" |
|
|
|
except Exception as score_error: |
|
results_text += f"Error formatting scores: {str(score_error)}\n\n" |
|
|
|
return results_text |
|
|
|
except ImportError as e: |
|
return f"Import Error: {str(e)}. Please ensure RadEval with compare_systems is installed." |
|
except json.JSONDecodeError: |
|
return "Error: Invalid JSON format in systems data." |
|
except Exception as e: |
|
return f"Testing Error: {str(e)}" |
|
|
|
|
|
with gr.Blocks(title="Null Hypothesis Testing", theme=gr.themes.Soft()) as hypothesis_demo: |
|
gr.Markdown( |
|
""" |
|
# 🖥️ Null Hypothesis Testing |
|
|
|
**Statistical significance testing** for comparing multiple radiology report generation systems. |
|
This tool uses **randomization-based significance testing** to determine if differences between systems are statistically meaningful. |
|
|
|
**⚠️ Performance Warning ⚠️** |
|
|
|
Hypothesis testing with multiple metrics may take some time, especially with larger sample sizes. Please be patient during computation. |
|
""" |
|
) |
|
|
|
with gr.Row(): |
|
        with gr.Column(scale=3):  # Gradio expects integer scales; 3:2 keeps the original 1.5:1 ratio
|
systems_input = gr.Textbox( |
|
label="📊 Systems Data (JSON Format)", |
|
lines=18, |
|
placeholder="""Enter systems data in JSON format, e.g.: |
|
{ |
|
"references": [ |
|
"No acute cardiopulmonary process.", |
|
"Mild cardiomegaly with clear lung fields." |
|
], |
|
"systems": { |
|
"baseline": [ |
|
"No acute findings.", |
|
"Mild cardiomegaly, clear lungs." |
|
], |
|
"improved": [ |
|
"No acute cardiopulmonary process.", |
|
"Mild cardiomegaly with clear lung fields bilaterally." |
|
] |
|
} |
|
}""", |
|
info="Provide reference reports and multiple systems to compare" |
|
) |
|
|
|
        with gr.Column(scale=2):
|
test_metrics_selection = gr.CheckboxGroup( |
|
label="🎯 Select Metrics for Testing", |
|
choices=["BLEU", "ROUGE", "BERTScore", "custom: Word Count"], |
|
value=["BLEU", "ROUGE", "BERTScore"], |
|
interactive=True, |
|
info="Only fast metrics are shown to ensure quick evaluation (slow ones are excluded)" |
|
) |
|
|
|
n_samples_input = gr.Number( |
|
label="🔄 Randomization Samples", |
|
value=50, |
|
minimum=10, |
|
maximum=1000, |
|
step=10, |
|
info="Number of randomisation samples (higher = more confidence, but slower)" |
|
) |
|
|
|
significance_level_input = gr.Number( |
|
label="📈 Significance Level (α)", |
|
value=0.05, |
|
minimum=0.01, |
|
maximum=0.10, |
|
step=0.01, |
|
info="Alpha level for significance testing" |
|
) |
|
|
|
example_button = gr.Button("📝 Load Example Data", variant="secondary") |
|
clear_button = gr.Button("🗑️ Clear Data", variant="secondary") |
|
|
|
|
|
with gr.Row(): |
|
test_button = gr.Button("🧪 Run Hypothesis Testing", variant="primary", size="lg") |
|
|
|
with gr.Row(): |
|
test_results = gr.Markdown( |
|
value="📊 **Test results will appear here...**\n\nClick 'Load Example Data' to see sample input, then click 'Run Hypothesis Testing' to see results." |
|
) |
|
|
|
|
|
def load_example_data(): |
|
example_data = { |
|
"references": [ |
|
"No acute cardiopulmonary process.", |
|
"No radiographic findings to suggest pneumonia.", |
|
"Mild cardiomegaly with clear lung fields.", |
|
"Small pleural effusion on the right side.", |
|
"Status post cardiac surgery with stable appearance." |
|
], |
|
"systems": { |
|
"baseline": [ |
|
"No acute findings.", |
|
"No pneumonia.", |
|
"Mild cardiomegaly, clear lungs.", |
|
"Small right pleural effusion.", |
|
"Post-cardiac surgery, stable." |
|
], |
|
"improved": [ |
|
"No acute cardiopulmonary process.", |
|
"No radiographic findings suggesting pneumonia.", |
|
"Mild cardiomegaly with clear lung fields bilaterally.", |
|
"Small pleural effusion present on the right side.", |
|
"Status post cardiac surgery with stable appearance." |
|
], |
|
"poor": [ |
|
"Normal.", |
|
"OK.", |
|
"Heart big.", |
|
"Some fluid.", |
|
"Surgery done." |
|
] |
|
} |
|
} |
|
        return json.dumps(example_data, indent=2)
|
|
|
example_button.click( |
|
load_example_data, |
|
outputs=systems_input |
|
) |
|
|
|
clear_button.click( |
|
lambda: "", |
|
outputs=systems_input |
|
) |
|
|
|
test_button.click( |
|
run_hypothesis_testing, |
|
inputs=[systems_input, test_metrics_selection, n_samples_input, significance_level_input], |
|
outputs=[test_results] |
|
) |
|
|
|
with gr.Accordion("💡 Hypothesis Testing Information", open=False): |
|
gr.Markdown( |
|
""" |
|
### 🔬 How it Works: |
|
|
|
This tool performs **randomization-based significance testing** to compare multiple systems: |
|
|
|
1. **Null Hypothesis**: No difference between systems |
|
2. **Randomization**: Randomly permute system outputs multiple times |
|
3. **P-value Calculation**: Proportion of permutations where random difference ≥ observed difference |
|
4. **Significance**: If p-value < α, reject null hypothesis (systems are significantly different) |
|
|
|
### 📊 Input Format: |
|
- **References**: Ground truth reports |
|
- **Systems**: Multiple systems to compare (each with same number of outputs as references) |
|
- **Metrics**: Evaluation metrics to use for comparison |
|
|
|
### 📈 Output: |
|
    - **Significance Summary**: P-values comparing each system against the baseline (first) system
    - **Mean Scores**: Average performance of each system on each metric
    - **P-values with an asterisk (*)**: Indicate statistically significant differences
|
|
|
### ⚡ Performance: |
|
- **Fast Metrics Only**: This tool only includes BLEU, ROUGE, BERTScore, and Word Count for optimal performance |
|
    - **Excluded Slow Metrics**: RadGraph F1 and CheXbert F1 are excluded to keep computation time reasonable
|
- More randomization samples = more accurate p-values but slower computation |
|
- Recommended: 50-100 samples for quick testing, 1000+ for publication |
|
""" |
|
) |
|
|
|
|
|
with gr.Blocks( |
|
title="RadEval: A framework for radiology text evaluation", |
|
theme=gr.themes.Soft(), |
|
css=""" |
|
.tab-nav button { |
|
font-weight: bold !important; |
|
border: 2px solid #e0e7ff !important; |
|
border-radius: 10px !important; |
|
margin: 0 5px !important; |
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; |
|
color: white !important; |
|
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2) !important; |
|
transition: all 0.3s ease !important; |
|
} |
|
.tab-nav button:hover { |
|
transform: translateY(-2px) !important; |
|
box-shadow: 0 6px 20px rgba(0, 0, 0, 0.3) !important; |
|
background: linear-gradient(135deg, #764ba2 0%, #667eea 100%) !important; |
|
} |
|
.tab-nav button.selected { |
|
background: linear-gradient(135deg, #ff6b6b 0%, #ee5a24 100%) !important; |
|
border-color: #ff6b6b !important; |
|
transform: translateY(-1px) !important; |
|
box-shadow: 0 8px 25px rgba(255, 107, 107, 0.4) !important; |
|
} |
|
""" |
|
) as combined_demo: |
|
gr.Markdown( |
|
""" |
|
# 🩺 RadEval: A framework for radiology text evaluation |
|
### [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEval_ModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]() |
|
|
|
""" |
|
) |
|
|
|
tabs = gr.TabbedInterface( |
|
[demo, hypothesis_demo], |
|
["🏎️ RadEval Evaluation", "🖥️ Null Hypothesis Testing"] |
|
) |
|
|
|
if __name__ == "__main__": |
|
combined_demo.launch() |
|
|