import gradio as gr
import sys
import os
import torch
sys.path.append(".")
def setup_cpu_environment():
    """Force CPU-only execution and keep the demo environment lightweight."""
    os.environ['CUDA_VISIBLE_DEVICES'] = ''         # hide any GPUs so everything runs on CPU
    torch.set_num_threads(4)                        # cap the CPU threads used by PyTorch
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # silence the HF tokenizers fork warning
    os.environ['TRANSFORMERS_CACHE'] = './cache'    # cache model downloads locally
setup_cpu_environment()
from RadEval import RadEval, compare_systems
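# Minimal usage sketch (illustrative only; the exact score keys depend on which metrics are
# enabled and on the installed RadEval version):
#   evaluator = RadEval(do_bleu=True, do_rouge=True)
#   scores = evaluator(refs=["No acute findings."], hyps=["No acute disease."])
#   # -> e.g. {'bleu': ..., 'rouge1': ..., 'rouge2': ..., 'rougeL': ...}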
def run_radeval_simple(ref_text, hyp_text, selected_metrics):
"""
Run RadEval with selected metrics on a pair of reference and hypothesis texts
"""
try:
refs = [ref_text.strip()]
hyps = [hyp_text.strip()]
# Configure RadEval based on selected metrics
config = {
'do_radgraph': 'RadGraph F1' in selected_metrics,
'do_bleu': 'BLEU' in selected_metrics,
'do_rouge': 'ROUGE' in selected_metrics,
'do_bertscore': 'BERTScore' in selected_metrics,
'do_chexbert': 'CheXbert F1' in selected_metrics,
'do_ratescore': 'RaTEScore' in selected_metrics,
'do_radcliq': 'RadCliQ' in selected_metrics,
'do_temporal': 'Temporal F1' in selected_metrics,
'do_radeval_bertsore': 'RadEval BERTScore' in selected_metrics,
'do_green': 'GREEN' in selected_metrics,
'do_srr_bert': 'SRR-BERT' in selected_metrics
}
# Initialize RadEval with selected metrics
evaluator = RadEval(**config)
# Run evaluation
results = evaluator(refs=refs, hyps=hyps)
# Prepare results for display
table_data = []
analysis_text = "## 🚀 RadEval Results\n\n"
analysis_text += f"**Reference:** {ref_text[:100]}{'...' if len(ref_text) > 100 else ''}\n\n"
analysis_text += f"**Hypothesis:** {hyp_text[:100]}{'...' if len(hyp_text) > 100 else ''}\n\n"
analysis_text += "### Evaluation Scores:\n\n"
for metric, score in results.items():
if isinstance(score, (int, float)):
formatted_score = f"{score:.4f}" if isinstance(score, float) else str(score)
table_data.append([metric, formatted_score])
analysis_text += f"- **{metric}**: {formatted_score}\n"
elif isinstance(score, dict):
# Handle nested metrics
for sub_metric, sub_score in score.items():
if isinstance(sub_score, (int, float)):
formatted_score = f"{sub_score:.4f}" if isinstance(sub_score, float) else str(sub_score)
metric_name = f"{metric}_{sub_metric}"
table_data.append([metric_name, formatted_score])
analysis_text += f"- **{metric_name}**: {formatted_score}\n"
if not table_data:
return "No metrics were computed. Please select at least one metric.", [["No results", ""]]
return analysis_text, table_data
except ImportError as e:
error_msg = f"Import Error: {str(e)}. Please ensure RadEval dependencies are installed."
return error_msg, [["Error", error_msg]]
except Exception as e:
error_msg = f"Evaluation Error: {str(e)}"
return error_msg, [["Error", error_msg]]
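# run_radeval_simple returns a pair (markdown_summary, table_rows); the first element feeds the
# gr.Markdown analysis panel and the second (a list of [metric_name, formatted_score] rows)
# feeds the gr.DataFrame score table wired up below.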
# Example pairs for radiology reports
examples = {
"Normal vs Normal": {
"ref": "Heart size is normal. Lungs are clear. No pleural effusion or pneumothorax.",
"hyp": "Cardiac silhouette is within normal limits. Lungs are clear bilaterally. No effusion or pneumothorax identified.",
},
"Pneumonia Case": {
"ref": "Moderate cardiomegaly. Bilateral lower lobe consolidations consistent with pneumonia.",
"hyp": "Enlarged heart. Worsening bilateral infiltrates in the lower lobes suggestive of pneumonia.",
},
"Temporal Comparison": {
"ref": "Compared to prior study, the pleural effusion has increased in size. New bilateral infiltrates are present.",
"hyp": "The pleural effusion is larger than on the previous examination. There are new bilateral pulmonary infiltrates.",
},
"Discordant Reports": {
"ref": "No acute cardiopulmonary process. Normal heart size and lung fields.",
"hyp": "Mild cardiomegaly with bilateral lower lobe atelectasis. Small pleural effusion on the right.",
},
"Ambiguous Language": {
"ref": "There is a small left-sided pleural effusion with adjacent atelectasis.",
"hyp": "Possible small effusion on the left. Atelectasis cannot be excluded.",
},
"Surgical Follow-up": {
"ref": "Status post coronary artery bypass grafting. No evidence of acute complication.",
"hyp": "Post-operative changes from CABG are present. No signs of surgical complication.",
},
"False Positive": {
"ref": "No focal consolidation, pleural effusion, or pneumothorax identified.",
"hyp": "Right lower lobe consolidation concerning for pneumonia.",
},
"Textual Hallucination": {
"ref": "Heart and mediastinum are normal. Lungs are clear.",
"hyp": "Large left pleural effusion with mediastinal shift to the right.",
},
"Negation Challenge": {
"ref": "No evidence of pneumothorax or pleural effusion.",
"hyp": "Evidence of small pneumothorax on the right.",
},
"Fine-grained Difference": {
"ref": "Mild interstitial markings at the lung bases, likely chronic.",
"hyp": "Subtle increased interstitial opacities at both lung bases, likely chronic in nature.",
}
}
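# Each example pair targets a different evaluation challenge (paraphrase, temporal change,
# negation, hallucination, fine-grained wording) so the selected metrics can be contrasted.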
def update_fields(choice):
"""Update text fields based on example selection"""
if choice == "Custom":
return gr.update(value="", interactive=True), gr.update(value="", interactive=True)
else:
return (
gr.update(value=examples[choice]["ref"], interactive=False),
gr.update(value=examples[choice]["hyp"], interactive=False)
)
# Available metrics (ordered by computational complexity)
available_metrics = [
"BLEU",
"ROUGE",
"BERTScore",
"Temporal F1",
"RadEval BERTScore",
"RaTEScore",
"RadCliQ",
"SRR-BERT",
"CheXbert F1",
"RadGraph F1",
"GREEN"
]
# Fast metrics for default selection
default_metrics = ["BLEU", "ROUGE", "BERTScore"]
with gr.Blocks(title="RadEval Evaluation", theme=gr.themes.Soft()) as demo:
gr.Markdown(
"""
# 🏎️ RadEval Evaluation
**RadEval** is a lightweight, extensible framework for **evaluating radiology reports** using both standard NLP metrics (e.g. BLEU, ROUGE, BERTScore) and **radiology-specific measures** (e.g. RadGraph, CheXbert, GREEN). Whether you're benchmarking generation systems or validating clinical correctness, RadEval offers **comprehensive and interpretable** metrics out of the box.
**⚠️ Performance Warning ⚠️**
The demo is currently running on **CPU**. If you select slower metrics (such as RadGraph, CheXbert, or GREEN), evaluation may take a while to complete. Please be patient.
"""
)
with gr.Row():
choice = gr.Radio(
label="📋 Choose Example or Custom Input",
choices=["Custom"] + list(examples.keys()),
value="Custom",
interactive=True
)
with gr.Row():
with gr.Column(scale=1):
ref_input = gr.Textbox(
label="📄 Reference Report (Ground Truth)",
lines=5,
placeholder="Enter the reference radiology report here...",
info="The ground truth or expert-written report"
)
with gr.Column(scale=1):
hyp_input = gr.Textbox(
label="🤖 Hypothesis Report (Generated)",
lines=5,
placeholder="Enter the generated/predicted radiology report here...",
info="The AI-generated or system-produced report"
)
choice.change(
update_fields,
inputs=choice,
outputs=[ref_input, hyp_input],
)
with gr.Row():
metrics_selection = gr.CheckboxGroup(
label="🎯 Select Evaluation Metrics",
choices=available_metrics,
value=default_metrics,
interactive=True,
info="Select metrics to compute. Some metrics may take longer (RadGraph, CheXbert, GREEN)."
)
with gr.Row():
run_button = gr.Button("🚀 Run RadEval", variant="primary", size="lg")
with gr.Row():
with gr.Column(scale=2):
analysis_output = gr.Markdown(
value="📊 **Results will appear here after evaluation...**\n\nSelect your texts and metrics, then click 'Run RadEval'."
)
with gr.Column(scale=1):
table_output = gr.DataFrame(
label="📈 Detailed Scores",
headers=["Metric", "Score"],
wrap=True
)
# Information section
with gr.Accordion("💡 Metric Information", open=False):
gr.Markdown(
"""
### 📊 Available Metrics:
**Traditional NLG Metrics:**
- **BLEU**: N-gram overlap between reference and hypothesis
- **ROUGE**: Recall-oriented overlap (ROUGE-1, ROUGE-2, ROUGE-L)
- **BERTScore**: Semantic similarity using BERT embeddings
**Radiology-Specific Metrics:**
- **RadGraph F1**: Entity and relation extraction for radiology
- **CheXbert F1**: Chest X-ray finding classification performance
- **RaTEScore**: Radiology-aware text evaluation score
- **RadCliQ**: Composite metric for radiology reports
- **Temporal F1**: Temporal entity and relationship evaluation
- **RadEval BERTScore**: Specialized BERT for radiology text
- **GREEN**: Generative evaluation with natural language explanations
- **SRR-BERT**: Structured radiology reasoning evaluation
### ⚡ Performance Notes:
- **Fast**: BLEU, ROUGE, BERTScore, Temporal F1
- **Medium**: RadEval BERTScore, RaTEScore, RadCliQ, SRR-BERT
- **Slow**: CheXbert F1, RadGraph F1, GREEN (requires model downloads)
"""
)
run_button.click(
run_radeval_simple,
inputs=[ref_input, hyp_input, metrics_selection],
outputs=[analysis_output, table_output]
)
# =============================================================================
# 🧪 Hypothesis Testing Section
# =============================================================================
def run_hypothesis_testing(systems_data, selected_test_metrics, n_samples, significance_level):
"""
Run statistical significance testing between multiple systems
"""
try:
# Parse systems data (expecting JSON format)
import json
systems_dict = json.loads(systems_data)
# Extract references and systems
if 'references' not in systems_dict or 'systems' not in systems_dict:
return "Error: Please provide both 'references' and 'systems' in the JSON data.", ""
references = systems_dict['references']
systems = systems_dict['systems']
# Validate data integrity
if not references or not systems:
return "Error: References and systems cannot be empty.", ""
if not isinstance(references, list) or not isinstance(systems, dict):
return "Error: References must be a list and systems must be a dictionary.", ""
# Check that all systems have the same number of outputs as references
ref_count = len(references)
for system_name, system_outputs in systems.items():
if not isinstance(system_outputs, list):
return f"Error: System '{system_name}' outputs must be a list.", ""
if len(system_outputs) != ref_count:
return f"Error: System '{system_name}' has {len(system_outputs)} outputs but {ref_count} references provided.", ""
# Validate that all texts are non-empty strings
for i, ref in enumerate(references):
if not isinstance(ref, str) or not ref.strip():
return f"Error: Reference {i+1} is empty or not a string.", ""
for system_name, system_outputs in systems.items():
for i, output in enumerate(system_outputs):
if not isinstance(output, str) or not output.strip():
return f"Error: System '{system_name}' output {i+1} is empty or not a string.", ""
# Initialize evaluators based on selected metrics (fast metrics only)
evaluators = {}
if 'BLEU' in selected_test_metrics:
evaluators['bleu'] = RadEval(do_bleu=True)
if 'ROUGE' in selected_test_metrics:
evaluators['rouge'] = RadEval(do_rouge=True)
if 'BERTScore' in selected_test_metrics:
evaluators['bertscore'] = RadEval(do_bertscore=True)
# Custom metric: average word count
def word_count_metric(hyps, refs):
return sum(len(report.split()) for report in hyps) / len(hyps)
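# Each entry in the `metrics` dict passed to compare_systems is a callable with signature
# (hyps, refs) -> float (see the lambdas below and word_count_metric above). A hypothetical
# extra custom metric could look like:
#   def avg_word_length(hyps, refs):
#       words = [w for report in hyps for w in report.split()]
#       return sum(len(w) for w in words) / max(len(words), 1)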
# Build metrics dictionary (following the example structure)
metrics = {}
if 'BLEU' in selected_test_metrics:
# Test the evaluator first
try:
test_result = evaluators['bleu'](references[:1], [systems[list(systems.keys())[0]][0]])
if 'bleu' not in test_result:
return "Error: BLEU evaluator doesn't return 'bleu' key. Available keys: " + str(list(test_result.keys())), ""
metrics['bleu'] = lambda hyps, refs: evaluators['bleu'](refs, hyps)['bleu']
except Exception as bleu_error:
return f"Error testing BLEU evaluator: {str(bleu_error)}", ""
if 'ROUGE' in selected_test_metrics:
try:
test_result = evaluators['rouge'](references[:1], [systems[list(systems.keys())[0]][0]])
for rouge_key in ['rouge1', 'rouge2', 'rougeL']:
if rouge_key not in test_result:
return f"Error: ROUGE evaluator doesn't return '{rouge_key}' key. Available keys: " + str(list(test_result.keys())), ""
metrics['rouge1'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge1']
metrics['rouge2'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rouge2']
metrics['rougeL'] = lambda hyps, refs: evaluators['rouge'](refs, hyps)['rougeL']
except Exception as rouge_error:
return f"Error testing ROUGE evaluator: {str(rouge_error)}", ""
if 'BERTScore' in selected_test_metrics:
try:
test_result = evaluators['bertscore'](references[:1], [systems[list(systems.keys())[0]][0]])
if 'bertscore' not in test_result:
return "Error: BERTScore evaluator doesn't return 'bertscore' key. Available keys: " + str(list(test_result.keys())), ""
metrics['bertscore'] = lambda hyps, refs: evaluators['bertscore'](refs, hyps)['bertscore']
except Exception as bert_error:
return f"Error testing BERTScore evaluator: {str(bert_error)}", ""
if 'custom: Word Count' in selected_test_metrics:
metrics['word_count'] = word_count_metric # ← example of a simple custom-defined metric
if not metrics:
return "Error: Please select at least one metric for testing.", ""
# Run significance tests
try:
signatures, scores = compare_systems(
systems=systems,
metrics=metrics,
references=references,
n_samples=int(n_samples),
significance_level=float(significance_level),
print_results=False # We don't need print output for online demo
)
except Exception as compare_error:
return f"Error during significance testing: {str(compare_error)}\n\nThis might be due to:\n1. Empty or invalid text content\n2. Incompatible metric configurations\n3. RadEval library issues", str(compare_error)
# Format results
results_text = "## 🧪 Hypothesis Testing Results\n\n"
results_text += f"**Parameters:**\n"
results_text += f"- Randomization samples: {n_samples}\n"
results_text += f"- Significance level: {significance_level}\n"
results_text += f"- Number of systems: {len(systems)}\n"
results_text += f"- Number of references: {len(references)}\n\n"
# Significant differences summary
results_text += "### 📊 Significant Differences Summary\n\n"
baseline_name = list(systems.keys())[0] # Assume first one is the baseline
results_text += f"**Baseline system:** {baseline_name}\n\n"
has_significant_differences = False
for system_name in systems.keys():
if system_name == baseline_name:
continue
significant_metrics = []
for metric_name in metrics.keys():
pvalue_key = f"{metric_name}_pvalue"
if pvalue_key in scores[system_name]:
p_val = scores[system_name][pvalue_key]
if p_val < float(significance_level):
significant_metrics.append(metric_name)
if significant_metrics:
results_text += f"**{system_name} vs {baseline_name}:** {', '.join(significant_metrics)} (p < {significance_level})\n\n"
has_significant_differences = True
else:
results_text += f"**{system_name} vs {baseline_name}:** No significant differences\n\n"
if not has_significant_differences:
results_text += "*No statistically significant differences found between systems.*\n\n"
# Add mean scores in table format
results_text += "### 📈 Mean Scores by System\n\n"
try:
baseline_name = list(systems.keys())[0]
# Display each system's results in a clean format
for system_name in systems.keys():
results_text += f"**{system_name.upper()}:**\n\n"
# Create table header
results_text += "| Metric | Score | P-value |\n"
results_text += "|--------|-------|----------|\n"
# Get system data from scores
system_scores = scores.get(system_name, {})
# Add rows for each metric
for metric_name in metrics.keys():
if metric_name in system_scores:
score = system_scores[metric_name]
pvalue_key = f"{metric_name}_pvalue"
# Format score
score_str = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)
# Format p-value (only for non-baseline systems)
if system_name != baseline_name and pvalue_key in system_scores:
pvalue = system_scores[pvalue_key]
pvalue_str = f"{pvalue:.4f}" if isinstance(pvalue, (int, float)) else str(pvalue)
# Mark significant p-values
if isinstance(pvalue, (int, float)) and pvalue < float(significance_level):
pvalue_str += " *"
else:
pvalue_str = "-" if system_name == baseline_name else "N/A"
results_text += f"| {metric_name} | {score_str} | {pvalue_str} |\n"
results_text += "\n"
results_text += "*Note: Baseline system shows scores only. Other systems show scores and p-values comparing to baseline.*\n"
results_text += f"*P-values marked with * are significant (p < {significance_level}).*\n\n"
except Exception as score_error:
results_text += f"Error formatting scores: {str(score_error)}\n\n"
return results_text
except ImportError as e:
return f"Import Error: {str(e)}. Please ensure RadEval with compare_systems is installed."
except json.JSONDecodeError:
return "Error: Invalid JSON format in systems data."
except Exception as e:
return f"Testing Error: {str(e)}"
# Create Hypothesis Testing UI
with gr.Blocks(title="Null Hypothesis Testing", theme=gr.themes.Soft()) as hypothesis_demo:
gr.Markdown(
"""
# 🖥️ Null Hypothesis Testing
**Statistical significance testing** for comparing multiple radiology report generation systems.
This tool uses **randomization-based significance testing** to determine if differences between systems are statistically meaningful.
**⚠️ Performance Warning ⚠️**
Hypothesis testing with multiple metrics may take some time, especially with larger sample sizes. Please be patient during computation.
"""
)
with gr.Row():
with gr.Column(scale=1.5):
systems_input = gr.Textbox(
label="📊 Systems Data (JSON Format)",
lines=18,
placeholder="""Enter systems data in JSON format, e.g.:
{
"references": [
"No acute cardiopulmonary process.",
"Mild cardiomegaly with clear lung fields."
],
"systems": {
"baseline": [
"No acute findings.",
"Mild cardiomegaly, clear lungs."
],
"improved": [
"No acute cardiopulmonary process.",
"Mild cardiomegaly with clear lung fields bilaterally."
]
}
}""",
info="Provide reference reports and multiple systems to compare"
)
with gr.Column(scale=1):
test_metrics_selection = gr.CheckboxGroup(
label="🎯 Select Metrics for Testing",
choices=["BLEU", "ROUGE", "BERTScore", "custom: Word Count"],
value=["BLEU", "ROUGE", "BERTScore"],
interactive=True,
info="Only fast metrics are shown to ensure quick evaluation (slow ones are excluded)"
)
n_samples_input = gr.Number(
label="🔄 Randomization Samples",
value=50,
minimum=10,
maximum=1000,
step=10,
info="Number of randomisation samples (higher = more confidence, but slower)"
)
significance_level_input = gr.Number(
label="📈 Significance Level (α)",
value=0.05,
minimum=0.01,
maximum=0.10,
step=0.01,
info="Alpha level for significance testing"
)
example_button = gr.Button("📝 Load Example Data", variant="secondary")
clear_button = gr.Button("🗑️ Clear Data", variant="secondary")
with gr.Row():
test_button = gr.Button("🧪 Run Hypothesis Testing", variant="primary", size="lg")
with gr.Row():
test_results = gr.Markdown(
value="📊 **Test results will appear here...**\n\nClick 'Load Example Data' to see sample input, then click 'Run Hypothesis Testing' to see results."
)
# Example data button
def load_example_data():
example_data = {
"references": [
"No acute cardiopulmonary process.",
"No radiographic findings to suggest pneumonia.",
"Mild cardiomegaly with clear lung fields.",
"Small pleural effusion on the right side.",
"Status post cardiac surgery with stable appearance."
],
"systems": {
"baseline": [
"No acute findings.",
"No pneumonia.",
"Mild cardiomegaly, clear lungs.",
"Small right pleural effusion.",
"Post-cardiac surgery, stable."
],
"improved": [
"No acute cardiopulmonary process.",
"No radiographic findings suggesting pneumonia.",
"Mild cardiomegaly with clear lung fields bilaterally.",
"Small pleural effusion present on the right side.",
"Status post cardiac surgery with stable appearance."
],
"poor": [
"Normal.",
"OK.",
"Heart big.",
"Some fluid.",
"Surgery done."
]
}
}
import json
return json.dumps(example_data, indent=2)
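# The example data mirrors the schema validated in run_hypothesis_testing: a list of references
# plus a dict of systems with one output per reference; the first system ("baseline") is the
# one p-values are computed against.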
example_button.click(
load_example_data,
outputs=systems_input
)
clear_button.click(
lambda: "",
outputs=systems_input
)
test_button.click(
run_hypothesis_testing,
inputs=[systems_input, test_metrics_selection, n_samples_input, significance_level_input],
outputs=[test_results]
)
with gr.Accordion("💡 Hypothesis Testing Information", open=False):
gr.Markdown(
"""
### 🔬 How it Works:
This tool performs **randomization-based significance testing** to compare multiple systems:
1. **Null Hypothesis**: No difference between systems
2. **Randomization**: Randomly permute system outputs multiple times
3. **P-value Calculation**: Proportion of permutations where random difference ≥ observed difference
4. **Significance**: If p-value < α, reject null hypothesis (systems are significantly different)
### 📊 Input Format:
- **References**: Ground truth reports
- **Systems**: Multiple systems to compare (each with same number of outputs as references)
- **Metrics**: Evaluation metrics to use for comparison
### 📈 Output:
- **Significance Matrix**: P-values for all pairwise system comparisons
- **Mean Scores**: Average performance of each system on each metric
- **P-values marked with "*"**: Indicate statistically significant differences
### ⚡ Performance:
- **Fast Metrics Only**: This tool only includes BLEU, ROUGE, BERTScore, and Word Count for optimal performance
- **Excluded Slow Metrics**: RadGraph F1 and CheXbert F1 are excluded to keep computation time reasonable
- More randomization samples = more accurate p-values but slower computation
- Recommended: 50-100 samples for quick testing, 1000+ for publication
"""
)
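# Rough sketch of the paired randomization test described in the accordion above
# (simplified and illustrative only; compare_systems handles this internally and its exact
# procedure may differ):
#
#   import random
#
#   def randomization_pvalue(metric, refs, hyps_a, hyps_b, n_samples=1000):
#       observed = abs(metric(hyps_a, refs) - metric(hyps_b, refs))
#       hits = 0
#       for _ in range(n_samples):
#           # swap each paired output between the two systems with probability 0.5
#           pa, pb = zip(*[(a, b) if random.random() < 0.5 else (b, a)
#                          for a, b in zip(hyps_a, hyps_b)])
#           if abs(metric(list(pa), refs) - metric(list(pb), refs)) >= observed:
#               hits += 1
#       return hits / n_samples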
# Combine both demos using gr.Blocks to add a header
with gr.Blocks(
title="RadEval: A framework for radiology text evaluation",
theme=gr.themes.Soft(),
css="""
.tab-nav button {
font-weight: bold !important;
border: 2px solid #e0e7ff !important;
border-radius: 10px !important;
margin: 0 5px !important;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
color: white !important;
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2) !important;
transition: all 0.3s ease !important;
}
.tab-nav button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 6px 20px rgba(0, 0, 0, 0.3) !important;
background: linear-gradient(135deg, #764ba2 0%, #667eea 100%) !important;
}
.tab-nav button.selected {
background: linear-gradient(135deg, #ff6b6b 0%, #ee5a24 100%) !important;
border-color: #ff6b6b !important;
transform: translateY(-1px) !important;
box-shadow: 0 8px 25px rgba(255, 107, 107, 0.4) !important;
}
"""
) as combined_demo:
gr.Markdown(
"""
# 🩺 RadEval: A framework for radiology text evaluation
### [Github](https://github.com/jbdel/RadEval) | [PyPI](https://pypi.org/project/RadEval) | [Video](https://justin13601.github.io/files/radeval.mp4) | [arXiv]() | [RadEval_ModernBERT Model](https://huggingface.co/IAMJB/RadEvalModernBERT) | [Expert Dataset]()
"""
)
tabs = gr.TabbedInterface(
[demo, hypothesis_demo],
["🏎️ RadEval Evaluation", "🖥️ Null Hypothesis Testing"]
)
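# For local debugging you could also enable request queuing or a public share link, e.g.
# `combined_demo.queue().launch(share=True)` (illustrative); the Space deployment below
# simply calls launch().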
if __name__ == "__main__":
combined_demo.launch()