# src/plotting.py
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
import json
from collections import defaultdict
from typing import Dict, List, Optional, Union

from config import (
    LANGUAGE_NAMES,
    ALL_UG40_LANGUAGES,
    GOOGLE_SUPPORTED_LANGUAGES,
    METRICS_CONFIG,
    EVALUATION_TRACKS,
    MODEL_CATEGORIES,
    CHART_CONFIG,
)

def create_leaderboard_plot(
    df: pd.DataFrame, track: str, metric: str = "quality", top_n: int = 15
) -> go.Figure:
    """Create leaderboard plot with confidence intervals."""
    if df.empty:
        fig = go.Figure()
        fig.add_annotation(
            text="No models available for this track",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=16),
        )
        fig.update_layout(
            title=f"No Data Available - {track.title()} Track",
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(0,0,0,0)",
        )
        return fig

    try:
        metric_col = f"{track}_{metric}"
        ci_lower_col = f"{track}_ci_lower"
        ci_upper_col = f"{track}_ci_upper"

        if metric_col not in df.columns:
            fig = go.Figure()
            fig.add_annotation(
                text=f"Metric {metric} not available for {track} track",
                xref="paper", yref="paper",
                x=0.5, y=0.5, showarrow=False,
            )
            return fig

        # Work on a copy so we don't mutate the caller's frame
        df = df.copy()

        # Ensure numeric columns are properly typed
        for col in (metric_col, ci_lower_col, ci_upper_col):
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)

        # Filter to scored models, then take the top N by this metric
        valid_models = (
            df[df[metric_col] > 0]
            .sort_values(metric_col, ascending=False)
            .head(top_n)
            .copy()
        )
        if valid_models.empty:
            fig = go.Figure()
            fig.add_annotation(text="No valid models found", x=0.5, y=0.5, showarrow=False)
            return fig

        # Color bars by model category
        colors = [
            MODEL_CATEGORIES.get(cat, {}).get("color", "#808080")
            for cat in valid_models["model_category"]
        ]

        fig = go.Figure()

        # Add error bars if confidence intervals are available
        error_x = None
        if ci_lower_col in valid_models.columns and ci_upper_col in valid_models.columns:
            try:
                error_x = dict(
                    type="data",
                    array=valid_models[ci_upper_col] - valid_models[metric_col],
                    arrayminus=valid_models[metric_col] - valid_models[ci_lower_col],
                    visible=True,
                    thickness=2,
                    width=4,
                )
            except Exception as e:
                print(f"Error creating error bars: {e}")
                error_x = None

        # Safely format text values
        try:
            text_values = [f"{float(score):.3f}" for score in valid_models[metric_col]]
        except (TypeError, ValueError):
            text_values = ["0.000"] * len(valid_models)

        # Safely prepare custom data for the hover template
        try:
            samples_col = f"{track}_samples"
            samples_data = valid_models.get(samples_col, [0] * len(valid_models))
            customdata = list(zip(
                valid_models["model_category"].fillna("unknown"),
                valid_models["author"].fillna("Anonymous"),
                [int(float(x)) if pd.notnull(x) else 0 for x in samples_data],
            ))
        except Exception as e:
            print(f"Error preparing custom data: {e}")
            customdata = [("unknown", "Anonymous", 0)] * len(valid_models)

        fig.add_trace(go.Bar(
            y=valid_models["model_name"],
            x=valid_models[metric_col],
            orientation="h",
            marker=dict(color=colors, line=dict(color="black", width=0.5)),
            error_x=error_x,
            text=text_values,
            textposition="auto",
            hovertemplate=(
                "<b>%{y}</b><br>"
                + f"{metric.title()}: %{{x:.4f}}<br>"
                + "Category: %{customdata[0]}<br>"
                + "Author: %{customdata[1]}<br>"
                + "Samples: %{customdata[2]}<br>"
                + "<extra></extra>"
            ),
            customdata=customdata,
        ))

        # Customize layout
        track_info = EVALUATION_TRACKS[track]
        fig.update_layout(
            title=f"🏆 {track_info['name']} - {metric.title()} Score",
            xaxis_title=f"{metric.title()} Score (with 95% CI)",
            yaxis_title="Models",
            height=max(400, len(valid_models) * 35 + 100),
            margin=dict(l=20, r=20, t=60, b=20),
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(0,0,0,0)",
            font=dict(size=12),
        )

        # Reverse y-axis so the best model is at the top
        fig.update_yaxes(autorange="reversed")
        return fig

    except Exception as e:
        print(f"Error creating leaderboard plot: {e}")
        fig = go.Figure()
        fig.add_annotation(
            text=f"Error creating plot: {str(e)}",
            x=0.5, y=0.5, showarrow=False,
        )
        return fig
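
# Minimal usage sketch with hypothetical data (not part of the app): the
# column layout below is inferred from the lookups in create_leaderboard_plot
# ("{track}_{metric}", "{track}_ci_lower", "{track}_ci_upper",
# "{track}_samples"); real track names come from config.EVALUATION_TRACKS.
def _demo_leaderboard_plot() -> go.Figure:
    """Smoke-test sketch for create_leaderboard_plot using synthetic scores."""
    track = next(iter(EVALUATION_TRACKS))  # any configured track
    demo_df = pd.DataFrame({
        "model_name": ["model-a", "model-b"],
        "model_category": ["community", "community"],
        "author": ["alice", "bob"],
        f"{track}_quality": [0.72, 0.65],
        f"{track}_ci_lower": [0.70, 0.62],
        f"{track}_ci_upper": [0.74, 0.68],
        f"{track}_samples": [500, 500],
    })
    return create_leaderboard_plot(demo_df, track, metric="quality")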

def create_language_pair_heatmap(
    model_results: Dict, track: str, metric: str = "quality_score"
) -> go.Figure:
    """Create language pair heatmap for a model."""
    if not model_results or "tracks" not in model_results:
        fig = go.Figure()
        fig.add_annotation(text="No model results available", x=0.5, y=0.5, showarrow=False)
        return fig

    track_data = model_results["tracks"].get(track, {})
    if track_data.get("error") or "pair_metrics" not in track_data:
        fig = go.Figure()
        fig.add_annotation(text=f"No data available for {track} track", x=0.5, y=0.5, showarrow=False)
        return fig

    pair_metrics = track_data["pair_metrics"]
    track_languages = EVALUATION_TRACKS[track]["languages"]

    # Build the score matrix; unavailable pairs stay NaN (blank cells)
    n_langs = len(track_languages)
    matrix = np.full((n_langs, n_langs), np.nan)
    for i, src_lang in enumerate(track_languages):
        for j, tgt_lang in enumerate(track_languages):
            if src_lang != tgt_lang:
                pair_key = f"{src_lang}_to_{tgt_lang}"
                if pair_key in pair_metrics and metric in pair_metrics[pair_key]:
                    matrix[i, j] = pair_metrics[pair_key][metric]["mean"]

    # Human-readable language labels
    lang_labels = [LANGUAGE_NAMES.get(lang, lang.upper()) for lang in track_languages]

    fig = go.Figure(data=go.Heatmap(
        z=matrix,
        x=lang_labels,
        y=lang_labels,
        colorscale="Viridis",
        showscale=True,
        colorbar=dict(
            title=dict(text=metric.replace("_", " ").title(), side="right"),
            len=0.8,
        ),
        hovertemplate=(
            "Source: %{y}<br>"
            + "Target: %{x}<br>"
            + f"{metric.replace('_', ' ').title()}: %{{z:.3f}}<br>"
            + "<extra></extra>"
        ),
        zmin=0,
        zmax=1 if metric == "quality_score" else None,
    ))

    # Customize layout
    track_info = EVALUATION_TRACKS[track]
    fig.update_layout(
        title=f"🗺️ {track_info['name']} - {metric.replace('_', ' ').title()} by Language Pair",
        xaxis_title="Target Language",
        yaxis_title="Source Language",
        height=600,
        width=700,
        font=dict(size=12),
        xaxis=dict(side="bottom"),
        yaxis=dict(autorange="reversed"),
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
    )
    return fig
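
# Sketch of the nested results layout the heatmap reads, inferred from the
# key lookups above: results["tracks"][track]["pair_metrics"]
# ["{src}_to_{tgt}"][metric]["mean"]. The scores here are purely synthetic.
def _demo_pair_heatmap() -> go.Figure:
    """Smoke-test sketch for create_language_pair_heatmap."""
    track = next(iter(EVALUATION_TRACKS))
    langs = EVALUATION_TRACKS[track]["languages"]
    pair_metrics = {
        f"{src}_to_{tgt}": {"quality_score": {"mean": 0.5}}
        for src in langs
        for tgt in langs
        if src != tgt
    }
    model_results = {"tracks": {track: {"pair_metrics": pair_metrics}}}
    return create_language_pair_heatmap(model_results, track)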

def create_performance_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
    """Create performance comparison plot showing confidence intervals."""
    if df.empty:
        fig = go.Figure()
        fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
        return fig

    try:
        metric_col = f"{track}_quality"
        ci_lower_col = f"{track}_ci_lower"
        ci_upper_col = f"{track}_ci_upper"

        # Bail out early if required columns are missing
        required_cols = [metric_col, ci_lower_col, ci_upper_col]
        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            fig = go.Figure()
            fig.add_annotation(
                text=f"Missing columns for {track} track: {', '.join(missing)}",
                x=0.5, y=0.5, showarrow=False,
            )
            return fig

        # Work on a copy and ensure numeric columns are properly typed
        df = df.copy()
        for col in required_cols:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)

        # Filter to models with data for this track; take the top 10 by score
        valid_models = (
            df[(df[metric_col] > 0) & df[ci_lower_col].notna() & df[ci_upper_col].notna()]
            .sort_values(metric_col, ascending=False)
            .head(10)
            .copy()
        )
        if valid_models.empty:
            fig = go.Figure()
            fig.add_annotation(text="No models with confidence intervals", x=0.5, y=0.5, showarrow=False)
            return fig

        fig = go.Figure()

        # One marker plus CI line per model
        for i, (_, model) in enumerate(valid_models.iterrows()):
            try:
                category = str(model["model_category"])
                color = MODEL_CATEGORIES.get(category, {}).get("color", "#808080")
                model_name = str(model["model_name"])

                # Safely extract numeric values
                quality_val = float(model[metric_col])
                ci_lower_val = float(model[ci_lower_col])
                ci_upper_val = float(model[ci_upper_col])

                # Main point
                fig.add_trace(go.Scatter(
                    x=[quality_val],
                    y=[i],
                    mode="markers",
                    marker=dict(
                        size=12,
                        color=color,
                        line=dict(color="black", width=1),
                    ),
                    name=model_name,
                    showlegend=False,
                    hovertemplate=(
                        f"<b>{model_name}</b><br>"
                        + f"Quality: {quality_val:.4f}<br>"
                        + f"95% CI: [{ci_lower_val:.4f}, {ci_upper_val:.4f}]<br>"
                        + f"Category: {category}<br>"
                        + "<extra></extra>"
                    ),
                ))

                # Confidence interval line
                fig.add_trace(go.Scatter(
                    x=[ci_lower_val, ci_upper_val],
                    y=[i, i],
                    mode="lines",
                    line=dict(color=color, width=3),
                    showlegend=False,
                    hoverinfo="skip",
                ))
            except Exception as e:
                print(f"Error adding model {i} to comparison plot: {e}")
                continue

        # Safely prepare tick labels
        try:
            tick_labels = [str(name) for name in valid_models["model_name"]]
        except KeyError:
            tick_labels = [f"Model {i}" for i in range(len(valid_models))]

        # Customize layout
        track_info = EVALUATION_TRACKS[track]
        fig.update_layout(
            title=f"📊 {track_info['name']} - Performance Comparison",
            xaxis_title="Quality Score",
            yaxis_title="Models",
            height=max(400, len(valid_models) * 40 + 100),
            yaxis=dict(
                tickmode="array",
                tickvals=list(range(len(valid_models))),
                ticktext=tick_labels,
                autorange="reversed",
            ),
            showlegend=False,
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(0,0,0,0)",
        )
        return fig

    except Exception as e:
        print(f"Error creating performance comparison plot: {e}")
        fig = go.Figure()
        fig.add_annotation(
            text=f"Error creating plot: {str(e)}",
            x=0.5, y=0.5, showarrow=False,
        )
        return fig

def create_language_pair_comparison_plot(pairs_df: pd.DataFrame, track: str) -> go.Figure:
    """Create language pair comparison plot showing all models across all pairs."""
    if pairs_df.empty:
        fig = go.Figure()
        fig.add_annotation(
            text="No language pair data available",
            x=0.5, y=0.5, showarrow=False,
        )
        return fig

    # Get unique language pairs and models
    language_pairs = sorted(pairs_df["Language Pair"].unique())
    models = sorted(pairs_df["Model"].unique())

    if len(language_pairs) == 0 or len(models) == 0:
        fig = go.Figure()
        fig.add_annotation(
            text="Insufficient data for comparison",
            x=0.5, y=0.5, showarrow=False,
        )
        return fig

    # One subplot per metric, sharing the x-axis
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=("Quality Score by Language Pair", "BLEU Score by Language Pair"),
        vertical_spacing=0.1,
        shared_xaxes=True,
    )

    for model in models:
        model_data = pairs_df[pairs_df["Model"] == model]
        category = model_data["Category"].iloc[0] if not model_data.empty else "community"
        color = MODEL_CATEGORIES.get(category, {}).get("color", "#808080")

        # Quality Score comparison
        fig.add_trace(
            go.Bar(
                name=model,
                x=model_data["Language Pair"],
                y=model_data["Quality Score"],
                marker_color=color,
                opacity=0.8,
                legendgroup=model,
                showlegend=True,
                hovertemplate=(
                    f"<b>{model}</b><br>"
                    + "Language Pair: %{x}<br>"
                    + "Quality Score: %{y:.4f}<br>"
                    + f"Category: {category}<br>"
                    + "<extra></extra>"
                ),
            ),
            row=1, col=1,
        )

        # BLEU Score comparison (same legend group; duplicate legend entry hidden)
        fig.add_trace(
            go.Bar(
                name=model,
                x=model_data["Language Pair"],
                y=model_data["BLEU"],
                marker_color=color,
                opacity=0.8,
                legendgroup=model,
                showlegend=False,
                hovertemplate=(
                    f"<b>{model}</b><br>"
                    + "Language Pair: %{x}<br>"
                    + "BLEU: %{y:.2f}<br>"
                    + f"Category: {category}<br>"
                    + "<extra></extra>"
                ),
            ),
            row=2, col=1,
        )

    # Update layout
    track_info = EVALUATION_TRACKS[track]
    fig.update_layout(
        title=f"📈 {track_info['name']} - Language Pair Performance Comparison",
        height=800,
        barmode="group",
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
        ),
    )

    # Rotate x-axis labels for better readability
    fig.update_xaxes(tickangle=45, row=2, col=1)
    fig.update_yaxes(title_text="Quality Score", row=1, col=1)
    fig.update_yaxes(title_text="BLEU Score", row=2, col=1)

    return fig
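
# Sketch of the flat per-pair frame this function consumes; the column names
# ("Language Pair", "Model", "Category", "Quality Score", "BLEU") mirror the
# lookups above, and the pair labels and scores here are made up.
def _demo_pair_comparison() -> go.Figure:
    """Smoke-test sketch for create_language_pair_comparison_plot."""
    track = next(iter(EVALUATION_TRACKS))
    demo_pairs = pd.DataFrame({
        "Language Pair": ["xx → yy", "xx → yy", "yy → xx", "yy → xx"],
        "Model": ["model-a", "model-b", "model-a", "model-b"],
        "Category": ["community"] * 4,
        "Quality Score": [0.71, 0.64, 0.69, 0.61],
        "BLEU": [28.4, 24.1, 26.7, 22.9],
    })
    return create_language_pair_comparison_plot(demo_pairs, track)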

def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure:
    """Create category-wise comparison plot."""
    if df.empty:
        fig = go.Figure()
        fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
        return fig

    metric_col = f"{track}_quality"
    if metric_col not in df.columns:
        fig = go.Figure()
        fig.add_annotation(text=f"No quality data for {track} track", x=0.5, y=0.5, showarrow=False)
        return fig

    # Filter to models with data
    valid_models = df[df[metric_col] > 0]
    if valid_models.empty:
        fig = go.Figure()
        fig.add_annotation(text="No valid models found", x=0.5, y=0.5, showarrow=False)
        return fig

    fig = go.Figure()

    # One box per category, with every model shown as a point beside it
    for category, info in MODEL_CATEGORIES.items():
        category_models = valid_models[valid_models["model_category"] == category]
        if len(category_models) > 0:
            fig.add_trace(go.Box(
                y=category_models[metric_col],
                name=info["name"],
                marker_color=info["color"],
                boxpoints="all",  # Show all points
                jitter=0.3,
                pointpos=-1.8,
                hovertemplate=(
                    f"<b>{info['name']}</b><br>"
                    + "Quality: %{y:.4f}<br>"
                    + "Model: %{customdata}<br>"
                    + "<extra></extra>"
                ),
                customdata=category_models["model_name"],
            ))

    # Customize layout
    track_info = EVALUATION_TRACKS[track]
    fig.update_layout(
        title=f"📊 {track_info['name']} - Performance by Category",
        xaxis_title="Model Category",
        yaxis_title="Quality Score",
        height=500,
        showlegend=False,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
    )
    return fig
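
# Ad-hoc smoke test: render the demo figures above to standalone HTML files.
# A convenience sketch for local debugging only; not used by the app.
if __name__ == "__main__":
    _demo_leaderboard_plot().write_html("demo_leaderboard.html")
    _demo_pair_heatmap().write_html("demo_heatmap.html")
    _demo_pair_comparison().write_html("demo_pair_comparison.html")
    print("Wrote demo_leaderboard.html, demo_heatmap.html, demo_pair_comparison.html")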