# src/plotting.py
"""Plotly figure builders for the leaderboard, heatmap, and comparison views."""

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
import json
from collections import defaultdict
from typing import Dict, List, Optional, Union

from config import (
    LANGUAGE_NAMES,
    ALL_UG40_LANGUAGES,
    GOOGLE_SUPPORTED_LANGUAGES,
    METRICS_CONFIG,
    EVALUATION_TRACKS,
    MODEL_CATEGORIES,
    CHART_CONFIG,
)


def create_leaderboard_plot(
    df: pd.DataFrame, track: str, metric: str = "quality", top_n: int = 15
) -> go.Figure:
    """Create leaderboard plot with confidence intervals."""
    if df.empty:
        fig = go.Figure()
        fig.add_annotation(
            text="No models available for this track",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
            font=dict(size=16),
        )
        fig.update_layout(
            title=f"No Data Available - {track.title()} Track",
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(0,0,0,0)",
        )
        return fig

    try:
        # Get top N models for this track
        metric_col = f"{track}_{metric}"
        ci_lower_col = f"{track}_ci_lower"
        ci_upper_col = f"{track}_ci_upper"

        if metric_col not in df.columns:
            fig = go.Figure()
            fig.add_annotation(
                text=f"Metric {metric} not available for {track} track",
                xref="paper",
                yref="paper",
                x=0.5,
                y=0.5,
                showarrow=False,
            )
            return fig

        # Ensure numeric columns are properly typed (coerce on a copy so the
        # caller's DataFrame is not mutated)
        df = df.copy()
        numeric_cols = [metric_col, ci_lower_col, ci_upper_col]
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)

        # Filter and sort
        valid_models = df[df[metric_col] > 0].head(top_n).copy()

        if valid_models.empty:
            fig = go.Figure()
            fig.add_annotation(text="No valid models found", x=0.5, y=0.5, showarrow=False)
            return fig

        # Create color mapping by category
        colors = [
            MODEL_CATEGORIES.get(cat, {}).get("color", "#808080")
            for cat in valid_models["model_category"]
        ]

        # Main bar plot
        fig = go.Figure()

        # Add bars with error bars if confidence intervals available
        error_x = None
        if ci_lower_col in valid_models.columns and ci_upper_col in valid_models.columns:
            try:
                error_x = dict(
                    type="data",
                    array=valid_models[ci_upper_col] - valid_models[metric_col],
                    arrayminus=valid_models[metric_col] - valid_models[ci_lower_col],
                    visible=True,
                    thickness=2,
                    width=4,
                )
            except Exception as e:
                print(f"Error creating error bars: {e}")
                error_x = None

        # Safely format text values
        try:
            text_values = [f"{float(score):.3f}" for score in valid_models[metric_col]]
        except Exception:
            text_values = ["0.000"] * len(valid_models)

        # Safely prepare custom data
        try:
            samples_col = f"{track}_samples"
            samples_data = valid_models.get(samples_col, [0] * len(valid_models))
            customdata = list(zip(
                valid_models["model_category"].fillna("unknown"),
                valid_models["author"].fillna("Anonymous"),
                [int(float(x)) if pd.notnull(x) else 0 for x in samples_data],
            ))
        except Exception as e:
            print(f"Error preparing custom data: {e}")
            customdata = [("unknown", "Anonymous", 0)] * len(valid_models)

        fig.add_trace(go.Bar(
            y=valid_models["model_name"],
            x=valid_models[metric_col],
            orientation="h",
            marker=dict(color=colors, line=dict(color="black", width=0.5)),
            error_x=error_x,
            text=text_values,
            textposition="auto",
            hovertemplate=(
                "<b>%{y}</b><br>"
                + f"{metric.title()}: %{{x:.4f}}<br>"
                + "Category: %{customdata[0]}<br>"
                + "Author: %{customdata[1]}<br>"
                + "Samples: %{customdata[2]}<br>"
                + "<extra></extra>"
            ),
            customdata=customdata,
        ))

        # Customize layout
        track_info = EVALUATION_TRACKS[track]
        fig.update_layout(
            title=f"πŸ† {track_info['name']} - {metric.title()} Score",
            xaxis_title=f"{metric.title()} Score (with 95% CI)",
            yaxis_title="Models",
            height=max(400, len(valid_models) * 35 + 100),
            margin=dict(l=20, r=20, t=60, b=20),
            paper_bgcolor="rgba(0,0,0,0)",
            plot_bgcolor="rgba(0,0,0,0)",
            font=dict(size=12),
        )

        # Reverse y-axis to show best model at top
        fig.update_yaxes(autorange="reversed")

        return fig

    except Exception as e:
        print(f"Error creating leaderboard plot: {e}")
        fig = go.Figure()
        fig.add_annotation(
            text=f"Error creating plot: {str(e)}", x=0.5, y=0.5, showarrow=False
        )
        return fig
" + "Target: %{x}
" + f"{metric.replace('_', ' ').title()}: %{{z:.3f}}
" + "" ), zmin=0, zmax=1 if metric == "quality_score" else None, )) # Customize layout track_info = EVALUATION_TRACKS[track] fig.update_layout( title=f"πŸ—ΊοΈ {track_info['name']} - {metric.replace('_', ' ').title()} by Language Pair", xaxis_title="Target Language", yaxis_title="Source Language", height=600, width=700, font=dict(size=12), xaxis=dict(side="bottom"), yaxis=dict(autorange="reversed"), paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)", ) return fig def create_performance_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure: """Create performance comparison plot showing confidence intervals.""" if df.empty: fig = go.Figure() fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False) return fig try: metric_col = f"{track}_quality" ci_lower_col = f"{track}_ci_lower" ci_upper_col = f"{track}_ci_upper" # Ensure numeric columns are properly typed numeric_cols = [metric_col, ci_lower_col, ci_upper_col] for col in numeric_cols: if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Filter to models with data for this track valid_models = df[ (df[metric_col] > 0) & (df[ci_lower_col].notna()) & (df[ci_upper_col].notna()) ].head(10).copy() if valid_models.empty: fig = go.Figure() fig.add_annotation(text="No models with confidence intervals", x=0.5, y=0.5, showarrow=False) return fig fig = go.Figure() # Add confidence intervals as error bars for i, (_, model) in enumerate(valid_models.iterrows()): try: category = str(model["model_category"]) color = MODEL_CATEGORIES.get(category, {}).get("color", "#808080") model_name = str(model["model_name"]) # Safely extract numeric values quality_val = float(model[metric_col]) ci_lower_val = float(model[ci_lower_col]) ci_upper_val = float(model[ci_upper_col]) # Main point fig.add_trace(go.Scatter( x=[quality_val], y=[i], mode="markers", marker=dict( size=12, color=color, line=dict(color="black", width=1), ), name=model_name, showlegend=False, hovertemplate=( f"{model_name}
" + f"Quality: {quality_val:.4f}
" + f"95% CI: [{ci_lower_val:.4f}, {ci_upper_val:.4f}]
" + f"Category: {category}
" + "" ), )) # Confidence interval line fig.add_trace(go.Scatter( x=[ci_lower_val, ci_upper_val], y=[i, i], mode="lines", line=dict(color=color, width=3), showlegend=False, hoverinfo="skip", )) except Exception as e: print(f"Error adding model {i} to comparison plot: {e}") continue # Safely prepare tick labels try: tick_labels = [str(name) for name in valid_models["model_name"]] except: tick_labels = [f"Model {i}" for i in range(len(valid_models))] # Customize layout track_info = EVALUATION_TRACKS[track] fig.update_layout( title=f"πŸ“Š {track_info['name']} - Performance Comparison", xaxis_title="Quality Score", yaxis_title="Models", height=max(400, len(valid_models) * 40 + 100), yaxis=dict( tickmode="array", tickvals=list(range(len(valid_models))), ticktext=tick_labels, autorange="reversed", ), showlegend=False, paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)", ) return fig except Exception as e: print(f"Error creating performance comparison plot: {e}") fig = go.Figure() fig.add_annotation( text=f"Error creating plot: {str(e)}", x=0.5, y=0.5, showarrow=False ) return fig def create_language_pair_comparison_plot(pairs_df: pd.DataFrame, track: str) -> go.Figure: """Create language pair comparison plot showing all models across all pairs.""" if pairs_df.empty: fig = go.Figure() fig.add_annotation( text="No language pair data available", x=0.5, y=0.5, showarrow=False ) return fig # Get unique language pairs and models language_pairs = sorted(pairs_df['Language Pair'].unique()) models = sorted(pairs_df['Model'].unique()) if len(language_pairs) == 0 or len(models) == 0: fig = go.Figure() fig.add_annotation( text="Insufficient data for comparison", x=0.5, y=0.5, showarrow=False ) return fig # Create subplot for each metric fig = make_subplots( rows=2, cols=1, subplot_titles=('Quality Score by Language Pair', 'BLEU Score by Language Pair'), vertical_spacing=0.1, shared_xaxes=True ) # Quality Score comparison for model in models: model_data = pairs_df[pairs_df['Model'] == model] category = model_data['Category'].iloc[0] if not model_data.empty else 'community' color = MODEL_CATEGORIES.get(category, {}).get('color', '#808080') fig.add_trace( go.Bar( name=model, x=model_data['Language Pair'], y=model_data['Quality Score'], marker_color=color, opacity=0.8, legendgroup=model, showlegend=True, hovertemplate=( f"{model}
" + "Language Pair: %{x}
" + "Quality Score: %{y:.4f}
" + f"Category: {category}
" + "" ) ), row=1, col=1 ) # BLEU Score comparison fig.add_trace( go.Bar( name=model, x=model_data['Language Pair'], y=model_data['BLEU'], marker_color=color, opacity=0.8, legendgroup=model, showlegend=False, hovertemplate=( f"{model}
" + "Language Pair: %{x}
" + "BLEU: %{y:.2f}
" + f"Category: {category}
" + "" ) ), row=2, col=1 ) # Update layout track_info = EVALUATION_TRACKS[track] fig.update_layout( title=f"πŸ“Š {track_info['name']} - Language Pair Performance Comparison", height=800, barmode='group', paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)", legend=dict( orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1 ) ) # Rotate x-axis labels for better readability fig.update_xaxes(tickangle=45, row=2, col=1) fig.update_yaxes(title_text="Quality Score", row=1, col=1) fig.update_yaxes(title_text="BLEU Score", row=2, col=1) return fig def create_category_comparison_plot(df: pd.DataFrame, track: str) -> go.Figure: """Create category-wise comparison plot.""" if df.empty: fig = go.Figure() fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False) return fig metric_col = f"{track}_quality" # Filter to models with data valid_models = df[df[metric_col] > 0] if valid_models.empty: fig = go.Figure() fig.add_annotation(text="No valid models found", x=0.5, y=0.5, showarrow=False) return fig fig = go.Figure() # Create box plot for each category for category, info in MODEL_CATEGORIES.items(): category_models = valid_models[valid_models["model_category"] == category] if len(category_models) > 0: fig.add_trace(go.Box( y=category_models[metric_col], name=info["name"], marker_color=info["color"], boxpoints="all", # Show all points jitter=0.3, pointpos=-1.8, hovertemplate=( f"{info['name']}
" + "Quality: %{y:.4f}
" + "Model: %{customdata}
" + "" ), customdata=category_models["model_name"], )) # Customize layout track_info = EVALUATION_TRACKS[track] fig.update_layout( title=f"πŸ“ˆ {track_info['name']} - Performance by Category", xaxis_title="Model Category", yaxis_title="Quality Score", height=500, showlegend=False, paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)", ) return fig