# NOTE: page-header residue from the hosting site ("Spaces: Running"); not part of the program.
import sys | |
from pathlib import Path | |
sys.path.append(str(Path(__file__).parent.parent)) | |
import fev | |
import pandas as pd | |
import streamlit as st | |
from streamlit.elements.lib.column_types import ColumnConfig | |
from src.strings import ( | |
CHRONOS_BENCHMARK_BASIC_INFO, | |
CHRONOS_BENCHMARK_DETAILS, | |
CITATION_CHRONOS, | |
CITATION_FEV, | |
CITATION_HEADER, | |
PAIRWISE_BENCHMARK_DETAILS, | |
get_pivot_legend, | |
) | |
from src.utils import ( | |
construct_bar_chart, | |
construct_pairwise_chart, | |
construct_pivot_table, | |
format_leaderboard, | |
format_metric_name, | |
get_metric_description, | |
) | |
# Page-level settings (wide layout, browser-tab title and icon).
# Kept ahead of every other st.* call in this file.
st.set_page_config(
    layout="wide",
    page_title="FEV Benchmark Leaderboard",
    page_icon=":material/trophy:",
)
# Centered HTML heading rendered at the top of the main column.
TITLE = "<h1 style='text-align: center; font-size: 350%;'>Chronos Benchmark II</h1>"

# Model names forwarded to the fev analysis helpers as the baseline model and
# the leakage-imputation model, respectively.
BASELINE_MODEL = "seasonal_naive"
LEAKAGE_IMPUTATION_MODEL = "chronos_bolt_base"

# Leaderboard column used to order the models (sorted descending by the UI code).
SORT_COL = "win_rate"
# Value passed as `n_resamples` to the pairwise comparison.
N_RESAMPLES_FOR_CI = 1000
# Number of top-ranked models included in the pairwise charts.
TOP_K_MODELS_TO_PLOT = 15
# Metrics offered in the sidebar selector.
AVAILABLE_METRICS = ["WQL", "MASE"]

# Every per-model results file lives under the same directory; only the file
# stem differs, so the URLs are assembled from this shared prefix.
_RESULTS_BASE_URL = (
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main"
    "/benchmarks/chronos_zeroshot/results"
)
# Order matters: it determines the order in which the CSVs are concatenated.
_RESULT_FILE_STEMS = (
    "auto_arima",
    "auto_ets",
    "auto_theta",
    "chronos_base",
    "chronos_large",
    "chronos_mini",
    "chronos_small",
    "chronos_tiny",
    "chronos_bolt_base",
    "chronos_bolt_mini",
    "chronos_bolt_small",
    "chronos_bolt_tiny",
    "moirai_base",
    "moirai_large",
    "moirai_small",
    "seasonal_naive",
    "timesfm",
    "timesfm-2.0",
    "ttm-r2",
    "tirex",
)
SUMMARY_URLS = [f"{_RESULTS_BASE_URL}/{stem}.csv" for stem in _RESULT_FILE_STEMS]
# Cached because this function is called several times per script run and
# Streamlit re-executes the whole script on every widget interaction; without
# the cache each call re-downloads every CSV. The TTL (seconds) lets newly
# published results show up without restarting the app.
@st.cache_data(ttl=3600)
def load_summaries() -> pd.DataFrame:
    """Download every results CSV and return them stacked into one frame.

    Each URL in ``SUMMARY_URLS`` is read with ``pandas.read_csv`` and the
    resulting frames are concatenated in list order with a fresh integer
    index (``ignore_index=True``).

    Returns:
        A single DataFrame containing the rows of all downloaded CSVs.
    """
    return pd.concat(
        [pd.read_csv(url) for url in SUMMARY_URLS],
        ignore_index=True,
    )
def get_leaderboard(metric_name: str) -> pd.DataFrame:
    """Build the leaderboard table for one metric, with values as percentages.

    Args:
        metric_name: Name of the metric column handed to
            ``fev.analysis.leaderboard`` as ``metric_column``.

    Returns:
        The leaderboard cast to ``float64`` with its index reset into
        columns. ``skill_score`` and ``win_rate`` are multiplied by 100, and
        ``num_failures`` is divided by the number of distinct ``task_name``
        values in the summaries and multiplied by 100.
    """
    summaries = load_summaries()
    leaderboard = (
        fev.analysis.leaderboard(
            summaries=summaries,
            metric_column=metric_name,
            missing_strategy="impute",
            baseline_model=BASELINE_MODEL,
            leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
        )
        .astype("float64")
        .reset_index()
    )

    # Scale these two columns by 100 so they can be displayed as percentages.
    for fraction_col in ("skill_score", "win_rate"):
        leaderboard[fraction_col] = leaderboard[fraction_col] * 100

    # Express the failure count relative to the number of distinct tasks.
    num_tasks = summaries["task_name"].nunique()
    leaderboard["num_failures"] = leaderboard["num_failures"] / num_tasks * 100
    return leaderboard
def get_pairwise(metric_name: str, included_models: list[str]) -> pd.DataFrame:
    """Run the pairwise model comparison for one metric.

    Args:
        metric_name: Name of the metric column handed to
            ``fev.analysis.pairwise_comparison`` as ``metric_column``.
        included_models: Models to compare. ``BASELINE_MODEL`` is appended
            when absent; the caller's list is never modified.

    Returns:
        The comparison result rounded to 3 decimals, with its index reset
        into columns.
    """
    # Always compare against the baseline, building a new list rather than
    # mutating the argument.
    models = (
        included_models
        if BASELINE_MODEL in included_models
        else included_models + [BASELINE_MODEL]
    )
    summaries = load_summaries()
    comparison = fev.analysis.pairwise_comparison(
        summaries,
        included_models=models,
        metric_column=metric_name,
        baseline_model=BASELINE_MODEL,
        missing_strategy="impute",
        n_resamples=N_RESAMPLES_FOR_CI,
        leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
    )
    rounded = comparison.round(3)
    return rounded.reset_index()
# --- Sidebar: the selected metric drives every table and chart below. ---
with st.sidebar:
    selected_metric = st.selectbox(
        "Evaluation Metric",
        options=AVAILABLE_METRICS,
        format_func=format_metric_name,
    )
    st.caption(get_metric_description(selected_metric))

# Three columns; in the visible code only the wide middle one receives content.
cols = st.columns(spec=[0.025, 0.95, 0.025])

# NOTE(review): the original indentation was lost; everything below is assumed
# to live inside this container — confirm.
with cols[1] as main_container:
    st.markdown(TITLE, unsafe_allow_html=True)

    # Rank models for the selected metric and keep the best ones for the charts.
    metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
    top_k_models = metric_df.head(TOP_K_MODELS_TO_PLOT)["model_name"].tolist()
    pairwise_df = get_pairwise(selected_metric, included_models=top_k_models)

    # --- Leaderboard table ---
    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
    st.markdown(CHRONOS_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
    df_styled = format_leaderboard(metric_df)

    # Display label / number format for each leaderboard column.
    leaderboard_column_config = {
        "model_name": ColumnConfig(label="Model Name", alignment="left"),
        "win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
        "skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
        "median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
        "training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
        "num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
        "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
        "org": ColumnConfig(label="Organization", alignment="left"),
        "link": st.column_config.LinkColumn(label="Link", display_text=":material/open_in_new:"),
    }
    st.dataframe(
        df_styled,
        use_container_width=True,
        hide_index=True,
        column_config=leaderboard_column_config,
    )
    with st.expander("See details"):
        st.markdown(CHRONOS_BENCHMARK_DETAILS, unsafe_allow_html=True)

    # --- Pairwise charts: win rate on the left, skill score on the right ---
    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])
    for chart_col, plotted_col in ((chart_col_1, "win_rate"), (chart_col_2, "skill_score")):
        with chart_col:
            st.altair_chart(
                construct_pairwise_chart(pairwise_df, col=plotted_col, metric_name=selected_metric),
                use_container_width=True,
            )
    with st.expander("See details"):
        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)

    # --- Per-task pivot table, collapsed by default behind an expander ---
    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
    with st.expander("Show detailed results"):
        st.markdown(
            get_pivot_legend(BASELINE_MODEL, LEAKAGE_IMPUTATION_MODEL),
            unsafe_allow_html=True,
        )
        pivot_table = construct_pivot_table(
            summaries=load_summaries(),
            metric_name=selected_metric,
            baseline_model=BASELINE_MODEL,
            leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
        )
        st.dataframe(pivot_table)

    # --- Citation footer ---
    st.divider()
    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
    for citation_block in (CITATION_HEADER, CITATION_FEV, CITATION_CHRONOS):
        st.markdown(citation_block)