shchuro committed
Commit eb8267f · 1 Parent(s): ef56e52

Add new leaderboard

Dockerfile CHANGED
@@ -9,7 +9,7 @@ RUN apt-get update && apt-get install -y \
     && rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt ./
-COPY src/ ./src/
+COPY . .
 
 RUN pip3 install -r requirements.txt
 
@@ -17,4 +17,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ENTRYPOINT ["streamlit", "run", "fev-leaderboard-app.py", "--server.port=8501", "--server.address=0.0.0.0"]
fev-leaderboard-app.py ADDED
@@ -0,0 +1,10 @@
+import streamlit as st
+
+pages = [
+    st.Page("pages/fev_bench.py", title="fev-bench", icon=":material/trophy:"),
+    st.Page("pages/chronos_bench_ii.py", title="Chronos Benchmark II", icon=":material/trophy:"),
+    st.Page("pages/about.py", title="About", icon=":material/info:"),
+]
+
+page = st.navigation(pages)
+page.run()
pages/about.py ADDED
@@ -0,0 +1,18 @@
+import streamlit as st
+
+ABOUT_LEADERBOARD = """
+## About `fev`
+
+**fev** is a lightweight wrapper around the 🤗 [datasets](https://huggingface.co/docs/datasets/en/index) library designed to streamline
+time series forecasting model benchmarking.
+
+### 📚 Resources
+- **Documentation**: [Official docs](https://autogluon.github.io/fev/latest/)
+- **Source Code**: [GitHub repository](https://github.com/autogluon/fev)
+- **Issues & Questions**: [GitHub Issues](https://github.com/autogluon/fev/issues)
+
+### 🚀 Submit Your Model
+Ready to add your model to the leaderboard? Follow this [tutorial](https://autogluon.github.io/fev/latest/tutorials/04-models/) to evaluate your model with fev and contribute your results.
+"""
+st.set_page_config(layout="wide", page_title="About FEV", page_icon=":material/info:")
+st.markdown(ABOUT_LEADERBOARD)
pages/chronos_bench_ii.py ADDED
@@ -0,0 +1,179 @@
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).parent.parent))
+
+import fev
+import pandas as pd
+import streamlit as st
+from streamlit.elements.lib.column_types import ColumnConfig
+
+from src.strings import (
+    CHRONOS_BENCHMARK_BASIC_INFO,
+    CHRONOS_BENCHMARK_DETAILS,
+    CITATION_CHRONOS,
+    CITATION_FEV,
+    CITATION_HEADER,
+    PAIRWISE_BENCHMARK_DETAILS,
+    get_pivot_legend,
+)
+from src.utils import (
+    construct_bar_chart,
+    construct_pairwise_chart,
+    construct_pivot_table,
+    format_leaderboard,
+    format_metric_name,
+    get_metric_description,
+)
+
+st.set_page_config(layout="wide", page_title="FEV Benchmark Leaderboard", page_icon=":material/trophy:")
+
+TITLE = "<h1 style='text-align: center; font-size: 350%;'>Chronos Benchmark II</h1>"
+BASELINE_MODEL = "seasonal_naive"
+LEAKAGE_IMPUTATION_MODEL = "chronos_bolt_base"
+SORT_COL = "win_rate"
+N_RESAMPLES_FOR_CI = 1000
+TOP_K_MODELS_TO_PLOT = 15
+AVAILABLE_METRICS = ["WQL", "MASE"]
+SUMMARY_URLS = [
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_arima.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_ets.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_theta.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_base.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_large.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_mini.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_small.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_tiny.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_base.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_mini.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_small.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_tiny.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_base.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_large.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_small.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/seasonal_naive.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm-2.0.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/ttm-r2.csv",
+    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/tirex.csv",
+]
+
+
+@st.cache_data()
+def load_summaries():
+    summaries = []
+    for url in SUMMARY_URLS:
+        df = pd.read_csv(url)
+        summaries.append(df)
+    return pd.concat(summaries, ignore_index=True)
+
+
+@st.cache_data()
+def get_leaderboard(metric_name: str) -> pd.DataFrame:
+    summaries = load_summaries()
+    lb = fev.analysis.leaderboard(
+        summaries=summaries,
+        metric_column=metric_name,
+        missing_strategy="impute",
+        baseline_model=BASELINE_MODEL,
+        leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
+    )
+    lb = lb.astype("float64").reset_index()
+
+    lb["skill_score"] = lb["skill_score"] * 100
+    lb["win_rate"] = lb["win_rate"] * 100
+    lb["num_failures"] = lb["num_failures"] / summaries["task_name"].nunique() * 100
+    return lb
+
+
+@st.cache_data()
+def get_pairwise(metric_name: str, included_models: list[str]) -> pd.DataFrame:
+    if BASELINE_MODEL not in included_models:
+        included_models = included_models + [BASELINE_MODEL]
+    summaries = load_summaries()
+    return (
+        fev.analysis.pairwise_comparison(
+            summaries,
+            included_models=included_models,
+            metric_column=metric_name,
+            baseline_model=BASELINE_MODEL,
+            missing_strategy="impute",
+            n_resamples=N_RESAMPLES_FOR_CI,
+            leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
+        )
+        .round(3)
+        .reset_index()
+    )
+
+
+with st.sidebar:
+    selected_metric = st.selectbox("Evaluation Metric", options=AVAILABLE_METRICS, format_func=format_metric_name)
+    st.caption(get_metric_description(selected_metric))
+
+cols = st.columns(spec=[0.025, 0.95, 0.025])
+
+with cols[1] as main_container:
+    st.markdown(TITLE, unsafe_allow_html=True)
+
+    metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
+    top_k_models = metric_df.head(TOP_K_MODELS_TO_PLOT)["model_name"].tolist()
+    pairwise_df = get_pairwise(selected_metric, included_models=top_k_models)
+
+    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
+    st.markdown(CHRONOS_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
+    df_styled = format_leaderboard(metric_df)
+    st.dataframe(
+        df_styled,
+        use_container_width=True,
+        hide_index=True,
+        column_config={
+            "model_name": ColumnConfig(label="Model Name", alignment="left"),
+            "win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
+            "skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
+            "median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
+            "training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
+            "num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
+            "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
+            "org": ColumnConfig(label="Organization", alignment="left"),
+            "link": st.column_config.LinkColumn(label="Link", display_text=":material/open_in_new:"),
+        },
+    )
+
+    with st.expander("See details"):
+        st.markdown(CHRONOS_BENCHMARK_DETAILS, unsafe_allow_html=True)
+
+    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
+    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])
+
+    with chart_col_1:
+        st.altair_chart(
+            construct_pairwise_chart(pairwise_df, col="win_rate", metric_name=selected_metric),
+            use_container_width=True,
+        )
+
+    with chart_col_2:
+        st.altair_chart(
+            construct_pairwise_chart(pairwise_df, col="skill_score", metric_name=selected_metric),
+            use_container_width=True,
+        )
+
+    with st.expander("See details"):
+        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)
+
+    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
+    with st.expander("Show detailed results"):
+        st.markdown(get_pivot_legend(BASELINE_MODEL, LEAKAGE_IMPUTATION_MODEL), unsafe_allow_html=True)
+        st.dataframe(
+            construct_pivot_table(
+                summaries=load_summaries(),
+                metric_name=selected_metric,
+                baseline_model=BASELINE_MODEL,
+                leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
+            )
+        )
+
+    st.divider()
+    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
+    st.markdown(CITATION_HEADER)
+    st.markdown(CITATION_FEV)
+    st.markdown(CITATION_CHRONOS)
pages/fev_bench.py ADDED
@@ -0,0 +1,151 @@
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).parent.parent))
+
+import fev
+import pandas as pd
+import streamlit as st
+from streamlit.elements.lib.column_types import ColumnConfig
+
+from src.strings import (
+    CITATION_FEV,
+    CITATION_HEADER,
+    FEV_BENCHMARK_BASIC_INFO,
+    FEV_BENCHMARK_DETAILS,
+    PAIRWISE_BENCHMARK_DETAILS,
+    get_pivot_legend,
+)
+from src.utils import (
+    construct_pairwise_chart,
+    construct_pivot_table,
+    format_leaderboard,
+    format_metric_name,
+    get_metric_description,
+)
+
+st.set_page_config(layout="wide", page_title="fev leaderboard", page_icon=":material/trophy:")
+
+TITLE = "<h1 style='text-align: center; font-size: 350%;'>fev-bench</h1>"
+BASELINE_MODEL = "Seasonal Naive"
+LEAKAGE_IMPUTATION_MODEL = "Chronos-Bolt"
+SORT_COL = "win_rate"
+N_RESAMPLES_FOR_CI = 1000
+TOP_K_MODELS_TO_PLOT = 15
+AVAILABLE_METRICS = ["SQL", "MASE", "WQL", "WAPE"]
+
+
+@st.cache_data()
+def load_summaries():
+    summaries = pd.read_csv("summaries.csv")
+    return summaries
+
+
+@st.cache_data()
+def get_leaderboard(metric_name: str) -> pd.DataFrame:
+    summaries = load_summaries()
+    lb = fev.analysis.leaderboard(
+        summaries=summaries,
+        metric_column=metric_name,
+        missing_strategy="impute",
+        baseline_model=BASELINE_MODEL,
+        leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
+    )
+    lb = lb.astype("float64").reset_index()
+
+    lb["skill_score"] = lb["skill_score"] * 100
+    lb["win_rate"] = lb["win_rate"] * 100
+    lb["num_failures"] = lb["num_failures"] / summaries["task_name"].nunique() * 100
+    return lb
+
+
+@st.cache_data()
+def get_pairwise(metric_name: str, included_models: list[str]) -> pd.DataFrame:
+    if BASELINE_MODEL not in included_models:
+        included_models = included_models + [BASELINE_MODEL]
+    summaries = load_summaries()
+    return (
+        fev.analysis.pairwise_comparison(
+            summaries,
+            included_models=included_models,
+            metric_column=metric_name,
+            baseline_model=BASELINE_MODEL,
+            missing_strategy="impute",
+            n_resamples=N_RESAMPLES_FOR_CI,
+            leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
+        )
+        .round(3)
+        .reset_index()
+    )
+
+
+with st.sidebar:
+    selected_metric = st.selectbox("Evaluation Metric", options=AVAILABLE_METRICS, format_func=format_metric_name)
+    st.caption(get_metric_description(selected_metric))
+
+cols = st.columns(spec=[0.025, 0.95, 0.025])
+
+with cols[1] as main_container:
+    st.markdown(TITLE, unsafe_allow_html=True)
+
+    metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
+    top_k_models = metric_df.head(TOP_K_MODELS_TO_PLOT)["model_name"].tolist()
+    pairwise_df = get_pairwise(selected_metric, included_models=top_k_models)
+
+    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
+    st.markdown(FEV_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
+    df_styled = format_leaderboard(metric_df)
+    st.dataframe(
+        df_styled,
+        use_container_width=True,
+        hide_index=True,
+        column_config={
+            "model_name": ColumnConfig(label="Model Name", alignment="left"),
+            "win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
+            "skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
+            "median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
+            "training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
+            "num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
+            "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
+            "org": ColumnConfig(label="Organization", alignment="left"),
+            "link": st.column_config.LinkColumn(label="Link", display_text="🔗"),
+        },
+    )
+
+    with st.expander("See details"):
+        st.markdown(FEV_BENCHMARK_DETAILS, unsafe_allow_html=True)
+
+    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
+    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])
+
+    with chart_col_1:
+        st.altair_chart(
+            construct_pairwise_chart(pairwise_df, col="win_rate", metric_name=selected_metric),
+            use_container_width=True,
+        )
+
+    with chart_col_2:
+        st.altair_chart(
+            construct_pairwise_chart(pairwise_df, col="skill_score", metric_name=selected_metric),
+            use_container_width=True,
+        )
+
+    with st.expander("See details"):
+        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)
+
+    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
+    with st.expander("Show detailed results"):
+        st.markdown(get_pivot_legend(BASELINE_MODEL, LEAKAGE_IMPUTATION_MODEL), unsafe_allow_html=True)
+        st.dataframe(
+            construct_pivot_table(
+                summaries=load_summaries(),
+                metric_name=selected_metric,
+                baseline_model=BASELINE_MODEL,
+                leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
+            )
+        )
+
+    st.divider()
+    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
+    st.markdown(CITATION_HEADER)
+    st.markdown(CITATION_FEV)
requirements.txt CHANGED
@@ -1,3 +1,7 @@
-altair
 pandas
-streamlit
+matplotlib
+numpy
+pandas
+streamlit==1.49.1
+fev>=0.6.0
+altair>=5.5.0
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (168 Bytes).
src/__pycache__/colors.cpython-311.pyc ADDED
Binary file (327 Bytes).
src/__pycache__/strings.cpython-311.pyc ADDED
Binary file (7.32 kB).
src/__pycache__/utils.cpython-311.pyc ADDED
Binary file (17.4 kB).
src/colors.py ADDED
@@ -0,0 +1,6 @@
+# Legacy colors - kept for backward compatibility if needed elsewhere
+VERY_PALE_PURPLE = "#e8d9f3"
+VERY_PALE_GREEN = "#cffdbc"
+VERY_PALE_BLUE = "#d6fffe"
+DEEP_LAVENDER = "#8d5eb7"
+GRASS_GREEN = "#3f9b0b"
src/strings.py ADDED
@@ -0,0 +1,107 @@
+from src.utils import COLORS
+
+INTRODUCTION_TEXT = """
+This space hosts evaluation results for time series forecasting models. The results are obtained using [fev](https://github.com/autogluon/fev) - a lightweight library for evaluating time series forecasting models.
+"""
+
+LEGEND = """
+"""
+
+TABLE_INFO = f"""
+The leaderboard summarizes the performance of all models evaluated on the 100 tasks comprising `fev-bench`.
+
+Model names are colored by type: <span style='color: {COLORS["dl_text"]}; font-weight: bold;'>Deep Learning</span> and <span style='color: {COLORS["st_text"]}; font-weight: bold;'>Statistical</span>.
+
+The full matrix $E_{{rj}}$ with the error of each model $j$ on task $r$ is available at the bottom of the page.
+
+* **Avg. win rate (%)**: Fraction of all possible model pairs and tasks where this model achieves lower error than the competing model. For model $j$, defined as $W_j = \\frac{{1}}{{R(M-1)}} \\sum_{{r=1}}^{{R}} \\sum_{{k \\neq j}} (\\mathbf{{1}}(E_{{rj}} < E_{{rk}}) + 0.5 \\cdot \\mathbf{{1}}(E_{{rj}} = E_{{rk}}))$, where $R$ is the number of tasks and $M$ is the number of models. Ties count as half-wins. Ranges from 0% (worst) to 100% (best). Higher values are better.
+
+* **Skill score (%)**: Measures how much the model reduces forecasting error compared to the Seasonal Naive baseline. Computed as $S_j = 100 \\times (1 - \\sqrt[R]{{\\prod_{{r=1}}^{{R}} E_{{rj}}/E_{{r\\beta}}}})$, where $E_{{r\\beta}}$ is the baseline error on task $r$. Relative errors are clipped between 0.01 and 100 before aggregation to avoid extreme outliers. Positive values indicate better-than-baseline performance, negative values indicate worse-than-baseline performance. Higher values are better.
+
+* **Median runtime (s)**: Median end-to-end time (training + prediction across all evaluation windows) in seconds. Note that inference times depend on hardware, batch sizes, and implementation details, so these serve as a rough guide rather than definitive performance benchmarks.
+
+* **Leakage (%)**: For zero-shot models, the percentage of benchmark datasets included in the model's training corpus. Results for tasks with reported overlap are replaced with Chronos-Bolt (Base) performance to prevent data leakage.
+
+* **Failed tasks (%)**: Percentage of tasks where the model failed to produce a forecast. Results for failed tasks are replaced with Seasonal Naive performance.
+
+* **Zero-shot**: Indicates whether the model can make predictions without task-specific training (✓ = zero-shot, × = task-specific).
+"""
+
+CHRONOS_BENCHMARK_BASIC_INFO = f"""
+**Chronos Benchmark II** contains results for various forecasting models on the 27 datasets used in Benchmark II in the paper [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815). {LEGEND}
+"""
+
+CHRONOS_BENCHMARK_DETAILS = f"""
+{TABLE_INFO}
+
+Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information about the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).
+"""
+
+FEV_BENCHMARK_BASIC_INFO = f"""
+Results for various forecasting models on 100 tasks of the **fev-bench** benchmark, as described in the paper [fev-bench: A Realistic Benchmark for Time Series Forecasting](https://arxiv.org/abs/2509.26468). {LEGEND}
+"""
+
+FEV_BENCHMARK_DETAILS = f"""
+{TABLE_INFO}
+
+Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/). More information about the datasets is available in [Table 3 of the paper](https://arxiv.org/).
+"""
+
+CITATION_HEADER = """
+If you find this leaderboard useful for your research, please consider citing the associated paper(s):
+
+"""
+CITATION_FEV = """
+```
+@article{shchur2025fev,
+  title={{fev-bench}: A Realistic Benchmark for Time Series Forecasting},
+  author={Shchur, Oleksandr and Ansari, Abdul Fatir and Turkmen, Caner and Stella, Lorenzo and Erickson, Nick and Guerron, Pablo and Bohlke-Schneider, Michael and Wang, Yuyang},
+  year={2025},
+}
+```
+"""
+
+
+def get_pivot_legend(baseline_model: str, leakage_imputation_model: str) -> str:
+    return f"""
+Task definitions and raw results in CSV format are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/fev_bench).
+
+Best results for each task are marked with
+<span style='background: {COLORS["gold"]}; color: {COLORS["text_default"]}; padding: 3px; border-radius: 5px;'>🥇 1st</span>
+<span style='background: {COLORS["silver"]}; color: {COLORS["text_default"]}; padding: 3px; border-radius: 5px;'>🥈 2nd</span>
+<span style='background: {COLORS["bronze"]}; color: {COLORS["text_default"]}; padding: 3px; border-radius: 5px;'>🥉 3rd</span>
+<br><br>
+**Imputation:**
+- <span style='color: {COLORS["failure_impute"]}; font-weight: bold;'>Failed tasks</span> imputed by {baseline_model}
+- <span style='color: {COLORS["leakage_impute"]}; font-weight: bold;'>Leaky tasks</span> imputed by {leakage_imputation_model}
+"""
+
+
+PAIRWISE_BENCHMARK_DETAILS = """
+The pairwise charts show head-to-head results between models:
+
+* **Win rate**: Percentage of tasks where Model 1 achieves lower error than Model 2 (ties count as half-wins).
+A value above 50% means Model 1 is more accurate than Model 2 on average.
+
+* **Skill score**: Average relative error reduction of Model 1 with respect to Model 2.
+A positive value means Model 1 reduces forecasting error compared to Model 2 on average.
+
+**Confidence Intervals**: 95% intervals are estimated using 1000 bootstrap samples over tasks.
+For each bootstrap sample, tasks are resampled with replacement and the pairwise win rate / skill score are recomputed.
+The intervals correspond to the 2.5th and 97.5th percentiles of these bootstrap distributions,
+capturing how model comparisons vary under alternative benchmark compositions.
+"""
+
+
+CITATION_CHRONOS = """
+```
+@article{ansari2024chronos,
+  title={Chronos: Learning the Language of Time Series},
+  author={Ansari, Abdul Fatir and Stella, Lorenzo and Turkmen, Caner and Zhang, Xiyuan and Mercado, Pedro and Shen, Huibin and Shchur, Oleksandr and Rangapuram, Syama Sundar and Pineda Arango, Sebastian and Kapoor, Shubham and Zschiegner, Jasper and Maddix, Danielle C. and Wang, Hao and Mahoney, Michael W. and Torkkola, Kari and Gordon Wilson, Andrew and Bohlke-Schneider, Michael and Wang, Yuyang},
+  journal={Transactions on Machine Learning Research},
+  issn={2835-8856},
+  year={2024},
+  url={https://openreview.net/forum?id=gerNCVqqtR}
+}
```
+"""
src/utils.py ADDED
@@ -0,0 +1,294 @@
+import altair as alt
+import fev
+import pandas as pd
+import pandas.io.formats.style
+
+# Color constants - all colors defined in one place
+
+COLORS = {
+    "dl_text": "#5A7FA5",
+    "st_text": "#666666",
+    "bar_fill": "#8d5eb7",
+    "error_bar": "#222222",
+    "point": "#111111",
+    "text_white": "white",
+    "text_black": "black",
+    "text_default": "#111",
+    "gold": "#F7D36B",
+    "silver": "#E5E7EB",
+    "bronze": "#E6B089",
+    "leakage_impute": "#3B82A0",
+    "failure_impute": "#E07B39",
+}
+HEATMAP_COLOR_SCHEME = "purplegreen"
+
+# Model configuration: (url, org, zero_shot, model_type)
+MODEL_CONFIG = {
+    # Chronos Models
+    "chronos_tiny": ("amazon/chronos-t5-tiny", "AWS", True, "DL"),
+    "chronos_mini": ("amazon/chronos-t5-mini", "AWS", True, "DL"),
+    "chronos_small": ("amazon/chronos-t5-small", "AWS", True, "DL"),
+    "chronos_base": ("amazon/chronos-t5-base", "AWS", True, "DL"),
+    "chronos_large": ("amazon/chronos-t5-large", "AWS", True, "DL"),
+    "chronos_bolt_tiny": ("amazon/chronos-bolt-tiny", "AWS", True, "DL"),
+    "chronos_bolt_mini": ("amazon/chronos-bolt-mini", "AWS", True, "DL"),
+    "chronos_bolt_small": ("amazon/chronos-bolt-small", "AWS", True, "DL"),
+    "chronos_bolt_base": ("amazon/chronos-bolt-base", "AWS", True, "DL"),
+    "chronos-bolt": ("amazon/chronos-bolt-base", "AWS", True, "DL"),
+    # Moirai Models
+    "moirai_large": ("Salesforce/moirai-1.1-R-large", "Salesforce", True, "DL"),
+    "moirai_base": ("Salesforce/moirai-1.1-R-base", "Salesforce", True, "DL"),
+    "moirai_small": ("Salesforce/moirai-1.1-R-small", "Salesforce", True, "DL"),
+    "moirai-2.0": ("Salesforce/moirai-2.0-R-small", "Salesforce", True, "DL"),
+    # TimesFM Models
+    "timesfm": ("google/timesfm-1.0-200m-pytorch", "Google", True, "DL"),
+    "timesfm-2.0": ("google/timesfm-2.0-500m-pytorch", "Google", True, "DL"),
+    "timesfm-2.5": ("google/timesfm-2.5-200m-pytorch", "Google", True, "DL"),
+    # Toto Models
+    "toto-1.0": ("Datadog/Toto-Open-Base-1.0", "Datadog", True, "DL"),
+    # Other Models
+    "tirex": ("NX-AI/TiRex", "NX-AI", True, "DL"),
+    "tabpfn-ts": ("Prior-Labs/TabPFN-v2-reg", "Prior Labs", True, "DL"),
+    "sundial-base": ("thuml/sundial-base-128m", "Tsinghua University", True, "DL"),
+    "ttm-r2": ("ibm-granite/granite-timeseries-ttm-r2", "IBM", True, "DL"),
+    # Task-specific models
+    "stat. ensemble": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
+    "autoarima": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
+    "autotheta": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
+    "autoets": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
+    "seasonalnaive": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
+    "seasonal naive": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
+    "drift": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
+    "naive": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
+}
+
+
+ALL_METRICS = {
+    "SQL": (
+        "SQL: Scaled Quantile Loss",
+        "The [Scaled Quantile Loss (SQL)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.SQL) is a scale-invariant metric for evaluating probabilistic forecasts.",
+    ),
+    "MASE": (
+        "MASE: Mean Absolute Scaled Error",
+        "The [Mean Absolute Scaled Error (MASE)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.MASE) is a scale-invariant metric for evaluating point forecasts.",
+    ),
+    "WQL": (
+        "WQL: Weighted Quantile Loss",
+        "The [Weighted Quantile Loss (WQL)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.WQL) is a scale-dependent metric for evaluating probabilistic forecasts.",
+    ),
+    "WAPE": (
+        "WAPE: Weighted Absolute Percentage Error",
+        "The [Weighted Absolute Percentage Error (WAPE)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.WAPE) is a scale-dependent metric for evaluating point forecasts.",
+    ),
+}
+
+
+def format_metric_name(metric_name: str):
+    return ALL_METRICS[metric_name][0]
+
+
+def get_metric_description(metric_name: str):
+    return ALL_METRICS[metric_name][1]
+
+
+def get_model_link(model_name):
+    config = MODEL_CONFIG.get(model_name.lower())
+    if not config or not config[0]:
+        return ""
+    url = config[0]
+    return url if url.startswith("https:") else f"https://huggingface.co/{url}"
+
+
+def get_model_organization(model_name):
+    config = MODEL_CONFIG.get(model_name.lower())
+    return config[1] if config else "—"
+
+
+def get_zero_shot_status(model_name):
+    config = MODEL_CONFIG.get(model_name.lower())
+    return "✓" if config and config[2] else "×"
+
+
+def get_model_type(model_name):
+    config = MODEL_CONFIG.get(model_name.lower())
+    return config[3] if config else "—"
+
+
+def highlight_model_type_color(cell):
+    config = MODEL_CONFIG.get(cell.lower())
+    if config:
+        color = COLORS["dl_text"] if config[3] == "DL" else COLORS["st_text"]
+        return f"font-weight: bold; color: {color}"
+    return "font-weight: bold"
+
+
+def format_leaderboard(df: pd.DataFrame):
+    df = df.copy()
+    df["skill_score"] = df["skill_score"].round(1)
+    df["win_rate"] = df["win_rate"].round(1)
+    df["zero_shot"] = df["model_name"].apply(get_zero_shot_status)
+    # Format leakage column: convert to int for all models, 0 for non-zero-shot
+    df["training_corpus_overlap"] = df.apply(
+        lambda row: int(round(row["training_corpus_overlap"] * 100)) if row["zero_shot"] == "✓" else 0, axis=1
+    )
+    df["link"] = df["model_name"].apply(get_model_link)
+    df["org"] = df["model_name"].apply(get_model_organization)
+    df = df[
+        [
+            "model_name",
+            "win_rate",
+            "skill_score",
+            "median_inference_time_s",
+            "training_corpus_overlap",
+            "num_failures",
+            "zero_shot",
+            "org",
+            "link",
+        ]
+    ]
+    return df.style.map(highlight_model_type_color, subset=["model_name"]).map(
+        lambda x: "font-weight: bold", subset=["zero_shot"]
+    ).apply(lambda x: ['background-color: #f8f9fa' if i % 2 == 1 else '' for i in range(len(x))], axis=0)
+
+
+def construct_bar_chart(df: pd.DataFrame, col: str, metric_name: str):
+    label = "Skill Score" if col == "skill_score" else "Win Rate"
+
+    tooltip = [
+        alt.Tooltip("model_name:N"),
+        alt.Tooltip(f"{col}:Q", format=".2f"),
+        alt.Tooltip(f"{col}_lower:Q", title="95% CI Lower", format=".2f"),
+        alt.Tooltip(f"{col}_upper:Q", title="95% CI Upper", format=".2f"),
+    ]
+
+    base_encode = {"y": alt.Y("model_name:N", title="Forecasting Model", sort=None), "tooltip": tooltip}
+
+    bars = (
+        alt.Chart(df)
+        .mark_bar(color=COLORS["bar_fill"], cornerRadius=4)
+        .encode(x=alt.X(f"{col}:Q", title=f"{label} (%)", scale=alt.Scale(zero=False)), **base_encode)
+    )
+
+    error_bars = (
+        alt.Chart(df)
+        .mark_errorbar(ticks={"height": 5}, color=COLORS["error_bar"])
+        .encode(
+            y=alt.Y("model_name:N", title=None, sort=None),
+            x=alt.X(f"{col}_lower:Q", title=f"{label} (%)"),
+            x2=alt.X2(f"{col}_upper:Q"),
+            tooltip=tooltip,
+        )
+    )
+
+    points = (
+        alt.Chart(df)
+        .mark_point(filled=True, color=COLORS["point"])
+        .encode(x=alt.X(f"{col}:Q", title=f"{label} (%)"), **base_encode)
+    )
+
+    return (
+        (bars + error_bars + points)
+        .properties(height=500, title=f"{label} ({metric_name}) with 95% CIs")
+        .configure_title(fontSize=16)
+    )
+
+
+def construct_pairwise_chart(df: pd.DataFrame, col: str, metric_name: str):
+    config = {
+        "win_rate": ("Win Rate", [0, 100], 50, f"abs(datum.{col} - 50) > 30"),
+        "skill_score": ("Skill Score", [-15, 15], 0, f"abs(datum.{col}) > 10"),
+    }
+    cbar_label, domain, domain_mid, text_condition = config[col]
+
+    df = df.copy()
+    for c in [col, f"{col}_lower", f"{col}_upper"]:
+        df[c] *= 100
+
+    model_order = df.groupby("model_1")[col].mean().sort_values(ascending=False).index.tolist()
+
+    tooltip = [
+        alt.Tooltip("model_1:N", title="Model 1"),
+        alt.Tooltip("model_2:N", title="Model 2"),
+        alt.Tooltip(f"{col}:Q", title=cbar_label.split(" ")[0], format=".1f"),
+        alt.Tooltip(f"{col}_lower:Q", title="95% CI Lower", format=".1f"),
+        alt.Tooltip(f"{col}_upper:Q", title="95% CI Upper", format=".1f"),
+    ]
+
+    base = alt.Chart(df).encode(
+        x=alt.X("model_2:N", sort=model_order, title="Model 2", axis=alt.Axis(orient="top", labelAngle=-90)),
+        y=alt.Y("model_1:N", sort=model_order, title="Model 1"),
+    )
+
+    heatmap = base.mark_rect().encode(
+        color=alt.Color(
+            f"{col}:Q",
+            legend=alt.Legend(title=f"{cbar_label} (%)", direction="vertical", orient="right"),
+            scale=alt.Scale(scheme=HEATMAP_COLOR_SCHEME, domain=domain, domainMid=domain_mid, clamp=True),
+        ),
+        tooltip=tooltip,
+    )
+
+    text_main = base.mark_text(dy=-8, fontSize=8, baseline="top", yOffset=5).encode(
+        text=alt.Text(f"{col}:Q", format=".1f"),
+        color=alt.condition(text_condition, alt.value(COLORS["text_white"]), alt.value(COLORS["text_black"])),
+        tooltip=tooltip,
+    )
+
+    return (
+        (heatmap + text_main)
+        .properties(height=550, title={"text": f"Pairwise {cbar_label} ({metric_name}) with 95% CIs", "fontSize": 16})
+        .configure_axis(labelFontSize=11, titleFontSize=13, titleFontWeight="bold")
+        .resolve_scale(color="independent")
+    )
+
+
+def construct_pivot_table(
+    summaries: pd.DataFrame, metric_name: str, baseline_model: str, leakage_imputation_model: str
+) -> pd.io.formats.style.Styler:
+    errors = fev.pivot_table(summaries=summaries, metric_column=metric_name, task_columns=["task_name"])
+    train_overlap = (
+        fev.pivot_table(summaries=summaries, metric_column="trained_on_this_dataset", task_columns=["task_name"])
+        .fillna(False)
+        .astype(bool)
+    )
+
+    is_imputed_baseline = errors.isna()
+    is_leakage_imputed = train_overlap
+
+    # Handle imputations
+    errors = errors.mask(train_overlap, errors[leakage_imputation_model], axis=0)
+    for col in errors.columns:
+        if col != baseline_model:
+            errors[col] = errors[col].fillna(errors[baseline_model])
+
+    errors = errors[errors.rank(axis=1).mean().sort_values().index]
+    errors.index.rename("Task name", inplace=True)
+
+    def highlight_by_position(styler):
+        rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}
+
+        for row_idx in errors.index:
+            row_ranks = errors.loc[row_idx].rank(method="min")
+            for col_idx in errors.columns:
+                rank = row_ranks[col_idx]
+                style_parts = []
+
+                # Rank background colors
+                if rank <= 3:
+                    style_parts.append(f"background-color: {rank_colors[rank]}")
+
+                # Imputation text colors
+                if is_leakage_imputed.loc[row_idx, col_idx]:
+                    style_parts.append(f"color: {COLORS['leakage_impute']}")
+                elif is_imputed_baseline.loc[row_idx, col_idx]:
+                    style_parts.append(f"color: {COLORS['failure_impute']}")
+                elif not style_parts or (len(style_parts) == 1 and "font-weight" in style_parts[0]):
+                    style_parts.append(f"color: {COLORS['text_default']}")
+
+                if style_parts:
+                    styler = styler.map(
+                        lambda x, s="; ".join(style_parts): s, subset=pd.IndexSlice[row_idx:row_idx, col_idx:col_idx]
+                    )
+        return styler
+
+    return highlight_by_position(errors.style).format(precision=3)
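
`construct_pairwise_chart` consumes the `{col}_lower` / `{col}_upper` columns that `fev.analysis.pairwise_comparison` produces with `n_resamples` bootstrap draws. The sketch below illustrates the task-level bootstrap procedure described in `PAIRWISE_BENCHMARK_DETAILS` (resample tasks with replacement, recompute the statistic, take the 2.5th/97.5th percentiles); it is not the fev implementation, and the error values are synthetic.

```python
# Sketch of the task-level bootstrap behind the 95% CIs plotted by
# construct_pairwise_chart(). Illustration only; NOT fev.analysis.pairwise_comparison.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# Hypothetical per-task errors for two models on 30 tasks.
errors = pd.DataFrame(
    {"model_1": rng.uniform(0.5, 1.5, size=30), "model_2": rng.uniform(0.5, 1.5, size=30)}
)


def pairwise_win_rate(df: pd.DataFrame) -> float:
    """% of tasks where model_1 has lower error than model_2; ties count as half-wins."""
    wins = (df["model_1"] < df["model_2"]).mean()
    ties = (df["model_1"] == df["model_2"]).mean()
    return 100 * (wins + 0.5 * ties)


point_estimate = pairwise_win_rate(errors)

# Resample tasks with replacement and recompute the statistic for each bootstrap draw.
n_resamples = 1000
boot = np.array(
    [pairwise_win_rate(errors.sample(frac=1.0, replace=True, random_state=i)) for i in range(n_resamples)]
)

# 95% CI = 2.5th and 97.5th percentiles of the bootstrap distribution.
win_rate_lower, win_rate_upper = np.percentile(boot, [2.5, 97.5])
print(f"win_rate={point_estimate:.1f}%  CI=[{win_rate_lower:.1f}, {win_rate_upper:.1f}]")
```

Because tasks (not individual time series) are resampled, the intervals reflect how the comparison would shift under alternative benchmark compositions, which is exactly what the leaderboard text claims.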
summaries.csv ADDED
The diff for this file is too large to render.