Add new leaderboard
- Dockerfile +2 -2
- fev-leaderboard-app.py +10 -0
- pages/about.py +18 -0
- pages/chronos_bench_ii.py +179 -0
- pages/fev_bench.py +151 -0
- requirements.txt +6 -2
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/colors.cpython-311.pyc +0 -0
- src/__pycache__/strings.cpython-311.pyc +0 -0
- src/__pycache__/utils.cpython-311.pyc +0 -0
- src/colors.py +6 -0
- src/strings.py +107 -0
- src/utils.py +294 -0
- summaries.csv +0 -0
Dockerfile
CHANGED
```diff
@@ -9,7 +9,7 @@ RUN apt-get update && apt-get install -y \
     && rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt ./
-COPY
+COPY . .
 
 RUN pip3 install -r requirements.txt
 
@@ -17,4 +17,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
-ENTRYPOINT ["streamlit", "run", "
+ENTRYPOINT ["streamlit", "run", "fev-leaderboard-app.py", "--server.port=8501", "--server.address=0.0.0.0"]
```
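With `COPY . .`, the whole repository (the `pages/` and `src/` directories plus `summaries.csv`) is baked into the image, and the updated `ENTRYPOINT` serves the new entry script. For a local smoke test, something like `docker build -t fev-leaderboard .` followed by `docker run -p 8501:8501 fev-leaderboard` should expose the app on port 8501; the `fev-leaderboard` image tag is an arbitrary choice for illustration, not part of the commit.
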
fev-leaderboard-app.py
ADDED
```python
import streamlit as st

pages = [
    st.Page("pages/fev_bench.py", title="fev-bench", icon=":material/trophy:"),
    st.Page("pages/chronos_bench_ii.py", title="Chronos Benchmark II", icon=":material/trophy:"),
    st.Page("pages/about.py", title="About", icon=":material/info:"),
]

page = st.navigation(pages)
page.run()
```
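The entry point uses Streamlit's `st.navigation`/`st.Page` multipage API: each `st.Page` points at a script under `pages/`, and `page.run()` executes whichever page is currently selected. During development the app can also be started outside the container with `streamlit run fev-leaderboard-app.py`.
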
pages/about.py
ADDED
```python
import streamlit as st

ABOUT_LEADERBOARD = """
## About `fev`

**fev** is a lightweight wrapper around the 🤗 [datasets](https://huggingface.co/docs/datasets/en/index) library designed to streamline
time series forecasting model benchmarking.

### 📚 Resources
- **Documentation**: [Official docs](https://autogluon.github.io/fev/latest/)
- **Source Code**: [GitHub repository](https://github.com/autogluon/fev)
- **Issues & Questions**: [GitHub Issues](https://github.com/autogluon/fev/issues)

### 🚀 Submit Your Model
Ready to add your model to the leaderboard? Follow this [tutorial](https://autogluon.github.io/fev/latest/tutorials/04-models/) to evaluate your model with fev and contribute your results.
"""
st.set_page_config(layout="wide", page_title="About FEV", page_icon=":material/info:")
st.markdown(ABOUT_LEADERBOARD)
```
pages/chronos_bench_ii.py
ADDED
```python
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent.parent))

import fev
import pandas as pd
import streamlit as st
from streamlit.elements.lib.column_types import ColumnConfig

from src.strings import (
    CHRONOS_BENCHMARK_BASIC_INFO,
    CHRONOS_BENCHMARK_DETAILS,
    CITATION_CHRONOS,
    CITATION_FEV,
    CITATION_HEADER,
    PAIRWISE_BENCHMARK_DETAILS,
    get_pivot_legend,
)
from src.utils import (
    construct_bar_chart,
    construct_pairwise_chart,
    construct_pivot_table,
    format_leaderboard,
    format_metric_name,
    get_metric_description,
)

st.set_page_config(layout="wide", page_title="FEV Benchmark Leaderboard", page_icon=":material/trophy:")

TITLE = "<h1 style='text-align: center; font-size: 350%;'>Chronos Benchmark II</h1>"
BASELINE_MODEL = "seasonal_naive"
LEAKAGE_IMPUTATION_MODEL = "chronos_bolt_base"
SORT_COL = "win_rate"
N_RESAMPLES_FOR_CI = 1000
TOP_K_MODELS_TO_PLOT = 15
AVAILABLE_METRICS = ["WQL", "MASE"]
SUMMARY_URLS = [
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_arima.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_ets.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_theta.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_large.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_mini.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_tiny.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_mini.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_tiny.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_large.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/seasonal_naive.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm-2.0.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/ttm-r2.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/tirex.csv",
]


@st.cache_data()
def load_summaries():
    summaries = []
    for url in SUMMARY_URLS:
        df = pd.read_csv(url)
        summaries.append(df)
    return pd.concat(summaries, ignore_index=True)


@st.cache_data()
def get_leaderboard(metric_name: str) -> pd.DataFrame:
    summaries = load_summaries()
    lb = fev.analysis.leaderboard(
        summaries=summaries,
        metric_column=metric_name,
        missing_strategy="impute",
        baseline_model=BASELINE_MODEL,
        leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
    )
    lb = lb.astype("float64").reset_index()

    lb["skill_score"] = lb["skill_score"] * 100
    lb["win_rate"] = lb["win_rate"] * 100
    lb["num_failures"] = lb["num_failures"] / summaries["task_name"].nunique() * 100
    return lb


@st.cache_data()
def get_pairwise(metric_name: str, included_models: list[str]) -> pd.DataFrame:
    if BASELINE_MODEL not in included_models:
        included_models = included_models + [BASELINE_MODEL]
    summaries = load_summaries()
    return (
        fev.analysis.pairwise_comparison(
            summaries,
            included_models=included_models,
            metric_column=metric_name,
            baseline_model=BASELINE_MODEL,
            missing_strategy="impute",
            n_resamples=N_RESAMPLES_FOR_CI,
            leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
        )
        .round(3)
        .reset_index()
    )


with st.sidebar:
    selected_metric = st.selectbox("Evaluation Metric", options=AVAILABLE_METRICS, format_func=format_metric_name)
    st.caption(get_metric_description(selected_metric))

cols = st.columns(spec=[0.025, 0.95, 0.025])

with cols[1] as main_container:
    st.markdown(TITLE, unsafe_allow_html=True)

    metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
    top_k_models = metric_df.head(TOP_K_MODELS_TO_PLOT)["model_name"].tolist()
    pairwise_df = get_pairwise(selected_metric, included_models=top_k_models)

    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
    st.markdown(CHRONOS_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
    df_styled = format_leaderboard(metric_df)
    st.dataframe(
        df_styled,
        use_container_width=True,
        hide_index=True,
        column_config={
            "model_name": ColumnConfig(label="Model Name", alignment="left"),
            "win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
            "skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
            "median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
            "training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
            "num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
            "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
            "org": ColumnConfig(label="Organization", alignment="left"),
            "link": st.column_config.LinkColumn(label="Link", display_text=":material/open_in_new:"),
        },
    )

    with st.expander("See details"):
        st.markdown(CHRONOS_BENCHMARK_DETAILS, unsafe_allow_html=True)

    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])

    with chart_col_1:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="win_rate", metric_name=selected_metric),
            use_container_width=True,
        )

    with chart_col_2:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="skill_score", metric_name=selected_metric),
            use_container_width=True,
        )

    with st.expander("See details"):
        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)

    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
    with st.expander("Show detailed results"):
        st.markdown(get_pivot_legend(BASELINE_MODEL, LEAKAGE_IMPUTATION_MODEL), unsafe_allow_html=True)
        st.dataframe(
            construct_pivot_table(
                summaries=load_summaries(),
                metric_name=selected_metric,
                baseline_model=BASELINE_MODEL,
                leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
            )
        )

    st.divider()
    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
    st.markdown(CITATION_HEADER)
    st.markdown(CITATION_FEV)
    st.markdown(CITATION_CHRONOS)
```
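`get_pairwise` requests confidence intervals via `n_resamples=N_RESAMPLES_FOR_CI`, and the `PAIRWISE_BENCHMARK_DETAILS` text in `src/strings.py` (further down) describes the procedure: resample tasks with replacement, recompute the pairwise statistic, and take the 2.5th/97.5th percentiles. A minimal standalone sketch of that idea for the pairwise win rate, an illustration of the described procedure rather than the actual `fev.analysis.pairwise_comparison` internals:

```python
import numpy as np
import pandas as pd


def bootstrap_win_rate_ci(errors_1: pd.Series, errors_2: pd.Series, n_resamples: int = 1000, seed: int = 0):
    """95% bootstrap CI for the win rate of model 1 over model 2 (ties count as half-wins)."""
    rng = np.random.default_rng(seed)
    e1, e2 = errors_1.to_numpy(), errors_2.to_numpy()
    n_tasks = len(e1)
    stats = []
    for _ in range(n_resamples):
        idx = rng.integers(0, n_tasks, size=n_tasks)  # resample tasks with replacement
        win_rate = (e1[idx] < e2[idx]).mean() + 0.5 * (e1[idx] == e2[idx]).mean()
        stats.append(100 * win_rate)
    # 2.5th and 97.5th percentiles of the bootstrap distribution
    return np.percentile(stats, [2.5, 97.5])


# Toy example: per-task errors of two models on 10 tasks
rng = np.random.default_rng(1)
errors_1 = pd.Series(rng.uniform(0.5, 1.5, size=10))
errors_2 = pd.Series(rng.uniform(0.5, 1.5, size=10))
lower, upper = bootstrap_win_rate_ci(errors_1, errors_2)
print(f"win rate 95% CI: [{lower:.1f}%, {upper:.1f}%]")
```
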
pages/fev_bench.py
ADDED
```python
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).parent.parent))

import fev
import pandas as pd
import streamlit as st
from streamlit.elements.lib.column_types import ColumnConfig

from src.strings import (
    CITATION_FEV,
    CITATION_HEADER,
    FEV_BENCHMARK_BASIC_INFO,
    FEV_BENCHMARK_DETAILS,
    PAIRWISE_BENCHMARK_DETAILS,
    get_pivot_legend,
)
from src.utils import (
    construct_pairwise_chart,
    construct_pivot_table,
    format_leaderboard,
    format_metric_name,
    get_metric_description,
)

st.set_page_config(layout="wide", page_title="fev leaderboard", page_icon=":material/trophy:")

TITLE = "<h1 style='text-align: center; font-size: 350%;'>fev-bench</h1>"
BASELINE_MODEL = "Seasonal Naive"
LEAKAGE_IMPUTATION_MODEL = "Chronos-Bolt"
SORT_COL = "win_rate"
N_RESAMPLES_FOR_CI = 1000
TOP_K_MODELS_TO_PLOT = 15
AVAILABLE_METRICS = ["SQL", "MASE", "WQL", "WAPE"]


@st.cache_data()
def load_summaries():
    summaries = pd.read_csv("summaries.csv")
    return summaries


@st.cache_data()
def get_leaderboard(metric_name: str) -> pd.DataFrame:
    summaries = load_summaries()
    lb = fev.analysis.leaderboard(
        summaries=summaries,
        metric_column=metric_name,
        missing_strategy="impute",
        baseline_model=BASELINE_MODEL,
        leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
    )
    lb = lb.astype("float64").reset_index()

    lb["skill_score"] = lb["skill_score"] * 100
    lb["win_rate"] = lb["win_rate"] * 100
    lb["num_failures"] = lb["num_failures"] / summaries["task_name"].nunique() * 100
    return lb


@st.cache_data()
def get_pairwise(metric_name: str, included_models: list[str]) -> pd.DataFrame:
    if BASELINE_MODEL not in included_models:
        included_models = included_models + [BASELINE_MODEL]
    summaries = load_summaries()
    return (
        fev.analysis.pairwise_comparison(
            summaries,
            included_models=included_models,
            metric_column=metric_name,
            baseline_model=BASELINE_MODEL,
            missing_strategy="impute",
            n_resamples=N_RESAMPLES_FOR_CI,
            leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
        )
        .round(3)
        .reset_index()
    )


with st.sidebar:
    selected_metric = st.selectbox("Evaluation Metric", options=AVAILABLE_METRICS, format_func=format_metric_name)
    st.caption(get_metric_description(selected_metric))

cols = st.columns(spec=[0.025, 0.95, 0.025])

with cols[1] as main_container:
    st.markdown(TITLE, unsafe_allow_html=True)

    metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
    top_k_models = metric_df.head(TOP_K_MODELS_TO_PLOT)["model_name"].tolist()
    pairwise_df = get_pairwise(selected_metric, included_models=top_k_models)

    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
    st.markdown(FEV_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
    df_styled = format_leaderboard(metric_df)
    st.dataframe(
        df_styled,
        use_container_width=True,
        hide_index=True,
        column_config={
            "model_name": ColumnConfig(label="Model Name", alignment="left"),
            "win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
            "skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
            "median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
            "training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
            "num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
            "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
            "org": ColumnConfig(label="Organization", alignment="left"),
            "link": st.column_config.LinkColumn(label="Link", display_text="🔗"),
        },
    )

    with st.expander("See details"):
        st.markdown(FEV_BENCHMARK_DETAILS, unsafe_allow_html=True)

    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])

    with chart_col_1:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="win_rate", metric_name=selected_metric),
            use_container_width=True,
        )

    with chart_col_2:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="skill_score", metric_name=selected_metric),
            use_container_width=True,
        )

    with st.expander("See details"):
        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)

    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
    with st.expander("Show detailed results"):
        st.markdown(get_pivot_legend(BASELINE_MODEL, LEAKAGE_IMPUTATION_MODEL), unsafe_allow_html=True)
        st.dataframe(
            construct_pivot_table(
                summaries=load_summaries(),
                metric_name=selected_metric,
                baseline_model=BASELINE_MODEL,
                leakage_imputation_model=LEAKAGE_IMPUTATION_MODEL,
            )
        )

    st.divider()
    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
    st.markdown(CITATION_HEADER)
    st.markdown(CITATION_FEV)
```
requirements.txt
CHANGED
```diff
@@ -1,3 +1,7 @@
-altair
 pandas
-
+matplotlib
+numpy
+pandas
+streamlit==1.49.1
+fev>=0.6.0
+altair>=5.5.0
```
src/__init__.py
ADDED
Empty file.
src/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (168 Bytes)

src/__pycache__/colors.cpython-311.pyc
ADDED
Binary file (327 Bytes)

src/__pycache__/strings.cpython-311.pyc
ADDED
Binary file (7.32 kB)

src/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (17.4 kB)
src/colors.py
ADDED
```python
# Legacy colors - kept for backward compatibility if needed elsewhere
VERY_PALE_PURPLE = "#e8d9f3"
VERY_PALE_GREEN = "#cffdbc"
VERY_PALE_BLUE = "#d6fffe"
DEEP_LAVENDER = "#8d5eb7"
GRASS_GREEN = "#3f9b0b"
```
src/strings.py
ADDED
````python
from src.utils import COLORS

INTRODUCTION_TEXT = """
This space hosts evaluation results for time series forecasting models. The results are obtained using [fev](https://github.com/autogluon/fev) - a lightweight library for evaluating time series forecasting models.
"""

LEGEND = """
"""

TABLE_INFO = f"""
The leaderboard summarizes the performance of all models evaluated on the 100 tasks comprising `fev-bench`.

Model names are colored by type: <span style='color: {COLORS["dl_text"]}; font-weight: bold;'>Deep Learning</span> and <span style='color: {COLORS["st_text"]}; font-weight: bold;'>Statistical</span>.

The full matrix $E_{{rj}}$ with the error of each model $j$ on task $r$ is available at the bottom of the page.

* **Avg. win rate (%)**: Fraction of all possible model pairs and tasks where this model achieves lower error than the competing model. For model $j$, defined as $W_j = \\frac{{1}}{{R(M-1)}} \\sum_{{r=1}}^{{R}} \\sum_{{k \\neq j}} (\\mathbf{{1}}(E_{{rj}} < E_{{rk}}) + 0.5 \\cdot \\mathbf{{1}}(E_{{rj}} = E_{{rk}}))$, where $R$ is the number of tasks and $M$ is the number of models. Ties count as half-wins. Ranges from 0% (worst) to 100% (best). Higher values are better.

* **Skill score (%)**: Measures how much the model reduces forecasting error compared to the Seasonal Naive baseline. Computed as $S_j = 100 \\times (1 - \\sqrt[R]{{\\prod_{{r=1}}^{{R}} E_{{rj}}/E_{{r\\beta}}}})$, where $E_{{r\\beta}}$ is the baseline error on task $r$. Relative errors are clipped between 0.01 and 100 before aggregation to avoid extreme outliers. Positive values indicate better-than-baseline performance, negative values indicate worse-than-baseline performance. Higher values are better.

* **Median runtime (s)**: Median end-to-end time (training + prediction across all evaluation windows) in seconds. Note that inference times depend on hardware, batch sizes, and implementation details, so these serve as a rough guide rather than definitive performance benchmarks.

* **Leakage (%)**: For zero-shot models, the percentage of benchmark datasets included in the model's training corpus. Results for tasks with reported overlap are replaced with Chronos-Bolt (Base) performance to prevent data leakage.

* **Failed tasks (%)**: Percentage of tasks where the model failed to produce a forecast. Results for failed tasks are replaced with Seasonal Naive performance.

* **Zero-shot**: Indicates whether the model can make predictions without task-specific training (✓ = zero-shot, × = task-specific).
"""

CHRONOS_BENCHMARK_BASIC_INFO = f"""
**Chronos Benchmark II** contains results for various forecasting models on the 27 datasets used in Benchmark II in the paper [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815). {LEGEND}
"""

CHRONOS_BENCHMARK_DETAILS = f"""
{TABLE_INFO}

Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information for the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).
"""

FEV_BENCHMARK_BASIC_INFO = f"""
Results for various forecasting models on 100 tasks of the **fev-bench** benchmark, as described in the paper [fev-bench: A Realistic Benchmark for Time Series Forecasting](https://arxiv.org/abs/2509.26468). {LEGEND}
"""

FEV_BENCHMARK_DETAILS = f"""
{TABLE_INFO}

Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/). More information for the datasets is available in [Table 3 of the paper](https://arxiv.org/).
"""

CITATION_HEADER = """
If you find this leaderboard useful for your research, please consider citing the associated paper(s):
"""
CITATION_FEV = """
```
@article{shchur2025fev,
  title={{fev-bench}: A Realistic Benchmark for Time Series Forecasting},
  author={Shchur, Oleksandr and Ansari, Abdul Fatir and Turkmen, Caner and Stella, Lorenzo and Erickson, Nick and Guerron, Pablo and Bohlke-Schneider, Michael and Wang, Yuyang},
  year={2025},
}
```
"""


def get_pivot_legend(baseline_model: str, leakage_imputation_model: str) -> str:
    return f"""
Task definitions and raw results in CSV format are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/fev_bench).

Best results for each task are marked with
<span style='background: {COLORS["gold"]}; color: {COLORS["text_default"]}; padding: 3px; border-radius: 5px;'>🥇 1st</span>
<span style='background: {COLORS["silver"]}; color: {COLORS["text_default"]}; padding: 3px; border-radius: 5px;'>🥈 2nd</span>
<span style='background: {COLORS["bronze"]}; color: {COLORS["text_default"]}; padding: 3px; border-radius: 5px;'>🥉 3rd</span>
<br><br>
**Imputation:**
- <span style='color: {COLORS["failure_impute"]}; font-weight: bold;'>Failed tasks</span> imputed by {baseline_model}
- <span style='color: {COLORS["leakage_impute"]}; font-weight: bold;'>Leaky tasks</span> imputed by {leakage_imputation_model}
"""


PAIRWISE_BENCHMARK_DETAILS = """
The pairwise charts show head-to-head results between models:

* **Win rate**: Percentage of tasks where Model 1 achieves lower error than Model 2 (ties count as half-wins).
  A value above 50% means Model 1 is more accurate than Model 2 on average.

* **Skill score**: Average relative error reduction of Model 1 with respect to Model 2.
  A positive value means Model 1 reduces forecasting error compared to Model 2 on average.

**Confidence Intervals**: 95% intervals are estimated using 1000 bootstrap samples over tasks.
For each bootstrap sample, tasks are resampled with replacement and the pairwise win rate / skill score are recomputed.
The intervals correspond to the 2.5th and 97.5th percentiles of these bootstrap distributions,
capturing how model comparisons vary under alternative benchmark compositions.
"""


CITATION_CHRONOS = """
```
@article{ansari2024chronos,
  title={Chronos: Learning the Language of Time Series},
  author={Ansari, Abdul Fatir and Stella, Lorenzo and Turkmen, Caner and Zhang, Xiyuan and Mercado, Pedro and Shen, Huibin and Shchur, Oleksandr and Rangapuram, Syama Sundar and Pineda Arango, Sebastian and Kapoor, Shubham and Zschiegner, Jasper and Maddix, Danielle C. and Wang, Hao and Mahoney, Michael W. and Torkkola, Kari and Gordon Wilson, Andrew and Bohlke-Schneider, Michael and Wang, Yuyang},
  journal={Transactions on Machine Learning Research},
  issn={2835-8856},
  year={2024},
  url={https://openreview.net/forum?id=gerNCVqqtR}
}
```
"""
````
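The win-rate and skill-score definitions in `TABLE_INFO` map directly onto array operations over the task-by-model error matrix $E$. A self-contained sketch that implements both formulas, including the 0.01-100 clipping of relative errors; variable names are illustrative and this is not code from the `fev` library itself:

```python
import numpy as np
import pandas as pd


def win_rate(errors: pd.DataFrame) -> pd.Series:
    """Avg. win rate per model: share of (task, opponent) pairs won; ties count as half-wins."""
    E = errors.to_numpy()  # shape (R tasks, M models)
    R, M = E.shape
    # wins[j] counts pairs (r, k) with E[r, j] < E[r, k]
    wins = (E[:, :, None] < E[:, None, :]).sum(axis=(0, 2)).astype(float)
    # subtract the R trivial self-comparisons from the tie counts
    ties = (E[:, :, None] == E[:, None, :]).sum(axis=(0, 2)) - R
    return pd.Series(100 * (wins + 0.5 * ties) / (R * (M - 1)), index=errors.columns)


def skill_score(errors: pd.DataFrame, baseline: str) -> pd.Series:
    """Skill score per model: 1 minus the geometric mean of clipped relative errors vs. the baseline."""
    rel = errors.div(errors[baseline], axis=0).clip(lower=0.01, upper=100)
    return 100 * (1 - np.exp(np.log(rel).mean(axis=0)))


# Toy example: 3 tasks x 3 models
errors = pd.DataFrame(
    {"model_a": [0.8, 0.9, 1.1], "model_b": [1.0, 1.2, 0.9], "seasonal_naive": [1.0, 1.0, 1.0]}
)
print(win_rate(errors))
print(skill_score(errors, baseline="seasonal_naive"))
```

On the toy matrix, `seasonal_naive` gets a skill score of exactly 0 by construction, since every relative error against itself is 1.
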
src/utils.py
ADDED
```python
import altair as alt
import fev
import pandas as pd
import pandas.io.formats.style

# Color constants - all colors defined in one place
COLORS = {
    "dl_text": "#5A7FA5",
    "st_text": "#666666",
    "bar_fill": "#8d5eb7",
    "error_bar": "#222222",
    "point": "#111111",
    "text_white": "white",
    "text_black": "black",
    "text_default": "#111",
    "gold": "#F7D36B",
    "silver": "#E5E7EB",
    "bronze": "#E6B089",
    "leakage_impute": "#3B82A0",
    "failure_impute": "#E07B39",
}
HEATMAP_COLOR_SCHEME = "purplegreen"

# Model configuration: (url, org, zero_shot, model_type)
MODEL_CONFIG = {
    # Chronos Models
    "chronos_tiny": ("amazon/chronos-t5-tiny", "AWS", True, "DL"),
    "chronos_mini": ("amazon/chronos-t5-mini", "AWS", True, "DL"),
    "chronos_small": ("amazon/chronos-t5-small", "AWS", True, "DL"),
    "chronos_base": ("amazon/chronos-t5-base", "AWS", True, "DL"),
    "chronos_large": ("amazon/chronos-t5-large", "AWS", True, "DL"),
    "chronos_bolt_tiny": ("amazon/chronos-bolt-tiny", "AWS", True, "DL"),
    "chronos_bolt_mini": ("amazon/chronos-bolt-mini", "AWS", True, "DL"),
    "chronos_bolt_small": ("amazon/chronos-bolt-small", "AWS", True, "DL"),
    "chronos_bolt_base": ("amazon/chronos-bolt-base", "AWS", True, "DL"),
    "chronos-bolt": ("amazon/chronos-bolt-base", "AWS", True, "DL"),
    # Moirai Models
    "moirai_large": ("Salesforce/moirai-1.1-R-large", "Salesforce", True, "DL"),
    "moirai_base": ("Salesforce/moirai-1.1-R-base", "Salesforce", True, "DL"),
    "moirai_small": ("Salesforce/moirai-1.1-R-small", "Salesforce", True, "DL"),
    "moirai-2.0": ("Salesforce/moirai-2.0-R-small", "Salesforce", True, "DL"),
    # TimesFM Models
    "timesfm": ("google/timesfm-1.0-200m-pytorch", "Google", True, "DL"),
    "timesfm-2.0": ("google/timesfm-2.0-500m-pytorch", "Google", True, "DL"),
    "timesfm-2.5": ("google/timesfm-2.5-200m-pytorch", "Google", True, "DL"),
    # Toto Models
    "toto-1.0": ("Datadog/Toto-Open-Base-1.0", "Datadog", True, "DL"),
    # Other Models
    "tirex": ("NX-AI/TiRex", "NX-AI", True, "DL"),
    "tabpfn-ts": ("Prior-Labs/TabPFN-v2-reg", "Prior Labs", True, "DL"),
    "sundial-base": ("thuml/sundial-base-128m", "Tsinghua University", True, "DL"),
    "ttm-r2": ("ibm-granite/granite-timeseries-ttm-r2", "IBM", True, "DL"),
    # Task-specific models
    "stat. ensemble": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "autoarima": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "autotheta": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "autoets": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "seasonalnaive": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "seasonal naive": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "drift": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "naive": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
}


ALL_METRICS = {
    "SQL": (
        "SQL: Scaled Quantile Loss",
        "The [Scaled Quantile Loss (SQL)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.SQL) is a scale-invariant metric for evaluating probabilistic forecasts.",
    ),
    "MASE": (
        "MASE: Mean Absolute Scaled Error",
        "The [Mean Absolute Scaled Error (MASE)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.MASE) is a scale-invariant metric for evaluating point forecasts.",
    ),
    "WQL": (
        "WQL: Weighted Quantile Loss",
        "The [Weighted Quantile Loss (WQL)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.WQL) is a scale-dependent metric for evaluating probabilistic forecasts.",
    ),
    "WAPE": (
        "WAPE: Weighted Absolute Percentage Error",
        "The [Weighted Absolute Percentage Error (WAPE)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.WAPE) is a scale-dependent metric for evaluating point forecasts.",
    ),
}


def format_metric_name(metric_name: str):
    return ALL_METRICS[metric_name][0]


def get_metric_description(metric_name: str):
    return ALL_METRICS[metric_name][1]


def get_model_link(model_name):
    config = MODEL_CONFIG.get(model_name.lower())
    if not config or not config[0]:
        return ""
    url = config[0]
    return url if url.startswith("https:") else f"https://huggingface.co/{url}"


def get_model_organization(model_name):
    config = MODEL_CONFIG.get(model_name.lower())
    return config[1] if config else "—"


def get_zero_shot_status(model_name):
    config = MODEL_CONFIG.get(model_name.lower())
    return "✓" if config and config[2] else "×"


def get_model_type(model_name):
    config = MODEL_CONFIG.get(model_name.lower())
    return config[3] if config else "—"


def highlight_model_type_color(cell):
    config = MODEL_CONFIG.get(cell.lower())
    if config:
        color = COLORS["dl_text"] if config[3] == "DL" else COLORS["st_text"]
        return f"font-weight: bold; color: {color}"
    return "font-weight: bold"


def format_leaderboard(df: pd.DataFrame):
    df = df.copy()
    df["skill_score"] = df["skill_score"].round(1)
    df["win_rate"] = df["win_rate"].round(1)
    df["zero_shot"] = df["model_name"].apply(get_zero_shot_status)
    # Format leakage column: convert to int for all models, 0 for non-zero-shot
    df["training_corpus_overlap"] = df.apply(
        lambda row: int(round(row["training_corpus_overlap"] * 100)) if row["zero_shot"] == "✓" else 0, axis=1
    )
    df["link"] = df["model_name"].apply(get_model_link)
    df["org"] = df["model_name"].apply(get_model_organization)
    df = df[
        [
            "model_name",
            "win_rate",
            "skill_score",
            "median_inference_time_s",
            "training_corpus_overlap",
            "num_failures",
            "zero_shot",
            "org",
            "link",
        ]
    ]
    return (
        df.style.map(highlight_model_type_color, subset=["model_name"])
        .map(lambda x: "font-weight: bold", subset=["zero_shot"])
        .apply(lambda x: ["background-color: #f8f9fa" if i % 2 == 1 else "" for i in range(len(x))], axis=0)
    )


def construct_bar_chart(df: pd.DataFrame, col: str, metric_name: str):
    label = "Skill Score" if col == "skill_score" else "Win Rate"

    tooltip = [
        alt.Tooltip("model_name:N"),
        alt.Tooltip(f"{col}:Q", format=".2f"),
        alt.Tooltip(f"{col}_lower:Q", title="95% CI Lower", format=".2f"),
        alt.Tooltip(f"{col}_upper:Q", title="95% CI Upper", format=".2f"),
    ]

    base_encode = {"y": alt.Y("model_name:N", title="Forecasting Model", sort=None), "tooltip": tooltip}

    bars = (
        alt.Chart(df)
        .mark_bar(color=COLORS["bar_fill"], cornerRadius=4)
        .encode(x=alt.X(f"{col}:Q", title=f"{label} (%)", scale=alt.Scale(zero=False)), **base_encode)
    )

    error_bars = (
        alt.Chart(df)
        .mark_errorbar(ticks={"height": 5}, color=COLORS["error_bar"])
        .encode(
            y=alt.Y("model_name:N", title=None, sort=None),
            x=alt.X(f"{col}_lower:Q", title=f"{label} (%)"),
            x2=alt.X2(f"{col}_upper:Q"),
            tooltip=tooltip,
        )
    )

    points = (
        alt.Chart(df)
        .mark_point(filled=True, color=COLORS["point"])
        .encode(x=alt.X(f"{col}:Q", title=f"{label} (%)"), **base_encode)
    )

    return (
        (bars + error_bars + points)
        .properties(height=500, title=f"{label} ({metric_name}) with 95% CIs")
        .configure_title(fontSize=16)
    )


def construct_pairwise_chart(df: pd.DataFrame, col: str, metric_name: str):
    config = {
        "win_rate": ("Win Rate", [0, 100], 50, f"abs(datum.{col} - 50) > 30"),
        "skill_score": ("Skill Score", [-15, 15], 0, f"abs(datum.{col}) > 10"),
    }
    cbar_label, domain, domain_mid, text_condition = config[col]

    df = df.copy()
    for c in [col, f"{col}_lower", f"{col}_upper"]:
        df[c] *= 100

    model_order = df.groupby("model_1")[col].mean().sort_values(ascending=False).index.tolist()

    tooltip = [
        alt.Tooltip("model_1:N", title="Model 1"),
        alt.Tooltip("model_2:N", title="Model 2"),
        alt.Tooltip(f"{col}:Q", title=cbar_label.split(" ")[0], format=".1f"),
        alt.Tooltip(f"{col}_lower:Q", title="95% CI Lower", format=".1f"),
        alt.Tooltip(f"{col}_upper:Q", title="95% CI Upper", format=".1f"),
    ]

    base = alt.Chart(df).encode(
        x=alt.X("model_2:N", sort=model_order, title="Model 2", axis=alt.Axis(orient="top", labelAngle=-90)),
        y=alt.Y("model_1:N", sort=model_order, title="Model 1"),
    )

    heatmap = base.mark_rect().encode(
        color=alt.Color(
            f"{col}:Q",
            legend=alt.Legend(title=f"{cbar_label} (%)", direction="vertical", orient="right"),
            scale=alt.Scale(scheme=HEATMAP_COLOR_SCHEME, domain=domain, domainMid=domain_mid, clamp=True),
        ),
        tooltip=tooltip,
    )

    text_main = base.mark_text(dy=-8, fontSize=8, baseline="top", yOffset=5).encode(
        text=alt.Text(f"{col}:Q", format=".1f"),
        color=alt.condition(text_condition, alt.value(COLORS["text_white"]), alt.value(COLORS["text_black"])),
        tooltip=tooltip,
    )

    return (
        (heatmap + text_main)
        .properties(height=550, title={"text": f"Pairwise {cbar_label} ({metric_name}) with 95% CIs", "fontSize": 16})
        .configure_axis(labelFontSize=11, titleFontSize=13, titleFontWeight="bold")
        .resolve_scale(color="independent")
    )


def construct_pivot_table(
    summaries: pd.DataFrame, metric_name: str, baseline_model: str, leakage_imputation_model: str
) -> pd.io.formats.style.Styler:
    errors = fev.pivot_table(summaries=summaries, metric_column=metric_name, task_columns=["task_name"])
    train_overlap = (
        fev.pivot_table(summaries=summaries, metric_column="trained_on_this_dataset", task_columns=["task_name"])
        .fillna(False)
        .astype(bool)
    )

    is_imputed_baseline = errors.isna()
    is_leakage_imputed = train_overlap

    # Handle imputations
    errors = errors.mask(train_overlap, errors[leakage_imputation_model], axis=0)
    for col in errors.columns:
        if col != baseline_model:
            errors[col] = errors[col].fillna(errors[baseline_model])

    errors = errors[errors.rank(axis=1).mean().sort_values().index]
    errors.index.rename("Task name", inplace=True)

    def highlight_by_position(styler):
        rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}

        for row_idx in errors.index:
            row_ranks = errors.loc[row_idx].rank(method="min")
            for col_idx in errors.columns:
                rank = row_ranks[col_idx]
                style_parts = []

                # Rank background colors
                if rank <= 3:
                    style_parts.append(f"background-color: {rank_colors[rank]}")

                # Imputation text colors
                if is_leakage_imputed.loc[row_idx, col_idx]:
                    style_parts.append(f"color: {COLORS['leakage_impute']}")
                elif is_imputed_baseline.loc[row_idx, col_idx]:
                    style_parts.append(f"color: {COLORS['failure_impute']}")
                elif not style_parts or (len(style_parts) == 1 and "font-weight" in style_parts[0]):
                    style_parts.append(f"color: {COLORS['text_default']}")

                if style_parts:
                    styler = styler.map(
                        lambda x, s="; ".join(style_parts): s, subset=pd.IndexSlice[row_idx:row_idx, col_idx:col_idx]
                    )
        return styler

    return highlight_by_position(errors.style).format(precision=3)
```
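The imputation block in `construct_pivot_table` relies on two pandas idioms: `mask(..., axis=0)` overwrites leaky cells row-wise with the imputation model's column, and `fillna` backfills failed (missing) results from the baseline column. A toy frame makes the effect visible; the task and model names below are illustrative, not taken from the benchmark:

```python
import numpy as np
import pandas as pd

errors = pd.DataFrame(
    {"model_a": [0.7, np.nan], "chronos_bolt_base": [0.8, 0.9], "seasonal_naive": [1.0, 1.2]},
    index=["task_1", "task_2"],
)
# Suppose task_1 overlaps with model_a's training corpus -> impute from chronos_bolt_base
train_overlap = pd.DataFrame(False, index=errors.index, columns=errors.columns)
train_overlap.loc["task_1", "model_a"] = True

errors = errors.mask(train_overlap, errors["chronos_bolt_base"], axis=0)
for col in errors.columns:
    if col != "seasonal_naive":
        errors[col] = errors[col].fillna(errors["seasonal_naive"])  # failed tasks -> baseline

print(errors)
# model_a: task_1 becomes 0.8 (leakage-imputed), task_2 becomes 1.2 (failure-imputed)
```
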
summaries.csv
ADDED
The diff for this file is too large to render.