File size: 6,255 Bytes
41e170e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f8ba39
 
41e170e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60fddb6
 
 
 
b134f5e
 
 
 
 
 
 
41e170e
5f8ba39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41e170e
9eae046
 
 
 
3b4b828
41e170e
 
 
 
 
 
 
 
 
 
 
 
 
 
7e203f8
41e170e
 
 
 
 
7e203f8
41e170e
5f8ba39
41e170e
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""
Data service provider
"""
import json
from typing import List

import pandas as pd

from utils.cache_decorator import cache_df_with_custom_key, cache_dict_with_custom_key
from utils.http_utils import get

# Model metadata columns: DataEngine.filter_df selects exactly these, and
# DataEngine.jsons_to_df uses them as the header of its empty fallback frame.
COLUMNS = [
    "model_name",
    "embd_dtype",
    "embd_dim",
    "num_params",
    "max_tokens",
    "similarity",
    "query_instruct",
    "corpus_instruct",
]

# One type tag per entry of COLUMNS (both lists hold eight items).
# NOTE(review): presumably the display datatypes of a UI table, listed in
# COLUMNS order -- confirm against whatever consumes this list.
COLUMNS_TYPES = [
    "markdown",
    "str",
    "str",
    "number",
    "number",
    "str",
    "str",
    "str",
]

# Branch of the results repository whose raw files are downloaded.
BRANCH = "main"

# Base URL of the raw "results/" directory on that branch, and the three
# JSON documents read from it.
GIT_URL = f"https://raw.githubusercontent.com/embedding-benchmark/rteb/refs/heads/{BRANCH}/results/"
DATASET_URL = GIT_URL + "datasets.json"
MODEL_URL = GIT_URL + "models.json"
RESULT_URL = GIT_URL + "results.json"


class DataEngine:
    """
    Fetch the model, dataset and result JSON documents (MODEL_URL, DATASET_URL,
    RESULT_URL) and combine them into a single leaderboard DataFrame.
    """

    def __init__(self):
        self.df = self.init_dataframe()

    @staticmethod
    def _fetch_json(url):
        """
        GET ``url`` and return its decoded JSON body, or ``{}`` when the
        response status is not 200.
        """
        res = get(url)
        if res.status_code == 200:
            return res.json()
        return {}

    @staticmethod
    def _empty_leaderboard():
        """
        Build the column-only frame returned when there is nothing to show.
        """
        return pd.DataFrame(columns=COLUMNS + ["reference"])

    @property
    @cache_dict_with_custom_key("models")
    def models(self):
        """
        Get models data
        """
        return self._fetch_json(MODEL_URL)

    @property
    @cache_dict_with_custom_key("datasets")
    def datasets(self):
        """
        Get tasks data
        """
        return self._fetch_json(DATASET_URL)

    @property
    @cache_dict_with_custom_key("results")
    def results(self):
        """
        Get results data
        """
        return self._fetch_json(RESULT_URL)

    def init_dataframe(self):
        """
        Initialize DataFrame

        Returns a fixed one-row frame with the columns "hello" and "world".
        NOTE(review): this looks like placeholder data -- confirm whether
        ``self.df`` is still read anywhere.
        """
        d = {"hello": [123], "world": [456]}
        return pd.DataFrame(d)

    @cache_df_with_custom_key("json_result")
    def jsons_to_df(self):
        """
        Build the leaderboard frame from the results, datasets and models JSON.

        Steps:
          1. Flatten every entry of ``self.results`` into rows of
             (model_name, dataset_name, ndcg_at_10, embd_dim, embd_dtype), with
             ndcg_at_10 scaled by 100 and rounded to two decimals.
          2. Expand ``self.datasets`` into (group_name, dataset_name,
             leaderboard) rows.
          3. Load ``self.models``, fill missing ``num_params`` / ``vendor``
             values and duplicate every model that has an ``alias`` under
             that alias so results reported under either name can be joined.
          4. Average the scores per model and group (a dataset with no score
             counts as 0), and join those averages plus the per-dataset
             scores onto the model rows.

        Returns:
            One row per (model_name, embd_dim, embd_dtype) present in both the
            models and the results, holding the model columns, one
            "<Group> Average" column per group (plain "Average" for the
            group named "text") and one column per dataset.  When any input
            is empty, or the final join is empty, a frame with only the
            columns ``COLUMNS + ["reference"]`` is returned instead.
        """
        results_list = self.results
        datasets_list = self.datasets
        models_list = self.models

        # The properties return {} on a failed download.  pd.concat raises
        # ValueError on an empty list, so bail out with the same fallback
        # frame the end of this method uses for an empty join.
        if not results_list or not datasets_list or not models_list:
            return self._empty_leaderboard()

        key_cols = ["model_name", "embd_dim", "embd_dtype"]

        # --- 1. results: one row per (model, dataset) score ---------------
        result_frames = []
        for result_dict in results_list:
            frame = pd.DataFrame(result_dict["results"])
            frame["dataset_name"] = result_dict["dataset_name"]
            result_frames.append(frame)
        df_result = pd.concat(result_frames)
        if df_result.empty:
            # Every "results" list was empty: no score columns to select.
            return self._empty_leaderboard()

        # .copy() so the assignment below writes to an independent frame
        # rather than to a slice of the concatenated one.
        df_result = df_result[["model_name", "dataset_name", "ndcg_at_10", "embd_dim", "embd_dtype"]].copy()
        df_result["ndcg_at_10"] = (df_result["ndcg_at_10"] * 100).round(2)

        # --- 2. datasets: one row per (group, dataset) --------------------
        dataset_frames = []
        for item in datasets_list:
            dataset_names = item["datasets"]
            count = len(dataset_names)
            dataset_frames.append(
                pd.DataFrame(
                    {
                        "group_name": [item["name"]] * count,
                        "dataset_name": dataset_names,
                        "leaderboard": [item["leaderboard"]] * count,
                    }
                )
            )
        df_dataset = pd.concat(dataset_frames).drop_duplicates()

        # --- 3. models -----------------------------------------------------
        df_model = pd.DataFrame(models_list)

        # Replace None values in num_params with "Unknown"
        if 'num_params' in df_model.columns:
            df_model['num_params'] = df_model['num_params'].fillna("Unknown")

        # Replace missing, empty or whitespace-only vendor with "Open source"
        if 'vendor' in df_model.columns:
            vendor = df_model['vendor'].fillna("Open source")
            df_model['vendor'] = vendor.apply(
                lambda v: "Open source" if isinstance(v, str) and not v.strip() else v
            )

        # Add a copy of every aliased model under its alias so that results
        # keyed by the alias also find the model row in the joins below.
        df_model_extended = df_model
        if 'alias' in df_model.columns:
            alias_rows = df_model[df_model['alias'].notna()].copy()
            if not alias_rows.empty:
                alias_rows['model_name'] = alias_rows['alias']
                df_model_extended = pd.concat([df_model, alias_rows], ignore_index=True)

        # --- 4. per-group averages ----------------------------------------
        df = pd.merge(df_result, df_dataset, on=["dataset_name"], how="inner")

        # Pivot to a dense (model, group) x dataset grid so that a missing
        # score becomes 0, then go back to long form.  The grid also holds
        # datasets from other groups; the merge on (group_name, dataset_name)
        # drops those again.
        df = df.pivot(index=key_cols + ["group_name"], columns="dataset_name",
                      values=["ndcg_at_10"]).fillna(0).stack(level=1).reset_index()
        df = pd.merge(df, df_dataset, on=["group_name", "dataset_name"], how="inner")

        grouped_model = df.groupby(["model_name", "group_name", "embd_dim", "embd_dtype"]).agg({
            "ndcg_at_10": "mean",
        }).reset_index()

        pivot = grouped_model.pivot(index=key_cols, columns="group_name",
                                    values=["ndcg_at_10"]).round(2).fillna(0)

        # Rename columns: pivot.columns holds ("ndcg_at_10", group_name) pairs.
        pivot.columns = [
            "Average" if group == 'text' else f"{group.capitalize()} Average"
            for _, group in pivot.columns
        ]

        # Per-dataset scores, one column per dataset (0 where not evaluated).
        pivot_dataset = df_result.pivot(index=key_cols, columns="dataset_name",
                                        values="ndcg_at_10").fillna(0)

        df = pd.merge(df_model_extended, pivot, on=key_cols)
        df = pd.merge(df, pivot_dataset, on=key_cols)

        if df.empty:
            return self._empty_leaderboard()
        return df

    def filter_df(self, group_name: str):
        """
        Return the ``COLUMNS`` subset of the rows of ``jsons_to_df()`` whose
        ``group_name`` column equals ``group_name``.

        NOTE(review): the frame assembled in ``jsons_to_df`` has one column per
        group average and per dataset but no ``group_name`` column, so this
        lookup raises KeyError unless the cache decorator hands back a
        different frame -- confirm the intended filter.
        """
        df = self.jsons_to_df()

        return df.loc[df["group_name"] == group_name, COLUMNS]