File size: 11,346 Bytes
c041a1a
 
 
 
 
6bc3b5e
c041a1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff4a86a
c041a1a
 
 
 
 
 
 
3330400
c041a1a
 
 
 
 
3330400
 
 
50f726d
c041a1a
 
 
 
 
 
50f726d
 
 
 
 
 
 
 
 
 
 
 
c041a1a
 
 
3f074b2
 
 
c041a1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff4a86a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1635507
ff4a86a
c041a1a
 
 
80c20a7
192167f
c041a1a
50f726d
c041a1a
 
0a286b0
 
ec7611c
c041a1a
 
 
 
f9bfa06
c041a1a
 
 
 
 
 
 
ff4a86a
 
 
c041a1a
 
21dd092
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c041a1a
86d9b17
 
 
 
 
 
 
 
 
 
 
 
 
d273462
86d9b17
d273462
c041a1a
 
f9bfa06
 
 
ff4a86a
 
df0a6e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50f726d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9bfa06
 
c041a1a
 
 
 
 
 
 
f9bfa06
 
c041a1a
f9bfa06
 
 
 
50f726d
f9bfa06
 
 
3f074b2
 
50f726d
f9bfa06
 
 
 
3f074b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c041a1a
 
 
ff4a86a
 
 
1635507
c041a1a
f9bfa06
 
 
c041a1a
50f726d
c041a1a
ff4a86a
 
 
 
c041a1a
 
50f726d
c041a1a
f9bfa06
e55e9c5
f9bfa06
50f726d
f9bfa06
c041a1a
 
 
ff4a86a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>LLM Benchmark overview (update ongoing)</title>
    <style>
        /* Page-wide defaults: font, background colour, text colour and outer padding. */
        body {
            font-family: Arial, sans-serif;
            background-color: #fdf6fb;
            color: #333;
            margin: 0;
            padding: 20px;
        }
        /* Centred, pink page heading. */
        h1 {
            text-align: center;
            color: #d16ba5;
        }
        /* Wrapper around the table; scrolls horizontally when the content overflows. */
        .table-container {
            overflow-x: auto;
            margin-top: 20px;
            position: relative;
        }
        /* Full-width table with a fixed layout; the script sets explicit
           widths on header cells when a column is resized. */
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 0 auto;
            background-color: #fff;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
            table-layout: fixed;
        }
        /* Cells stay on a single line; overflowing text is cut off with an ellipsis. */
        th, td {
            padding: 10px;
            text-align: left;
            border: 1px solid #ddd;
            overflow: hidden;
            text-overflow: ellipsis;
            white-space: nowrap;
            position: relative;
        }
        /* Header cells: pink background, bold text. */
        th {
            background-color: #f7d9eb;
            color: #333;
            font-weight: bold;
        }
        /* Positioning context for the absolutely positioned drag handle below
           (th already gets position: relative from the "th, td" rule). */
        th.resizable {
            position: relative;
        }
        /* Invisible 5px strip on the right edge of a header cell; the script
           appends a .resizer element to each th and uses it as the drag handle. */
        th.resizable .resizer {
            position: absolute;
            top: 0;
            right: 0;
            width: 5px;
            height: 100%;
            cursor: col-resize;
            background-color: transparent;
        }
        /* The script adds .expandable to every body cell; clicking one opens the modal. */
        td.expandable {
            cursor: pointer;
        }
        /* Highlight the second column of every body row. */
        td:nth-child(2) {
            background-color: #fcebf7;
        }
        /* Container for the filter dropdown and the search box. */
        .filter {
            margin-bottom: 20px;
            text-align: center;
        }
        .filter label {
            font-size: 16px;
            margin-right: 10px;
            color: #d16ba5;
        }
        .filter select {
            padding: 5px;
            font-size: 16px;
            border: 1px solid #ccc;
            border-radius: 5px;
        }
        /* Lets a cell wrap its text. NOTE(review): no script visible in this
           file adds this class - confirm whether it is still needed. */
        .expanded {
            white-space: normal;
            background-color: #fcebf7;
        }
        /* Pop-up box centred in the viewport; scrolls when its content exceeds
           80% of the viewport in either direction. */
        .modal {
            position: fixed;
            top: 50%;
            left: 50%;
            transform: translate(-50%, -50%);
            background-color: #fff;
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
            padding: 20px;
            z-index: 1000;
            border-radius: 10px;
            max-width: 80%;
            max-height: 80%;
            overflow: auto;
        }
        /* Semi-transparent backdrop covering the whole viewport, stacked just
           below the modal (z-index 999 vs 1000). */
        .overlay {
            position: fixed;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            background: rgba(0, 0, 0, 0.5);
            z-index: 999;
            white-space: pre-wrap;
        }
    </style>
</head>
<body>
    <h1>LLM Benchmark overview (update ongoing) </h1>
    <div>As the development and evaluation of large language models (LLMs) continue to evolve, I have compiled an overview of the principal benchmarks commonly found in research papers. My goal is to create a clear and comprehensive resource that summarizes what is being tested in LLMs, with concrete examples, key metrics, and direct links to related papers and repositories. This page serves as a centralized matrix that will be continuously updated with insights from the papers I review in the future.</div>
    <!-- Controls above the table: a task filter (options are added by the
         script) and a free-text search over benchmark names. -->
    <div class="filter">
        <label for="metricFilter">Filter by Evaluated task:</label>
        <select id="metricFilter">
            <option value="">All</option>
        </select>
        <!-- The label's top margin provides the vertical gap that an empty
             heading used to fake; a placeholder alone is not an accessible name. -->
        <label for="searchInput" style="display: block; margin: 20px 0 5px;">Search benchmark names:</label>
        <!-- border-box keeps the padded, 100%-wide input inside its container. -->
        <input type="search" id="searchInput" placeholder="Search for benchmark names ..." style="box-sizing: border-box; margin-bottom: 10px; padding: 8px; width: 100%;">
    </div>
    <div class="table-container">
        <table id="csvTable">
            <thead>
                <!-- Headers will be dynamically added -->
            </thead>
            <tbody>
                <!-- Rows will be dynamically added here -->
            </tbody>
        </table>
    </div>

    <div class="overlay" id="overlay" style="display: none;"></div>
    <div class="modal" id="modal" style="display: none;"></div>

    <script>
        /**
         * Parse CSV text into a header record and data records.
         *
         * Quoted fields may contain commas and line breaks, and a doubled
         * quote (`""`) inside a quoted field is read as one literal `"`.
         * Every field is whitespace-trimmed (which also drops the `\r` of
         * CRLF line endings). Blank lines are skipped, and a trailing empty
         * field on the last line is preserved.
         *
         * @param {string} content Raw CSV text; null/undefined is treated as empty.
         * @returns {{headers: string[], rows: string[][]}} The first record as
         *     `headers` (an empty array when there is no content) and the
         *     remaining records as `rows`.
         */
        function parseCSV(content) {
            const text = content == null ? '' : String(content);
            const rows = [];
            let currentRow = [];
            let currentField = '';
            let insideQuotes = false;

            // Finish the field being accumulated and append it to the record.
            const endField = () => {
                currentRow.push(currentField.trim());
                currentField = '';
            };
            // Finish the current record; a line with no content at all is dropped.
            const endRow = () => {
                if (currentRow.length > 0 || currentField.trim() !== '') {
                    endField();
                    rows.push(currentRow);
                }
                currentRow = [];
                currentField = '';
            };

            for (let i = 0; i < text.length; i++) {
                const char = text[i];
                if (char === '"') {
                    if (insideQuotes && text[i + 1] === '"') {
                        // Escaped quote: keep one `"` and skip its twin.
                        currentField += '"';
                        i++;
                    } else {
                        insideQuotes = !insideQuotes;
                    }
                } else if (char === ',' && !insideQuotes) {
                    endField();
                } else if (char === '\n' && !insideQuotes) {
                    endRow();
                } else {
                    currentField += char;
                }
            }
            // Flush the last record when the text does not end with a newline.
            endRow();

            const headers = rows.length > 0 ? rows.shift() : [];
            return { headers, rows };
        }
        /**
         * Download a file from the `main` branch of a Hugging Face dataset
         * repository and parse it as CSV.
         *
         * @param {string} dataset Repository id, e.g. `owner/name`.
         * @param {string} filename Path of the file inside the repository.
         * @param {string} [token] Access token; when omitted or empty the
         *     request is sent without an Authorization header.
         * @returns {Promise<{headers: string[], rows: string[][]}>} Result of `parseCSV`.
         * @throws {Error} When the HTTP response status is not OK.
         */
        async function loadCSVFromHuggingFace(dataset, filename, token) {
            const url = `https://huggingface.co/datasets/${dataset}/resolve/main/${filename}`;

            // Only authenticate when a token was actually supplied: a literal
            // "Bearer undefined" is not a credential (presumably rejected by
            // the server even for public files - confirm).
            const headers = token ? { 'Authorization': `Bearer ${token}` } : {};
            const response = await fetch(url, { headers });

            if (!response.ok) {
                // statusText may be empty, so always include the numeric status.
                throw new Error(`Failed to fetch file: ${response.status} ${response.statusText}`.trim());
            }

            const content = await response.text();
            return parseCSV(content);
        }
        // Handles to the page elements shared by the functions in this script.
        const metricFilter = document.getElementById('metricFilter');
        const table = document.getElementById('csvTable');
        const tableHead = table.querySelector('thead');
        const tableBody = table.querySelector('tbody');
        const overlay = document.getElementById('overlay');
        const modal = document.getElementById('modal');

        // Live search: on every keystroke, show only the body rows whose
        // second cell contains the typed text (case-insensitive).
        document.getElementById('searchInput').addEventListener('input', (event) => {
            const needle = event.currentTarget.value.trim().toLowerCase();
            const bodyRows = document.getElementById('csvTable').querySelectorAll('tbody tr');

            for (const tr of bodyRows) {
                const cell = tr.cells[1];
                // Rows without a second cell keep their current visibility.
                if (!cell) continue;
                const matches = cell.textContent.trim().toLowerCase().includes(needle);
                tr.style.display = matches ? '' : 'none';
            }
        });
      
        /**
         * Give every header cell a drag handle so its column can be resized
         * with the mouse.
         *
         * A `.resizer` element is appended to each `th` in the document.
         * Dragging it changes the header cell's inline width, never below a
         * small minimum. Header cells that already have a handle are skipped,
         * so calling this more than once is harmless.
         */
        function makeResizable() {
            const MIN_COLUMN_WIDTH = 30; // px; keeps a column from collapsing

            document.querySelectorAll('th').forEach(th => {
                if (th.querySelector('.resizer')) return;

                const resizer = document.createElement('div');
                resizer.classList.add('resizer');
                th.appendChild(resizer);

                let startX;
                let startWidth;

                function resizeColumn(e) {
                    const newWidth = Math.max(MIN_COLUMN_WIDTH, startWidth + (e.pageX - startX));
                    th.style.width = `${newWidth}px`;
                }

                function stopResize() {
                    document.removeEventListener('mousemove', resizeColumn);
                    document.removeEventListener('mouseup', stopResize);
                }

                resizer.addEventListener('mousedown', (e) => {
                    // Stop the drag from starting a text selection.
                    e.preventDefault();
                    startX = e.pageX;
                    startWidth = th.offsetWidth;
                    // Listen on the document so the drag keeps working when
                    // the pointer leaves the narrow handle.
                    document.addEventListener('mousemove', resizeColumn);
                    document.addEventListener('mouseup', stopResize);
                });
            });
        }
        /**
         * Add one `<option>` to the filter dropdown for each distinct value
         * found in a given column, in order of first appearance.
         *
         * Missing or empty values are skipped: they would otherwise show up
         * as an "undefined" entry or as a blank entry whose value ('')
         * collides with the built-in "All" option.
         *
         * @param {string[][]} data Parsed data rows.
         * @param {number} headerIndex Index of the column to collect values from.
         */
        function populateFilterOptions(data, headerIndex) {
            const uniqueMetricTypes = new Set();
            data.forEach(row => {
                const type = row[headerIndex];
                if (type !== undefined && type !== null && type !== '') {
                    uniqueMetricTypes.add(type);
                }
            });

            uniqueMetricTypes.forEach(type => {
                const option = document.createElement('option');
                option.value = type;
                option.textContent = type;
                metricFilter.appendChild(option);
            });
        }
        /**
         * Return `value` as an absolute http(s) URL string, or null when it
         * is not one (for example a `javascript:` URL or unparsable text).
         */
        function toSafeHttpUrl(value) {
            try {
                const parsed = new URL(value, window.location.href);
                const isHttp = parsed.protocol === 'http:' || parsed.protocol === 'https:';
                return isHttp ? parsed.href : null;
            } catch (error) {
                return null;
            }
        }

        /**
         * Rebuild the table header and body from parsed CSV data.
         *
         * Rows are optionally restricted to those whose `headerIndex` column
         * equals `filterValue`, then sorted by their first column. Values in
         * the "Paper" and "HF or Git link" columns are rendered as links when
         * they are http(s) URLs; anything else is shown as plain text.
         * Clicking a cell opens the modal with the cell's full raw value.
         * After rendering, column resizers are attached and any text already
         * typed in the search box is applied again.
         *
         * @param {string[]} headers Column titles.
         * @param {string[][]} rows Data rows (not modified).
         * @param {string} filterValue Value to keep; falsy keeps all rows.
         * @param {number} headerIndex Column that `filterValue` is compared against.
         */
        function populateTable(headers, rows, filterValue, headerIndex) {
            // Columns rendered as links, mapped to the link text to show.
            const linkLabels = new Map([
                ['Paper', 'paper link'],
                ['HF or Git link', 'dataset link'],
            ]);

            tableHead.innerHTML = '';
            tableBody.innerHTML = '';

            const headerRow = document.createElement('tr');
            headers.forEach(header => {
                const th = document.createElement('th');
                th.textContent = header;
                th.scope = 'col';
                th.classList.add('resizable');
                headerRow.appendChild(th);
            });
            tableHead.appendChild(headerRow);

            rows
                .filter(row => !filterValue || row[headerIndex] === filterValue)
                // String(... || '') keeps the sort from throwing on a row
                // that has no first field.
                .sort((a, b) => String(a[0] || '').localeCompare(String(b[0] || '')))
                .forEach(row => {
                    const tr = document.createElement('tr');
                    row.forEach((value, index) => {
                        const td = document.createElement('td');
                        const label = linkLabels.get(headers[index]);
                        // The CSV is remote data: only let http(s) URLs
                        // become clickable hrefs.
                        const safeUrl = label && value ? toSafeHttpUrl(value) : null;
                        if (safeUrl) {
                            const link = document.createElement('a');
                            link.href = safeUrl;
                            link.textContent = label;
                            link.target = '_blank';
                            link.rel = 'noopener noreferrer';
                            // Following the link should not also open the modal.
                            link.addEventListener('click', (event) => event.stopPropagation());
                            td.appendChild(link);
                        } else {
                            td.textContent = value;
                        }
                        td.classList.add('expandable');
                        td.title = 'Click to expand';
                        td.addEventListener('click', () => {
                            overlay.style.display = 'block';
                            modal.style.display = 'block';
                            modal.textContent = value;
                            modal.style.whiteSpace = 'pre-wrap';
                        });
                        tr.appendChild(td);
                    });
                    tableBody.appendChild(tr);
                });

            makeResizable();

            // The rows were just recreated, so re-run the search listener to
            // hide rows that do not match the text currently in the search box.
            const searchInput = document.getElementById('searchInput');
            if (searchInput && searchInput.value) {
                searchInput.dispatchEvent(new Event('input'));
            }
        }
        // Hide the modal and its backdrop.
        const hideModal = () => {
            overlay.style.display = 'none';
            modal.style.display = 'none';
        };
        // Close on a click on the backdrop, and on Escape so the modal can
        // also be dismissed from the keyboard.
        overlay.addEventListener('click', hideModal);
        document.addEventListener('keydown', (event) => {
            if (event.key === 'Escape') hideModal();
        });

        // Re-render the table whenever a different task is selected.
        metricFilter.addEventListener('change', () => {
            // The dropdown is usable before the CSV has finished loading.
            if (!parsedCSV) return;
            populateTable(parsedCSV.headers, parsedCSV.rows, metricFilter.value, 0);
        });

        // Parsed CSV kept so the filter can re-render without refetching.
        let parsedCSV;

        // `window.huggingface` may be absent (e.g. when the page is opened
        // outside a Hugging Face Space), so read the token defensively.
        // NOTE(review): a token read from page variables is visible to every
        // visitor - confirm it is read-only and narrowly scoped, or make the
        // dataset public and drop it.
        const spaceVariables = (window.huggingface && window.huggingface.variables) || {};

        loadCSVFromHuggingFace('UlrickBL/benchmark_overview', 'benchmark_overview.csv', spaceVariables.HF_TOKEN)
            .then(({ headers, rows }) => {
                parsedCSV = { headers, rows };
                populateFilterOptions(rows, 0);
                populateTable(headers, rows, '', 0);
            })
            .catch((error) => {
                // Show the failure in the page instead of leaving an empty
                // table and an unhandled promise rejection.
                console.error(error);
                tableBody.innerHTML = '';
                const tr = document.createElement('tr');
                const td = document.createElement('td');
                td.style.whiteSpace = 'normal';
                td.textContent = `Could not load the benchmark data: ${error.message}`;
                tr.appendChild(td);
                tableBody.appendChild(tr);
            });
    </script>
</body>
</html>