Spaces:
Running
Running
Update index.html
Browse files- index.html +105 -90
index.html
CHANGED
@@ -6,130 +6,129 @@
|
|
6 |
<title>LLM Benchmark Overview</title>
|
7 |
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600&display=swap" rel="stylesheet">
|
8 |
<style>
|
9 |
-
|
10 |
body {
|
11 |
font-family: 'Inter', sans-serif;
|
12 |
-
background-color: #1a1a2e;
|
13 |
-
color: #e0e0e0;
|
14 |
margin: 0;
|
15 |
padding: 20px;
|
16 |
line-height: 1.6;
|
17 |
-
font-size: 14px;
|
18 |
}
|
19 |
|
20 |
-
|
21 |
h1 {
|
22 |
text-align: center;
|
23 |
-
color: #a766ff;
|
24 |
margin-bottom: 30px;
|
25 |
font-weight: 600;
|
26 |
font-size: 2.2em;
|
27 |
text-shadow: 0 0 10px rgba(167, 102, 255, 0.4);
|
28 |
}
|
29 |
|
30 |
-
|
31 |
-
body > div:nth-of-type(1) {
|
32 |
max-width: 900px;
|
33 |
margin: 0 auto 30px auto;
|
34 |
text-align: justify;
|
35 |
-
background-color: #2a2a4a;
|
36 |
padding: 20px;
|
37 |
border-radius: 12px;
|
38 |
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
|
39 |
font-size: 0.95em;
|
40 |
}
|
41 |
|
42 |
-
|
43 |
.table-container {
|
44 |
overflow-x: auto;
|
45 |
margin-top: 20px;
|
46 |
position: relative;
|
47 |
border-radius: 12px;
|
48 |
-
box-shadow: 0 8px 20px rgba(0, 0, 0, 0.5);
|
49 |
}
|
50 |
|
51 |
-
|
52 |
table {
|
53 |
-
width: 100
|
54 |
border-collapse: collapse;
|
55 |
margin: 0 auto;
|
56 |
-
background-color: #2a2a4a;
|
57 |
border-radius: 12px;
|
58 |
-
overflow: hidden;
|
59 |
-
min-width:
|
60 |
-
table-layout: fixed;
|
61 |
}
|
62 |
|
63 |
-
|
64 |
th, td {
|
65 |
-
padding:
|
66 |
text-align: left;
|
67 |
-
border: 1px solid #3a3a5a;
|
68 |
-
font-size: 0.9em;
|
69 |
-
vertical-align: top;
|
70 |
-
|
|
|
71 |
}
|
72 |
|
73 |
-
|
74 |
th {
|
75 |
-
background-color: #3a3a5a;
|
76 |
-
color: #c0c0c0;
|
77 |
font-weight: 600;
|
78 |
position: relative;
|
79 |
-
white-space: normal
|
80 |
-
word-wrap: break-word;
|
81 |
}
|
82 |
|
83 |
-
|
84 |
th.resizable .resizer {
|
85 |
position: absolute;
|
86 |
top: 0;
|
87 |
right: 0;
|
88 |
-
width: 8px;
|
89 |
height: 100%;
|
90 |
cursor: col-resize;
|
91 |
-
background-color: rgba(167, 102, 255, 0.2);
|
92 |
transition: background-color 0.2s ease-in-out;
|
93 |
}
|
94 |
|
95 |
th.resizable .resizer:hover {
|
96 |
-
background-color: rgba(167, 102, 255, 0.5);
|
97 |
}
|
98 |
|
99 |
-
|
100 |
tr:nth-child(even) {
|
101 |
-
background-color: #2f2f50;
|
102 |
}
|
103 |
|
104 |
-
|
105 |
-
|
106 |
-
th:nth-child(1), td:nth-child(1) { width: 15%; min-width: 120px; }
|
107 |
-
th:nth-child(2), td:nth-child(2) { width: 15%; min-width: 120px; }
|
108 |
-
th:nth-child(3), td:nth-child(3) { width: 20%; min-width: 150px; }
|
109 |
-
th:nth-child(4), td:nth-child(4) { width: 25%; min-width: 200px; }
|
110 |
-
th:nth-child(5), td:nth-child(5) { width: 25%; min-width: 200px; }
|
111 |
-
th:nth-child(6), td:nth-child(6) { width: 10%; min-width: 80px; }
|
112 |
-
th:nth-child(7), td:nth-child(7) { width: 10%; min-width: 80px; }
|
113 |
|
114 |
|
115 |
-
|
116 |
.cell-content {
|
117 |
-
cursor: pointer;
|
118 |
overflow: hidden;
|
119 |
text-overflow: ellipsis;
|
120 |
display: -webkit-box;
|
121 |
-
-webkit-line-clamp: 4;
|
122 |
-webkit-box-orient: vertical;
|
123 |
-
white-space: normal
|
124 |
-
word-wrap: break-word;
|
125 |
}
|
126 |
|
127 |
-
|
128 |
td:hover {
|
129 |
-
background-color: #3a3a5a;
|
130 |
}
|
131 |
|
132 |
-
|
133 |
.filter {
|
134 |
margin-bottom: 25px;
|
135 |
text-align: center;
|
@@ -142,16 +141,16 @@
|
|
142 |
.filter label {
|
143 |
font-size: 1em;
|
144 |
margin-right: 5px;
|
145 |
-
color: #a766ff;
|
146 |
font-weight: 600;
|
147 |
}
|
148 |
.filter select, .filter input[type="text"] {
|
149 |
padding: 8px 12px;
|
150 |
font-size: 0.95em;
|
151 |
-
border: 1px solid #5a5a7a;
|
152 |
border-radius: 8px;
|
153 |
-
background-color: #3a3a5a;
|
154 |
-
color: #e0e0e0;
|
155 |
outline: none;
|
156 |
transition: border-color 0.2s ease-in-out, box-shadow 0.2s ease-in-out;
|
157 |
}
|
@@ -160,27 +159,27 @@
|
|
160 |
box-shadow: 0 0 8px rgba(167, 102, 255, 0.5);
|
161 |
}
|
162 |
.filter input[type="text"] {
|
163 |
-
flex-grow: 1;
|
164 |
max-width: 400px;
|
165 |
}
|
166 |
|
167 |
-
|
168 |
.modal {
|
169 |
position: fixed;
|
170 |
top: 50%;
|
171 |
left: 50%;
|
172 |
transform: translate(-50%, -50%);
|
173 |
-
background-color: #2a2a4a;
|
174 |
-
box-shadow: 0 8px 20px rgba(0, 0, 0, 0.6);
|
175 |
padding: 30px;
|
176 |
z-index: 1000;
|
177 |
border-radius: 12px;
|
178 |
max-width: 90%;
|
179 |
max-height: 90%;
|
180 |
overflow: auto;
|
181 |
-
color: #e0e0e0;
|
182 |
font-size: 1em;
|
183 |
-
white-space: pre-wrap;
|
184 |
}
|
185 |
.overlay {
|
186 |
position: fixed;
|
@@ -188,24 +187,24 @@
|
|
188 |
left: 0;
|
189 |
width: 100%;
|
190 |
height: 100%;
|
191 |
-
background: rgba(0, 0, 0, 0.8);
|
192 |
z-index: 999;
|
193 |
}
|
194 |
|
195 |
-
|
196 |
a {
|
197 |
-
color: #a766ff;
|
198 |
text-decoration: none;
|
199 |
transition: color 0.2s ease-in-out;
|
200 |
}
|
201 |
a:hover {
|
202 |
-
color: #c08cff;
|
203 |
text-decoration: underline;
|
204 |
}
|
205 |
</style>
|
206 |
</head>
|
207 |
<body>
|
208 |
-
<h1>LLM Benchmark Overview</h1>
|
209 |
<div>As the development and evaluation of large language models (LLMs) continue to evolve, I conducted an overview of the principal benchmarks commonly found in research papers. My goal is to create a clear and comprehensive resource that summarizes what is being tested in LLMs, with concrete examples, key metrics, and direct links to related papers and repositories. This document serves as a centralized matrix that will be continuously updated with insights from future papers I review.</div>
|
210 |
<div class="filter">
|
211 |
<label for="metricFilter">Filter by Evaluated task:</label>
|
@@ -229,7 +228,8 @@
|
|
229 |
<div class="modal" id="modal" style="display: none;"></div>
|
230 |
|
231 |
<script>
|
232 |
-
|
|
|
233 |
const rows = [];
|
234 |
let currentRow = [];
|
235 |
let currentField = '';
|
@@ -250,27 +250,35 @@
|
|
250 |
currentField += char;
|
251 |
}
|
252 |
}
|
|
|
253 |
if (currentField) currentRow.push(currentField.trim());
|
254 |
if (currentRow.length > 0) rows.push(currentRow);
|
255 |
-
|
|
|
256 |
return { headers, rows };
|
257 |
}
|
|
|
|
|
258 |
async function loadCSVFromHuggingFace(dataset, filename, token) {
|
259 |
-
const url =
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
}
|
270 |
-
|
271 |
-
const content = await response.text();
|
272 |
-
|
273 |
-
return parseCSV(content);
|
274 |
}
|
275 |
|
276 |
const metricFilter = document.getElementById('metricFilter');
|
@@ -427,24 +435,31 @@
|
|
427 |
});
|
428 |
makeResizable(); // Re-apply resizable functionality after table population
|
429 |
}
|
430 |
-
|
|
|
431 |
overlay.addEventListener('click', () => {
|
432 |
overlay.style.display = 'none';
|
433 |
modal.style.display = 'none';
|
434 |
});
|
435 |
|
|
|
436 |
metricFilter.addEventListener('change', () => {
|
437 |
const filterValue = metricFilter.value;
|
438 |
populateTable(parsedCSV.headers, parsedCSV.rows, filterValue, 0); // Re-populate table with new filter
|
439 |
});
|
440 |
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
|
|
|
|
|
|
|
|
|
|
448 |
</script>
|
449 |
</body>
|
450 |
</html>
|
|
|
6 |
<title>LLM Benchmark Overview</title>
|
7 |
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600&display=swap" rel="stylesheet">
|
8 |
<style>
|
9 |
+
/* General Body and Font Styles */
|
10 |
body {
|
11 |
font-family: 'Inter', sans-serif;
|
12 |
+
background-color: #1a1a2e; /* Dark background */
|
13 |
+
color: #e0e0e0; /* Light text */
|
14 |
margin: 0;
|
15 |
padding: 20px;
|
16 |
line-height: 1.6;
|
17 |
+
font-size: 14px; /* Reduced base font size */
|
18 |
}
|
19 |
|
20 |
+
/* Header Styling */
|
21 |
h1 {
|
22 |
text-align: center;
|
23 |
+
color: #a766ff; /* Neo purple */
|
24 |
margin-bottom: 30px;
|
25 |
font-weight: 600;
|
26 |
font-size: 2.2em;
|
27 |
text-shadow: 0 0 10px rgba(167, 102, 255, 0.4);
|
28 |
}
|
29 |
|
30 |
+
/* Introduction Text */
|
31 |
+
body > div:nth-of-type(1) { /* Targeting the intro div */
|
32 |
max-width: 900px;
|
33 |
margin: 0 auto 30px auto;
|
34 |
text-align: justify;
|
35 |
+
background-color: #2a2a4a; /* Slightly lighter dark background */
|
36 |
padding: 20px;
|
37 |
border-radius: 12px;
|
38 |
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
|
39 |
font-size: 0.95em;
|
40 |
}
|
41 |
|
42 |
+
/* Table Container and Shadow */
|
43 |
.table-container {
|
44 |
overflow-x: auto;
|
45 |
margin-top: 20px;
|
46 |
position: relative;
|
47 |
border-radius: 12px;
|
48 |
+
box-shadow: 0 8px 20px rgba(0, 0, 0, 0.5); /* Stronger shadow */
|
49 |
}
|
50 |
|
51 |
+
/* Table Styling */
|
52 |
table {
|
53 |
+
width: auto; /* Changed from 100% to auto to allow min-width to force overflow */
|
54 |
border-collapse: collapse;
|
55 |
margin: 0 auto;
|
56 |
+
background-color: #2a2a4a; /* Darker table background */
|
57 |
border-radius: 12px;
|
58 |
+
overflow: hidden; /* Ensures rounded corners apply */
|
59 |
+
min-width: 950px; /* Ensure a minimum width for the table itself (sum of column min-widths) */
|
60 |
+
table-layout: fixed; /* Keep fixed layout for column width control */
|
61 |
}
|
62 |
|
63 |
+
/* Table Headers and Cells */
|
64 |
th, td {
|
65 |
+
padding: 10px 15px; /* Reduced vertical padding from 12px to 10px */
|
66 |
text-align: left;
|
67 |
+
border: 1px solid #3a3a5a; /* Darker border */
|
68 |
+
font-size: 0.9em; /* Smaller font for table content */
|
69 |
+
vertical-align: top; /* Align content to top */
|
70 |
+
white-space: normal; /* Ensure cells allow content to wrap */
|
71 |
+
word-wrap: break-word; /* Ensure long words break within cells */
|
72 |
}
|
73 |
|
74 |
+
/* Table Header Specifics */
|
75 |
th {
|
76 |
+
background-color: #3a3a5a; /* Dark header background */
|
77 |
+
color: #c0c0c0; /* Lighter header text */
|
78 |
font-weight: 600;
|
79 |
position: relative;
|
80 |
+
/* white-space: normal and word-wrap: break-word are now in th, td general rule */
|
|
|
81 |
}
|
82 |
|
83 |
+
/* Resizable Column Handler */
|
84 |
th.resizable .resizer {
|
85 |
position: absolute;
|
86 |
top: 0;
|
87 |
right: 0;
|
88 |
+
width: 8px; /* Wider resizer for easier grabbing */
|
89 |
height: 100%;
|
90 |
cursor: col-resize;
|
91 |
+
background-color: rgba(167, 102, 255, 0.2); /* Semi-transparent purple */
|
92 |
transition: background-color 0.2s ease-in-out;
|
93 |
}
|
94 |
|
95 |
th.resizable .resizer:hover {
|
96 |
+
background-color: rgba(167, 102, 255, 0.5); /* More visible on hover */
|
97 |
}
|
98 |
|
99 |
+
/* Alternating Row Colors */
|
100 |
tr:nth-child(even) {
|
101 |
+
background-color: #2f2f50; /* Slightly different shade for even rows */
|
102 |
}
|
103 |
|
104 |
+
/* Specific Column Styling for wider columns */
|
105 |
+
/* Adjusted widths for better display */
|
106 |
+
th:nth-child(1), td:nth-child(1) { width: 15%; min-width: 120px; } /* Evaluated task */
|
107 |
+
th:nth-child(2), td:nth-child(2) { width: 15%; min-width: 120px; } /* Benchmark Name */
|
108 |
+
th:nth-child(3), td:nth-child(3) { width: 20%; min-width: 150px; } /* Metric often used */
|
109 |
+
th:nth-child(4), td:nth-child(4) { width: 25%; min-width: 200px; } /* Question + context example */
|
110 |
+
th:nth-child(5), td:nth-child(5) { width: 25%; min-width: 200px; } /* Answer example */
|
111 |
+
th:nth-child(6), td:nth-child(6) { width: 10%; min-width: 80px; } /* Paper */
|
112 |
+
th:nth-child(7), td:nth-child(7) { width: 10%; min-width: 80px; } /* HF or Git link */
|
113 |
|
114 |
|
115 |
+
/* Inner div for truncated content */
|
116 |
.cell-content {
|
117 |
+
cursor: pointer; /* Keep cursor pointer for expandability */
|
118 |
overflow: hidden;
|
119 |
text-overflow: ellipsis;
|
120 |
display: -webkit-box;
|
121 |
+
-webkit-line-clamp: 4; /* Limit to 4 lines */
|
122 |
-webkit-box-orient: vertical;
|
123 |
+
/* white-space: normal and word-wrap: break-word are now in th, td general rule */
|
|
|
124 |
}
|
125 |
|
126 |
+
/* Hover effect on the cell, not the inner content */
|
127 |
td:hover {
|
128 |
+
background-color: #3a3a5a; /* Highlight on hover */
|
129 |
}
|
130 |
|
131 |
+
/* Filter and Search Bar Styling */
|
132 |
.filter {
|
133 |
margin-bottom: 25px;
|
134 |
text-align: center;
|
|
|
141 |
.filter label {
|
142 |
font-size: 1em;
|
143 |
margin-right: 5px;
|
144 |
+
color: #a766ff; /* Neo purple */
|
145 |
font-weight: 600;
|
146 |
}
|
147 |
.filter select, .filter input[type="text"] {
|
148 |
padding: 8px 12px;
|
149 |
font-size: 0.95em;
|
150 |
+
border: 1px solid #5a5a7a; /* Darker border */
|
151 |
border-radius: 8px;
|
152 |
+
background-color: #3a3a5a; /* Dark input background */
|
153 |
+
color: #e0e0e0; /* Light input text */
|
154 |
outline: none;
|
155 |
transition: border-color 0.2s ease-in-out, box-shadow 0.2s ease-in-out;
|
156 |
}
|
|
|
159 |
box-shadow: 0 0 8px rgba(167, 102, 255, 0.5);
|
160 |
}
|
161 |
.filter input[type="text"] {
|
162 |
+
flex-grow: 1; /* Allow search input to grow */
|
163 |
max-width: 400px;
|
164 |
}
|
165 |
|
166 |
+
/* Modal and Overlay Styling */
|
167 |
.modal {
|
168 |
position: fixed;
|
169 |
top: 50%;
|
170 |
left: 50%;
|
171 |
transform: translate(-50%, -50%);
|
172 |
+
background-color: #2a2a4a; /* Dark modal background */
|
173 |
+
box-shadow: 0 8px 20px rgba(0, 0, 0, 0.6); /* Stronger shadow */
|
174 |
padding: 30px;
|
175 |
z-index: 1000;
|
176 |
border-radius: 12px;
|
177 |
max-width: 90%;
|
178 |
max-height: 90%;
|
179 |
overflow: auto;
|
180 |
+
color: #e0e0e0; /* Light text */
|
181 |
font-size: 1em;
|
182 |
+
white-space: pre-wrap; /* Preserve formatting for modal content */
|
183 |
}
|
184 |
.overlay {
|
185 |
position: fixed;
|
|
|
187 |
left: 0;
|
188 |
width: 100%;
|
189 |
height: 100%;
|
190 |
+
background: rgba(0, 0, 0, 0.8); /* Darker overlay */
|
191 |
z-index: 999;
|
192 |
}
|
193 |
|
194 |
+
/* Link Styling */
|
195 |
a {
|
196 |
+
color: #a766ff; /* Neo purple for links */
|
197 |
text-decoration: none;
|
198 |
transition: color 0.2s ease-in-out;
|
199 |
}
|
200 |
a:hover {
|
201 |
+
color: #c08cff; /* Lighter purple on hover */
|
202 |
text-decoration: underline;
|
203 |
}
|
204 |
</style>
|
205 |
</head>
|
206 |
<body>
|
207 |
+
<h1>LLM Benchmark Overview (Update Ongoing)</h1>
|
208 |
<div>As the development and evaluation of large language models (LLMs) continue to evolve, I conducted an overview of the principal benchmarks commonly found in research papers. My goal is to create a clear and comprehensive resource that summarizes what is being tested in LLMs, with concrete examples, key metrics, and direct links to related papers and repositories. This document serves as a centralized matrix that will be continuously updated with insights from future papers I review.</div>
|
209 |
<div class="filter">
|
210 |
<label for="metricFilter">Filter by Evaluated task:</label>
|
|
|
228 |
<div class="modal" id="modal" style="display: none;"></div>
|
229 |
|
230 |
<script>
|
231 |
+
// Custom CSV parser to handle quoted fields with commas/newlines
|
232 |
+
function parseCSV(content) {
|
233 |
const rows = [];
|
234 |
let currentRow = [];
|
235 |
let currentField = '';
|
|
|
250 |
currentField += char;
|
251 |
}
|
252 |
}
|
253 |
+
// Add the last field and row if any
|
254 |
if (currentField) currentRow.push(currentField.trim());
|
255 |
if (currentRow.length > 0) rows.push(currentRow);
|
256 |
+
|
257 |
+
const headers = rows.shift(); // First row is headers
|
258 |
return { headers, rows };
|
259 |
}
|
260 |
+
|
261 |
+
// Function to load a CSV file from a Hugging Face dataset repository and parse it
|
262 |
/**
 * Download a CSV file from a Hugging Face dataset repository and parse it.
 *
 * The file is requested from the dataset's `resolve/main` URL and the response
 * text is passed to `parseCSV`; that function's result is returned unchanged.
 *
 * @param {string} dataset - Dataset repository id, interpolated into the URL as-is.
 * @param {string} filename - Path of the CSV file inside the repository.
 * @param {string} [token] - Accepted for call-site compatibility; it is currently
 *     not sent with the request (no Authorization header is set).
 * @returns {Promise<{headers: *, rows: *}>} The object produced by `parseCSV`
 *     for the downloaded text.
 * @throws {Error} If the request itself fails or the response status is not OK.
 *     Failures are propagated instead of being replaced by an empty result, so
 *     a caller's rejection handler can actually report the problem to the user.
 */
async function loadCSVFromHuggingFace(dataset, filename, token) {
    const url = `https://huggingface.co/datasets/${dataset}/resolve/main/${filename}`;
    const response = await fetch(url);

    if (!response.ok) {
        throw new Error(`Failed to fetch file: ${response.statusText}`);
    }

    return parseCSV(await response.text());
}
|
283 |
|
284 |
const metricFilter = document.getElementById('metricFilter');
|
|
|
435 |
});
|
436 |
makeResizable(); // Re-apply resizable functionality after table population
|
437 |
}
|
438 |
+
|
439 |
+
// Close modal on overlay click
|
440 |
overlay.addEventListener('click', () => {
    // Hide both the backdrop and the modal dialog.
    [overlay, modal].forEach((layer) => {
        layer.style.display = 'none';
    });
});
|
444 |
|
445 |
+
// Filter change listener
|
446 |
metricFilter.addEventListener('change', () => {
    // Redraw the table from the cached CSV data using the newly selected filter value.
    const { headers, rows } = parsedCSV;
    populateTable(headers, rows, metricFilter.value, 0);
});
|
450 |
|
451 |
+
|
452 |
+
// Initial load: fetch the benchmark CSV, then fill the filter options and the table.
// The token is looked up defensively: reading `window.huggingface.variables.HF_TOKEN`
// directly throws a TypeError when `window.huggingface` is absent, and because that
// happens while the arguments are evaluated, the `.catch` below would never run.
const hfSpaceVariables = (window.huggingface && window.huggingface.variables) || {};

loadCSVFromHuggingFace('UlrickBL/benchmark_overview', 'benchmark_overview.csv', hfSpaceVariables.HF_TOKEN)
    .then(({ headers, rows }) => {
        // Cache the parsed data so later filter changes can redraw without refetching.
        parsedCSV = { headers, rows };
        populateFilterOptions(rows, 0);
        populateTable(headers, rows, '', 0);
    })
    .catch(error => {
        console.error("Failed to load CSV data:", error);
        // Display a user-friendly message if data loading fails
        tableBody.innerHTML = '<tr><td colspan="7" style="text-align: center; color: #ff6b6b;">Failed to load data. Please check the dataset link or your internet connection.</td></tr>';
    });
|
463 |
</script>
|
464 |
</body>
|
465 |
</html>
|