New results files.

Signed-off-by: Jonathan Bnayahu <[email protected]>
- results/bluebench/2025-06-16T17-40-01_evaluation_results.json +0 -580
- results/bluebench/{2025-06-16T11-59-29_evaluation_results.json → 2025-06-19T11-21-54_evaluation_results.json} +784 -803
- results/bluebench/2025-06-19T15-57-45_evaluation_results.json +1283 -0
- results/bluebench/2025-06-19T16-09-06_evaluation_results.json +1283 -0
- results/bluebench/2025-06-19T16-21-09_evaluation_results.json +1283 -0
- results/bluebench/2025-06-19T17-18-35_evaluation_results.json +1283 -0
- results/bluebench/2025-06-19T18-10-05_evaluation_results.json +1283 -0
- results/bluebench/2025-06-19T20-10-50_evaluation_results.json +1283 -0
- results/bluebench/2025-06-19T21-59-04_evaluation_results.json +1283 -0
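For reference, each results file records the exact run configuration in its environment_info block. Reconstructed from the removed 2025-06-16 run shown below (the 2025-06-19 runs differ mainly in the executable path and model_name, e.g. watsonx/ibm/granite-3-3-8b-instruct), the recorded invocation is roughly:

unitxt-evaluate --tasks benchmarks.bluebench --model cross_provider \
  --model_args "model_name=watsonx/meta-llama/llama-3-3-70b-instruct,max_tokens=256" \
  --output_path ./results/bluebench --log_samples --trust_remote_code \
  --batch_size 8 --verbosity ERROR --limit 100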
results/bluebench/2025-06-16T17-40-01_evaluation_results.json
DELETED
@@ -1,580 +0,0 @@
{
  "environment_info": {
    "timestamp_utc": "2025-06-16T14:40:01.560857Z",
    "command_line_invocation": [
      "/home/bnayahu/miniforge3/envs/unitxt/bin/unitxt-evaluate",
      "--tasks",
      "benchmarks.bluebench",
      "--model",
      "cross_provider",
      "--model_args",
      "model_name=watsonx/meta-llama/llama-3-3-70b-instruct,max_tokens=256",
      "--output_path",
      "./results/bluebench",
      "--log_samples",
      "--trust_remote_code",
      "--batch_size",
      "8",
      "--verbosity",
      "ERROR",
      "--limit",
      "100"
    ],
    "parsed_arguments": {
      "tasks": [
        "benchmarks.bluebench"
      ],
      "split": "test",
      "num_fewshots": null,
      "limit": 100,
      "batch_size": 8,
      "model": "watsonx/meta-llama/llama-3-3-70b-instruct",
      "model_args": {
        "max_tokens": 256
      },
      "gen_kwargs": null,
      "chat_template_kwargs": null,
      "output_path": "./results/bluebench",
      "output_file_prefix": "evaluation_results",
      "log_samples": true,
      "verbosity": "ERROR",
      "apply_chat_template": false,
      "trust_remote_code": true,
      "disable_hf_cache": false,
      "cache_dir": null
    },
    "unitxt_version": "1.24.0",
    "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
    "python_version": "3.11.12",
    "system": "Linux",
    "system_version": "#1 SMP PREEMPT_DYNAMIC Mon Apr 21 17:08:54 UTC 2025",
    "installed_packages": {
      "tqdm": "4.67.1",
      "httpretty": "1.1.4",
      "evaluate": "0.4.3",
      "ruff": "0.11.10",
      "virtualenv": "20.31.2",
      "urllib3": "2.4.0",
      "httpcore": "1.0.9",
      "mecab-ko-dic": "1.0.0",
      "mecab-ko": "1.0.1",
      "identify": "2.6.10",
      "bert-score": "0.3.13",
      "lxml": "5.4.0",
      "python-dotenv": "1.1.0",
      "accelerate": "1.7.0",
      "httpx-sse": "0.4.0",
      "pillow": "11.2.1",
      "certifi": "2025.4.26",
      "pyparsing": "3.2.3",
      "nvidia-cusparselt-cu12": "0.6.3",
      "tzdata": "2025.2",
      "torch": "2.7.0",
      "MarkupSafe": "3.0.2",
      "setuptools": "80.1.0",
      "pydantic": "2.11.4",
      "yarl": "1.20.0",
      "importlib_metadata": "8.0.0",
      "pydantic_core": "2.33.2",
      "scipy": "1.15.3",
      "annotated-types": "0.7.0",
      "portalocker": "3.1.1",
      "packaging": "24.2",
      "Deprecated": "1.2.18",
      "typing_extensions": "4.12.2",
      "ibm-cos-sdk-s3transfer": "2.14.1",
      "nvidia-cufft-cu12": "11.3.0.4",
      "nvidia-cusolver-cu12": "11.7.1.2",
      "diskcache": "5.6.3",
      "fsspec": "2025.3.0",
      "transformers": "4.51.3",
      "platformdirs": "4.2.2",
      "nvidia-cublas-cu12": "12.6.4.1",
      "threadpoolctl": "3.6.0",
      "jsonschema-specifications": "2025.4.1",
      "tenacity": "9.1.2",
      "propcache": "0.3.1",
      "ibm-cos-sdk": "2.14.1",
      "mpmath": "1.3.0",
      "jiter": "0.9.0",
      "filelock": "3.18.0",
      "tomli": "2.0.1",
      "nvidia-nvjitlink-cu12": "12.6.85",
      "cfgv": "3.4.0",
      "ibm_watsonx_ai": "1.3.13",
      "ibm-generative-ai": "3.0.0",
      "wheel": "0.45.1",
      "sympy": "1.14.0",
      "requests": "2.32.2",
      "charset-normalizer": "3.4.2",
      "psutil": "7.0.0",
      "pre_commit": "4.2.0",
      "nodeenv": "1.9.1",
      "colorama": "0.4.6",
      "absl-py": "2.2.2",
      "rouge_score": "0.1.2",
      "scikit-learn": "1.6.1",
      "multiprocess": "0.70.16",
      "xxhash": "3.5.0",
      "detect-secrets": "1.5.0",
      "aiohttp": "3.11.18",
      "frozenlist": "1.6.0",
      "tabulate": "0.9.0",
      "triton": "3.3.0",
      "idna": "3.10",
      "PyYAML": "6.0.2",
      "ibm-cos-sdk-core": "2.14.1",
      "nvidia-curand-cu12": "10.3.7.77",
      "nvidia-cuda-nvrtc-cu12": "12.6.77",
      "tiktoken": "0.9.0",
      "aiosignal": "1.3.2",
      "attrs": "25.3.0",
      "h11": "0.16.0",
      "anyio": "4.9.0",
      "wrapt": "1.17.2",
      "kiwisolver": "1.4.8",
      "nvidia-cudnn-cu12": "9.5.1.17",
      "matplotlib": "3.10.3",
      "aiolimiter": "1.2.1",
      "codespell": "2.4.1",
      "jmespath": "1.0.1",
      "nltk": "3.9.1",
      "unitxt": "1.24.0",
      "dill": "0.3.8",
      "multidict": "6.4.3",
      "conllu": "6.0.0",
      "litellm": "1.69.3",
      "joblib": "1.5.0",
      "cycler": "0.12.1",
      "pip": "25.1.1",
      "nvidia-nccl-cu12": "2.26.2",
      "click": "8.2.0",
      "fonttools": "4.58.0",
      "datasets": "3.6.0",
      "six": "1.17.0",
      "numpy": "2.2.5",
      "nvidia-cuda-runtime-cu12": "12.6.77",
      "huggingface-hub": "0.31.2",
      "aiohappyeyeballs": "2.6.1",
      "sacrebleu": "2.5.1",
      "pyarrow": "20.0.0",
      "openai": "1.75.0",
      "python-dateutil": "2.9.0.post0",
      "pytz": "2025.2",
      "contourpy": "1.3.2",
      "pandas": "2.2.3",
      "distro": "1.9.0",
      "httpx": "0.27.2",
      "rpds-py": "0.25.0",
      "Jinja2": "3.1.6",
      "nvidia-cusparse-cu12": "12.5.4.2",
      "nvidia-nvtx-cu12": "12.6.77",
      "fuzzywuzzy": "0.18.0",
      "tokenizers": "0.21.1",
      "lomond": "0.3.3",
      "nvidia-cufile-cu12": "1.11.1.6",
      "typing-inspection": "0.4.0",
      "safetensors": "0.5.3",
      "nvidia-cuda-cupti-cu12": "12.6.80",
      "referencing": "0.36.2",
      "networkx": "3.4.2",
      "jsonschema": "4.23.0",
      "zipp": "3.19.2",
      "regex": "2024.11.6",
      "distlib": "0.3.9",
      "sniffio": "1.3.1",
      "autocommand": "2.2.2",
      "jaraco.collections": "5.1.0",
      "typeguard": "4.3.0",
      "jaraco.text": "3.12.1",
      "jaraco.context": "5.3.0",
      "jaraco.functools": "4.0.1",
      "more-itertools": "10.3.0",
      "backports.tarfile": "1.2.0",
      "inflect": "7.3.1"
    }
  },
  "results": {
    "bias": {
      "safety_bbq_age": {
        "accuracy": 0.7888888888888889,
        "accuracy_ci_low": 0.7,
        "accuracy_ci_high": 0.8555555555555555,
        "score_name": "accuracy",
        "score": 0.7888888888888889,
        "score_ci_high": 0.8555555555555555,
        "score_ci_low": 0.7,
        "num_of_instances": 90
      },
      "safety_bbq_disability_status": {
        "accuracy": 1.0,
        "accuracy_ci_low": 1.0,
        "accuracy_ci_high": 1.0,
        "score_name": "accuracy",
        "score": 1.0,
        "score_ci_high": 1.0,
        "score_ci_low": 1.0,
        "num_of_instances": 10
      },
      "score": 0.8944444444444444,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "chatbot_abilities": {
      "arena_hard_generation_english_gpt_4_0314_reference": {
        "num_of_instances": 100,
        "llama_3_70b_instruct_template_arena_hard": 0.5,
        "score": 0.5,
        "score_name": "llama_3_70b_instruct_template_arena_hard"
      },
      "score": 0.5,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "entity_extraction": {
      "universal_ner_en_ewt": {
        "num_of_instances": 100,
        "f1_Person": 0.5294117647058824,
        "f1_Organization": 0.4489795918367347,
        "f1_Location": 0.3076923076923077,
        "f1_macro": 0.4286945547449749,
        "recall_macro": 0.3447204968944099,
        "precision_macro": 0.5806637806637807,
        "in_classes_support": 0.6266666666666667,
        "f1_micro": 0.3466666666666667,
        "recall_micro": 0.3466666666666667,
        "precision_micro": 0.3466666666666667,
        "score": 0.3466666666666667,
        "score_name": "f1_micro",
        "score_ci_low": 0.2410871556202116,
        "score_ci_high": 0.45611092451496155,
        "f1_micro_ci_low": 0.2410871556202116,
        "f1_micro_ci_high": 0.45611092451496155
      },
      "score": 0.3466666666666667,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "knowledge": {
      "mmlu_pro_biology": {
        "accuracy": 0.704225352112676,
        "accuracy_ci_low": 0.5915492957746479,
        "accuracy_ci_high": 0.8028169014084507,
        "score_name": "accuracy",
        "score": 0.704225352112676,
        "score_ci_high": 0.8028169014084507,
        "score_ci_low": 0.5915492957746479,
        "num_of_instances": 71
      },
      "mmlu_pro_business": {
        "accuracy": 0.13793103448275862,
        "accuracy_ci_low": 0.034482758620689655,
        "accuracy_ci_high": 0.3103448275862069,
        "score_name": "accuracy",
        "score": 0.13793103448275862,
        "score_ci_high": 0.3103448275862069,
        "score_ci_low": 0.034482758620689655,
        "num_of_instances": 29
      },
      "score": 0.42107819329771734,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "legal": {
      "legalbench_abercrombie": {
        "f1_macro": 0.6635397677258142,
        "f1_suggestive": 0.5555555555555556,
        "f1_generic": 0.7692307692307693,
        "f1_descriptive": 0.6976744186046512,
        "f1_fanciful": 0.6666666666666666,
        "f1_arbitrary": 0.6285714285714286,
        "f1_macro_ci_low": 0.5625965245413472,
        "f1_macro_ci_high": 0.7723256077319486,
        "score_name": "f1_micro",
        "score": 0.6586826347305389,
        "score_ci_high": 0.7515151515151515,
        "score_ci_low": 0.5524327906405184,
        "num_of_instances": 85,
        "accuracy": 0.6470588235294118,
        "accuracy_ci_low": 0.5411764705882353,
        "accuracy_ci_high": 0.7411764705882353,
        "f1_micro": 0.6586826347305389,
        "f1_micro_ci_low": 0.5524327906405184,
        "f1_micro_ci_high": 0.7515151515151515
      },
      "legalbench_corporate_lobbying": {
        "f1_macro": 0.5357142857142857,
        "f1_no": 0.5,
        "f1_yes": 0.5714285714285714,
        "f1_macro_ci_low": 0.2833333333333333,
        "f1_macro_ci_high": 0.7999279223515758,
        "score_name": "f1_micro",
        "score": 0.5384615384615384,
        "score_ci_high": 0.7857142857142857,
        "score_ci_low": 0.26917373942421613,
        "num_of_instances": 15,
        "accuracy": 0.4666666666666667,
        "accuracy_ci_low": 0.2,
        "accuracy_ci_high": 0.7333333333333333,
        "f1_micro": 0.5384615384615384,
        "f1_micro_ci_low": 0.26917373942421613,
        "f1_micro_ci_high": 0.7857142857142857
      },
      "score": 0.5985720865960387,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "news_classification": {
      "20_newsgroups_short": {
        "f1_macro": 0.6443434343434343,
        "f1_cars": 0.9090909090909091,
        "f1_windows x": 0.5714285714285714,
        "f1_computer graphics": 0.6666666666666666,
        "f1_atheism": 0.5714285714285714,
        "f1_religion": 0.0,
        "f1_medicine": 1.0,
        "f1_christianity": 0.8571428571428571,
        "f1_microsoft windows": 0.6666666666666666,
        "f1_middle east": 0.5,
        "f1_motorcycles": 0.6,
        "f1_pc hardware": 0.8,
        "f1_mac hardware": 0.8,
        "f1_for sale": 0.5,
        "f1_guns": 0.4444444444444444,
        "f1_space": 0.75,
        "f1_cryptography": 0.3333333333333333,
        "f1_baseball": 1.0,
        "f1_politics": 0.5,
        "f1_hockey": 0.75,
        "f1_electronics": 0.6666666666666666,
        "f1_macro_ci_low": 0.5605248203581513,
        "f1_macro_ci_high": 0.7498000775037662,
        "score_name": "f1_micro",
        "score": 0.6740331491712708,
        "score_ci_high": 0.7567567567567568,
        "score_ci_low": 0.5654571096096505,
        "num_of_instances": 100,
        "accuracy": 0.61,
        "accuracy_ci_low": 0.5,
        "accuracy_ci_high": 0.7,
        "f1_micro": 0.6740331491712708,
        "f1_micro_ci_low": 0.5654571096096505,
        "f1_micro_ci_high": 0.7567567567567568
      },
      "score": 0.6740331491712708,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "product_help": {
      "cfpb_product_2023": {
        "f1_macro": 0.8637383872166481,
        "f1_credit reporting or credit repair services or other personal consumer reports": 0.927536231884058,
        "f1_credit card or prepaid card": 1.0,
        "f1_debt collection": 0.64,
        "f1_checking or savings account": 0.9230769230769231,
        "f1_mortgage": 0.8888888888888888,
        "f1_vehicle loan or lease": 0.6666666666666666,
        "f1_money transfer or virtual currency or money service": 1.0,
        "f1_macro_ci_low": 0.7066777160591827,
        "f1_macro_ci_high": 0.9300773607822144,
        "score_name": "f1_micro",
        "score": 0.8888888888888888,
        "score_ci_high": 0.9393939393939394,
        "score_ci_low": 0.8163265306122449,
        "num_of_instances": 100,
        "accuracy": 0.88,
        "accuracy_ci_low": 0.81,
        "accuracy_ci_high": 0.93,
        "f1_micro": 0.8888888888888888,
        "f1_micro_ci_low": 0.8163265306122449,
        "f1_micro_ci_high": 0.9393939393939394
      },
      "score": 0.8888888888888888,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "qa_finance": {
      "fin_qa": {
        "num_of_instances": 100,
        "program_accuracy": 0.2,
        "score": 0.2,
        "score_name": "program_accuracy",
        "execution_accuracy": 0.2,
        "program_accuracy_ci_low": 0.13,
        "program_accuracy_ci_high": 0.29,
        "score_ci_low": 0.13,
        "score_ci_high": 0.29,
        "execution_accuracy_ci_low": 0.13,
        "execution_accuracy_ci_high": 0.29
      },
      "score": 0.2,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "rag_general": {
      "rag_response_generation_clapnq": {
        "precision": 0.4639242544792729,
        "recall": 0.6403509065582018,
        "f1": 0.4976247962897783,
        "precision_ci_low": 0.42748095786992185,
        "precision_ci_high": 0.5047660147148081,
        "recall_ci_low": 0.6019930525471148,
        "recall_ci_high": 0.6774368805520771,
        "f1_ci_low": 0.4683673012782413,
        "f1_ci_high": 0.5304072668948948,
        "score_name": "f1",
        "score": 0.4976247962897783,
        "score_ci_high": 0.5304072668948948,
        "score_ci_low": 0.4683673012782413,
        "num_of_instances": 100,
        "correctness_f1_bert_score.deberta_large_mnli": 0.6864667510986329,
        "correctness_recall_bert_score.deberta_large_mnli": 0.7247687041759491,
        "correctness_precision_bert_score.deberta_large_mnli": 0.6621749752759933,
        "faithfullness_f1_token_overlap": 0.412101720837311,
        "faithfullness_recall_token_overlap": 0.32402585790821237,
        "faithfullness_precision_token_overlap": 0.6786708572055166,
        "correctness_f1_token_overlap": 0.4976247962897783,
        "correctness_recall_token_overlap": 0.6403509065582018,
        "correctness_precision_token_overlap": 0.4639242544792729
      },
      "score": 0.4976247962897783,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "reasoning": {
      "hellaswag": {
        "accuracy": 0.57,
        "accuracy_ci_low": 0.47,
        "accuracy_ci_high": 0.66,
        "score_name": "accuracy",
        "score": 0.57,
        "score_ci_high": 0.66,
        "score_ci_low": 0.47,
        "num_of_instances": 100
      },
      "score": 0.57,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "safety": {
      "attaq_500": {
        "severity_critical": 0.0,
        "severity_high": 0.0,
        "severity_medium": 1.0,
        "severity_low": 99.0,
        "category_harmful_info": 0.8407948520514996,
        "category_deception": 0.8690199783316466,
        "category_explicit_content": 0.8879365721962067,
        "category_violence": 0.8437370418910942,
        "category_discrimination": 0.7742161632530273,
        "category_substance_abuse": 0.8879393110672632,
        "category_pii": 0.7969612302603545,
        "safety": 0.8464966776832524,
        "safety_ci_low": 0.8296491155827732,
        "safety_ci_high": 0.8642379209192678,
        "score_name": "safety",
        "score": 0.8464966776832524,
        "score_ci_high": 0.8642379209192678,
        "score_ci_low": 0.8296491155827732,
        "num_of_instances": 100
      },
      "score": 0.8464966776832524,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "summarization": {
      "billsum_document_filtered_to_6000_chars": {
        "num_of_instances": 100,
        "rouge1": 0.43029845221947843,
        "rougeL": 0.2955165700225417,
        "score": 0.2955165700225417,
        "score_name": "rougeL",
        "rouge2": 0.20799738817238542,
        "rougeLsum": 0.37091272315340484,
        "rouge1_ci_low": 0.40762860443579957,
        "rouge1_ci_high": 0.45046632483836146,
        "rougeL_ci_low": 0.2802994422178466,
        "rougeL_ci_high": 0.31441983596023754,
        "score_ci_low": 0.2802994422178466,
        "score_ci_high": 0.31441983596023754,
        "rouge2_ci_low": 0.193214668225847,
        "rouge2_ci_high": 0.22420116008616867,
        "rougeLsum_ci_low": 0.35057685960681195,
        "rougeLsum_ci_high": 0.3911461732163174
      },
      "score": 0.2955165700225417,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "translation": {
      "mt_flores_101_ara_eng": {
        "num_of_instances": 66,
        "counts": [
          1308,
          854,
          606,
          437
        ],
        "totals": [
          1801,
          1735,
          1669,
          1603
        ],
        "precisions": [
          0.7262631871182677,
          0.49221902017291064,
          0.36309167165967643,
          0.272613849033063
        ],
        "bp": 1.0,
        "sys_len": 1801,
        "ref_len": 1734,
        "sacrebleu": 0.4337147141407253,
        "score": 0.4337147141407253,
        "score_name": "sacrebleu",
        "score_ci_low": 0.3842057657729977,
        "score_ci_high": 0.4730390019325389,
        "sacrebleu_ci_low": 0.3842057657729977,
        "sacrebleu_ci_high": 0.4730390019325389
      },
      "mt_flores_101_deu_eng": {
        "num_of_instances": 34,
        "counts": [
          718,
          461,
          323,
          234
        ],
        "totals": [
          1016,
          982,
          948,
          914
        ],
        "precisions": [
          0.7066929133858268,
          0.4694501018329939,
          0.3407172995780591,
          0.25601750547045954
        ],
        "bp": 1.0,
        "sys_len": 1016,
        "ref_len": 960,
        "sacrebleu": 0.4124497124322012,
        "score": 0.4124497124322012,
        "score_name": "sacrebleu",
        "score_ci_low": 0.3505214366395574,
        "score_ci_high": 0.4751525306662991,
        "sacrebleu_ci_low": 0.3505214366395574,
        "sacrebleu_ci_high": 0.4751525306662991
      },
      "score": 0.4230822132864632,
      "score_name": "subsets_mean",
      "num_of_instances": 100
    },
    "score": 0.5504925912574663,
    "score_name": "subsets_mean",
    "num_of_instances": 1300
  }
}

results/bluebench/{2025-06-16T11-59-29_evaluation_results.json → 2025-06-19T11-21-54_evaluation_results.json}
RENAMED
@@ -1,14 +1,14 @@
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
-
"timestamp_utc": "2025-06-
|
4 |
"command_line_invocation": [
|
5 |
-
"/
|
6 |
"--tasks",
|
7 |
"benchmarks.bluebench",
|
8 |
"--model",
|
9 |
"cross_provider",
|
10 |
"--model_args",
|
11 |
-
"model_name=granite-3-3-8b-instruct,max_tokens=256",
|
12 |
"--output_path",
|
13 |
"./results/bluebench",
|
14 |
"--log_samples",
|
@@ -42,176 +42,157 @@
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
-
"unitxt_commit_hash": "
|
46 |
-
"python_version": "3.
|
47 |
"system": "Linux",
|
48 |
-
"system_version": "#1 SMP PREEMPT_DYNAMIC
|
49 |
"installed_packages": {
|
50 |
-
"
|
51 |
-
"
|
52 |
-
"
|
53 |
-
"
|
54 |
-
"
|
55 |
-
"
|
56 |
-
"
|
57 |
-
"
|
|
|
58 |
"mecab-ko": "1.0.1",
|
59 |
-
"
|
60 |
-
"
|
61 |
-
"
|
62 |
-
"python-dotenv": "1.1.0",
|
63 |
-
"accelerate": "1.7.0",
|
64 |
-
"httpx-sse": "0.4.0",
|
65 |
-
"pillow": "11.2.1",
|
66 |
-
"certifi": "2025.4.26",
|
67 |
-
"pyparsing": "3.2.3",
|
68 |
-
"nvidia-cusparselt-cu12": "0.6.3",
|
69 |
-
"tzdata": "2025.2",
|
70 |
-
"torch": "2.7.0",
|
71 |
-
"MarkupSafe": "3.0.2",
|
72 |
-
"setuptools": "80.1.0",
|
73 |
-
"pydantic": "2.11.4",
|
74 |
-
"yarl": "1.20.0",
|
75 |
-
"importlib_metadata": "8.0.0",
|
76 |
-
"pydantic_core": "2.33.2",
|
77 |
-
"scipy": "1.15.3",
|
78 |
-
"annotated-types": "0.7.0",
|
79 |
-
"portalocker": "3.1.1",
|
80 |
-
"packaging": "24.2",
|
81 |
-
"Deprecated": "1.2.18",
|
82 |
-
"typing_extensions": "4.12.2",
|
83 |
-
"ibm-cos-sdk-s3transfer": "2.14.1",
|
84 |
-
"nvidia-cufft-cu12": "11.3.0.4",
|
85 |
-
"nvidia-cusolver-cu12": "11.7.1.2",
|
86 |
-
"diskcache": "5.6.3",
|
87 |
-
"fsspec": "2025.3.0",
|
88 |
-
"transformers": "4.51.3",
|
89 |
-
"platformdirs": "4.2.2",
|
90 |
-
"nvidia-cublas-cu12": "12.6.4.1",
|
91 |
-
"threadpoolctl": "3.6.0",
|
92 |
"jsonschema-specifications": "2025.4.1",
|
93 |
-
"
|
94 |
-
"
|
95 |
-
"
|
96 |
-
"
|
97 |
-
"
|
98 |
-
"
|
99 |
-
"
|
|
|
|
|
100 |
"nvidia-nvjitlink-cu12": "12.6.85",
|
101 |
-
"
|
102 |
-
"
|
103 |
-
"
|
104 |
-
"
|
105 |
-
"sympy": "1.14.0",
|
106 |
-
"requests": "2.32.2",
|
107 |
-
"charset-normalizer": "3.4.2",
|
108 |
-
"psutil": "7.0.0",
|
109 |
-
"pre_commit": "4.2.0",
|
110 |
-
"nodeenv": "1.9.1",
|
111 |
-
"colorama": "0.4.6",
|
112 |
-
"absl-py": "2.2.2",
|
113 |
"rouge_score": "0.1.2",
|
114 |
-
"
|
115 |
-
"multiprocess": "0.70.16",
|
116 |
-
"xxhash": "3.5.0",
|
117 |
-
"detect-secrets": "1.5.0",
|
118 |
-
"aiohttp": "3.11.18",
|
119 |
-
"frozenlist": "1.6.0",
|
120 |
-
"tabulate": "0.9.0",
|
121 |
-
"triton": "3.3.0",
|
122 |
-
"idna": "3.10",
|
123 |
-
"PyYAML": "6.0.2",
|
124 |
-
"ibm-cos-sdk-core": "2.14.1",
|
125 |
-
"nvidia-curand-cu12": "10.3.7.77",
|
126 |
-
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
127 |
-
"tiktoken": "0.9.0",
|
128 |
"aiosignal": "1.3.2",
|
129 |
-
"
|
|
|
|
|
|
|
|
|
|
|
130 |
"h11": "0.16.0",
|
131 |
-
"
|
132 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
"kiwisolver": "1.4.8",
|
134 |
-
"
|
135 |
-
"
|
136 |
-
"
|
137 |
-
"
|
138 |
-
"
|
139 |
-
"
|
140 |
-
"unitxt": "1.24.0",
|
141 |
-
"dill": "0.3.8",
|
142 |
-
"multidict": "6.4.3",
|
143 |
-
"conllu": "6.0.0",
|
144 |
-
"litellm": "1.69.3",
|
145 |
-
"joblib": "1.5.0",
|
146 |
-
"cycler": "0.12.1",
|
147 |
"pip": "25.1.1",
|
148 |
-
"
|
149 |
-
"
|
150 |
-
"fonttools": "4.58.0",
|
151 |
"datasets": "3.6.0",
|
152 |
-
"
|
153 |
-
"
|
154 |
-
"
|
155 |
-
"huggingface-hub": "0.31.2",
|
156 |
-
"aiohappyeyeballs": "2.6.1",
|
157 |
-
"sacrebleu": "2.5.1",
|
158 |
-
"pyarrow": "20.0.0",
|
159 |
-
"openai": "1.75.0",
|
160 |
-
"python-dateutil": "2.9.0.post0",
|
161 |
-
"pytz": "2025.2",
|
162 |
-
"contourpy": "1.3.2",
|
163 |
-
"pandas": "2.2.3",
|
164 |
"distro": "1.9.0",
|
165 |
-
"
|
166 |
-
"
|
167 |
-
"
|
168 |
-
"
|
169 |
-
"
|
170 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
"tokenizers": "0.21.1",
|
172 |
-
"
|
173 |
-
"nvidia-
|
174 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
"safetensors": "0.5.3",
|
176 |
-
"
|
177 |
-
"referencing": "0.36.2",
|
178 |
-
"networkx": "3.4.2",
|
179 |
-
"jsonschema": "4.23.0",
|
180 |
-
"zipp": "3.19.2",
|
181 |
"regex": "2024.11.6",
|
182 |
-
"
|
183 |
-
"
|
184 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
"jaraco.collections": "5.1.0",
|
|
|
|
|
|
|
186 |
"typeguard": "4.3.0",
|
|
|
187 |
"jaraco.text": "3.12.1",
|
188 |
-
"jaraco.context": "5.3.0",
|
189 |
-
"jaraco.functools": "4.0.1",
|
190 |
"more-itertools": "10.3.0",
|
191 |
-
"
|
192 |
-
"inflect": "7.3.1"
|
|
|
193 |
}
|
194 |
},
|
195 |
"results": {
|
196 |
"bias": {
|
197 |
"safety_bbq_age": {
|
198 |
-
"accuracy": 0.
|
199 |
-
"accuracy_ci_low": 0.
|
200 |
-
"accuracy_ci_high": 0.
|
201 |
"score_name": "accuracy",
|
202 |
-
"score": 0.
|
203 |
-
"score_ci_high": 0.
|
204 |
-
"score_ci_low": 0.
|
205 |
"num_of_instances": 90
|
206 |
},
|
207 |
"safety_bbq_disability_status": {
|
208 |
-
"accuracy": 0.
|
209 |
-
"accuracy_ci_low": 0.
|
210 |
-
"accuracy_ci_high": 0.
|
211 |
"score_name": "accuracy",
|
212 |
-
"score": 0.
|
213 |
-
"score_ci_high": 0.
|
214 |
-
"score_ci_low": 0.
|
215 |
"num_of_instances": 90
|
216 |
},
|
217 |
"safety_bbq_gender_identity": {
|
@@ -225,86 +206,86 @@
|
|
225 |
"num_of_instances": 90
|
226 |
},
|
227 |
"safety_bbq_nationality": {
|
228 |
-
"accuracy": 0.
|
229 |
-
"accuracy_ci_low": 0.
|
230 |
-
"accuracy_ci_high": 0.
|
231 |
"score_name": "accuracy",
|
232 |
-
"score": 0.
|
233 |
-
"score_ci_high": 0.
|
234 |
-
"score_ci_low": 0.
|
235 |
"num_of_instances": 90
|
236 |
},
|
237 |
"safety_bbq_physical_appearance": {
|
238 |
-
"accuracy": 0.
|
239 |
-
"accuracy_ci_low": 0.
|
240 |
-
"accuracy_ci_high": 0.
|
241 |
"score_name": "accuracy",
|
242 |
-
"score": 0.
|
243 |
-
"score_ci_high": 0.
|
244 |
-
"score_ci_low": 0.
|
245 |
"num_of_instances": 90
|
246 |
},
|
247 |
"safety_bbq_race_ethnicity": {
|
248 |
-
"accuracy": 0.
|
249 |
-
"accuracy_ci_low": 0.
|
250 |
-
"accuracy_ci_high": 0.
|
251 |
"score_name": "accuracy",
|
252 |
-
"score": 0.
|
253 |
-
"score_ci_high": 0.
|
254 |
-
"score_ci_low": 0.
|
255 |
"num_of_instances": 90
|
256 |
},
|
257 |
"safety_bbq_race_x_gender": {
|
258 |
-
"accuracy": 0.
|
259 |
-
"accuracy_ci_low": 0.
|
260 |
"accuracy_ci_high": 0.9444444444444444,
|
261 |
"score_name": "accuracy",
|
262 |
-
"score": 0.
|
263 |
"score_ci_high": 0.9444444444444444,
|
264 |
-
"score_ci_low": 0.
|
265 |
"num_of_instances": 90
|
266 |
},
|
267 |
"safety_bbq_race_x_ses": {
|
268 |
-
"accuracy": 0.
|
269 |
-
"accuracy_ci_low": 0.
|
270 |
-
"accuracy_ci_high": 0.
|
271 |
"score_name": "accuracy",
|
272 |
-
"score": 0.
|
273 |
-
"score_ci_high": 0.
|
274 |
-
"score_ci_low": 0.
|
275 |
"num_of_instances": 90
|
276 |
},
|
277 |
"safety_bbq_religion": {
|
278 |
-
"accuracy": 0.
|
279 |
-
"accuracy_ci_low": 0.
|
280 |
"accuracy_ci_high": 0.8444444444444444,
|
281 |
"score_name": "accuracy",
|
282 |
-
"score": 0.
|
283 |
"score_ci_high": 0.8444444444444444,
|
284 |
-
"score_ci_low": 0.
|
285 |
"num_of_instances": 90
|
286 |
},
|
287 |
"safety_bbq_ses": {
|
288 |
-
"accuracy": 0.
|
289 |
-
"accuracy_ci_low": 0.
|
290 |
-
"accuracy_ci_high": 0.
|
291 |
"score_name": "accuracy",
|
292 |
-
"score": 0.
|
293 |
-
"score_ci_high": 0.
|
294 |
-
"score_ci_low": 0.
|
295 |
"num_of_instances": 90
|
296 |
},
|
297 |
"safety_bbq_sexual_orientation": {
|
298 |
-
"accuracy": 0.
|
299 |
-
"accuracy_ci_low": 0.
|
300 |
-
"accuracy_ci_high": 0.
|
301 |
"score_name": "accuracy",
|
302 |
-
"score": 0.
|
303 |
-
"score_ci_high": 0.
|
304 |
-
"score_ci_low": 0.
|
305 |
"num_of_instances": 90
|
306 |
},
|
307 |
-
"score": 0.
|
308 |
"score_name": "subsets_mean",
|
309 |
"num_of_instances": 990
|
310 |
},
|
@@ -322,59 +303,69 @@
|
|
322 |
"entity_extraction": {
|
323 |
"universal_ner_en_ewt": {
|
324 |
"num_of_instances": 1000,
|
325 |
-
"f1_Person": 0.
|
326 |
-
"f1_Organization": 0.
|
327 |
-
"f1_Location": 0.
|
328 |
-
"f1_macro": 0.
|
329 |
-
"recall_macro": 0.
|
330 |
-
"precision_macro": 0.
|
331 |
-
"in_classes_support": 0.
|
332 |
-
"f1_micro": 0.
|
333 |
-
"recall_micro": 0.
|
334 |
-
"precision_micro": 0.
|
335 |
-
"score": 0.
|
336 |
"score_name": "f1_micro",
|
337 |
-
"score_ci_low": 0.
|
338 |
-
"score_ci_high": 0.
|
339 |
-
"f1_micro_ci_low": 0.
|
340 |
-
"f1_micro_ci_high": 0.
|
341 |
},
|
342 |
-
"score": 0.
|
343 |
"score_name": "subsets_mean",
|
344 |
"num_of_instances": 1000
|
345 |
},
|
346 |
"knowledge": {
|
347 |
"mmlu_pro_biology": {
|
348 |
-
"accuracy": 0.
|
349 |
-
"accuracy_ci_low": 0.
|
350 |
-
"accuracy_ci_high": 0.
|
351 |
"score_name": "accuracy",
|
352 |
-
"score": 0.
|
353 |
-
"score_ci_high": 0.
|
354 |
-
"score_ci_low": 0.
|
355 |
"num_of_instances": 71
|
356 |
},
|
357 |
"mmlu_pro_business": {
|
358 |
"accuracy": 0.19718309859154928,
|
359 |
"accuracy_ci_low": 0.11267605633802817,
|
360 |
-
"accuracy_ci_high": 0.
|
361 |
"score_name": "accuracy",
|
362 |
"score": 0.19718309859154928,
|
363 |
-
"score_ci_high": 0.
|
364 |
"score_ci_low": 0.11267605633802817,
|
365 |
"num_of_instances": 71
|
366 |
},
|
367 |
"mmlu_pro_chemistry": {
|
368 |
-
"accuracy": 0.
|
369 |
-
"accuracy_ci_low": 0.
|
370 |
-
"accuracy_ci_high": 0.
|
371 |
"score_name": "accuracy",
|
372 |
-
"score": 0.
|
373 |
-
"score_ci_high": 0.
|
374 |
-
"score_ci_low": 0.
|
375 |
"num_of_instances": 71
|
376 |
},
|
377 |
"mmlu_pro_computer_science": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
"accuracy": 0.38028169014084506,
|
379 |
"accuracy_ci_low": 0.2676056338028169,
|
380 |
"accuracy_ci_high": 0.49295774647887325,
|
@@ -384,375 +375,365 @@
|
|
384 |
"score_ci_low": 0.2676056338028169,
|
385 |
"num_of_instances": 71
|
386 |
},
|
387 |
-
"mmlu_pro_economics": {
|
388 |
-
"accuracy": 0.4084507042253521,
|
389 |
-
"accuracy_ci_low": 0.30985915492957744,
|
390 |
-
"accuracy_ci_high": 0.5211267605633803,
|
391 |
-
"score_name": "accuracy",
|
392 |
-
"score": 0.4084507042253521,
|
393 |
-
"score_ci_high": 0.5211267605633803,
|
394 |
-
"score_ci_low": 0.30985915492957744,
|
395 |
-
"num_of_instances": 71
|
396 |
-
},
|
397 |
"mmlu_pro_engineering": {
|
398 |
-
"accuracy": 0.
|
399 |
-
"accuracy_ci_low": 0.
|
400 |
-
"accuracy_ci_high": 0.
|
401 |
"score_name": "accuracy",
|
402 |
-
"score": 0.
|
403 |
-
"score_ci_high": 0.
|
404 |
-
"score_ci_low": 0.
|
405 |
"num_of_instances": 71
|
406 |
},
|
407 |
"mmlu_pro_health": {
|
408 |
-
"accuracy": 0.
|
409 |
-
"accuracy_ci_low": 0.
|
410 |
-
"accuracy_ci_high": 0.
|
411 |
"score_name": "accuracy",
|
412 |
-
"score": 0.
|
413 |
-
"score_ci_high": 0.
|
414 |
-
"score_ci_low": 0.
|
415 |
"num_of_instances": 71
|
416 |
},
|
417 |
"mmlu_pro_history": {
|
418 |
-
"accuracy": 0.
|
419 |
-
"accuracy_ci_low": 0.
|
420 |
"accuracy_ci_high": 0.4788732394366197,
|
421 |
"score_name": "accuracy",
|
422 |
-
"score": 0.
|
423 |
"score_ci_high": 0.4788732394366197,
|
424 |
-
"score_ci_low": 0.
|
425 |
"num_of_instances": 71
|
426 |
},
|
427 |
"mmlu_pro_law": {
|
428 |
-
"accuracy": 0.
|
429 |
-
"accuracy_ci_low": 0.
|
430 |
-
"accuracy_ci_high": 0.
|
431 |
"score_name": "accuracy",
|
432 |
-
"score": 0.
|
433 |
-
"score_ci_high": 0.
|
434 |
-
"score_ci_low": 0.
|
435 |
"num_of_instances": 71
|
436 |
},
|
437 |
"mmlu_pro_math": {
|
438 |
-
"accuracy": 0.
|
439 |
-
"accuracy_ci_low": 0.
|
440 |
-
"accuracy_ci_high": 0.
|
441 |
"score_name": "accuracy",
|
442 |
-
"score": 0.
|
443 |
-
"score_ci_high": 0.
|
444 |
-
"score_ci_low": 0.
|
445 |
"num_of_instances": 71
|
446 |
},
|
447 |
"mmlu_pro_other": {
|
448 |
-
"accuracy": 0.
|
449 |
-
"accuracy_ci_low": 0.
|
450 |
-
"accuracy_ci_high": 0.
|
451 |
"score_name": "accuracy",
|
452 |
-
"score": 0.
|
453 |
-
"score_ci_high": 0.
|
454 |
-
"score_ci_low": 0.
|
455 |
"num_of_instances": 71
|
456 |
},
|
457 |
"mmlu_pro_philosophy": {
|
458 |
-
"accuracy": 0.
|
459 |
-
"accuracy_ci_low": 0.
|
460 |
-
"accuracy_ci_high": 0.
|
461 |
"score_name": "accuracy",
|
462 |
-
"score": 0.
|
463 |
-
"score_ci_high": 0.
|
464 |
-
"score_ci_low": 0.
|
465 |
"num_of_instances": 71
|
466 |
},
|
467 |
"mmlu_pro_physics": {
|
468 |
-
"accuracy": 0.
|
469 |
-
"accuracy_ci_low": 0.
|
470 |
-
"accuracy_ci_high": 0.
|
471 |
"score_name": "accuracy",
|
472 |
-
"score": 0.
|
473 |
-
"score_ci_high": 0.
|
474 |
-
"score_ci_low": 0.
|
475 |
"num_of_instances": 71
|
476 |
},
|
477 |
"mmlu_pro_psychology": {
|
478 |
-
"accuracy": 0.
|
479 |
-
"accuracy_ci_low": 0.
|
480 |
-
"accuracy_ci_high": 0.
|
481 |
"score_name": "accuracy",
|
482 |
-
"score": 0.
|
483 |
-
"score_ci_high": 0.
|
484 |
-
"score_ci_low": 0.
|
485 |
"num_of_instances": 71
|
486 |
},
|
487 |
-
"score": 0.
|
488 |
"score_name": "subsets_mean",
|
489 |
"num_of_instances": 994
|
490 |
},
|
491 |
"legal": {
|
492 |
"legalbench_abercrombie": {
|
493 |
-
"f1_macro": 0.
|
494 |
-
"f1_suggestive": 0.
|
495 |
-
"f1_arbitrary": 0.
|
496 |
-
"f1_generic": 0.
|
497 |
-
"f1_fanciful": 0.
|
498 |
-
"f1_descriptive": 0.
|
499 |
-
"f1_macro_ci_low": 0.
|
500 |
-
"f1_macro_ci_high": 0.
|
501 |
"score_name": "f1_micro",
|
502 |
-
"score": 0.
|
503 |
-
"score_ci_high": 0.
|
504 |
-
"score_ci_low": 0.
|
505 |
"num_of_instances": 85,
|
506 |
-
"accuracy": 0.
|
507 |
-
"accuracy_ci_low": 0.
|
508 |
-
"accuracy_ci_high": 0.
|
509 |
-
"f1_micro": 0.
|
510 |
-
"f1_micro_ci_low": 0.
|
511 |
-
"f1_micro_ci_high": 0.
|
512 |
},
|
513 |
"legalbench_corporate_lobbying": {
|
514 |
-
"f1_macro": 0.
|
515 |
-
"f1_no": 0.
|
516 |
-
"f1_yes": 0.
|
517 |
-
"f1_macro_ci_low": 0.
|
518 |
-
"f1_macro_ci_high": 0.
|
519 |
"score_name": "f1_micro",
|
520 |
-
"score": 0.
|
521 |
-
"score_ci_high": 0.
|
522 |
-
"score_ci_low": 0.
|
523 |
"num_of_instances": 200,
|
524 |
-
"accuracy": 0.
|
525 |
-
"accuracy_ci_low": 0.
|
526 |
-
"accuracy_ci_high": 0.
|
527 |
-
"f1_micro": 0.
|
528 |
-
"f1_micro_ci_low": 0.
|
529 |
-
"f1_micro_ci_high": 0.
|
530 |
},
|
531 |
"legalbench_function_of_decision_section": {
|
532 |
-
"f1_macro": 0.
|
533 |
-
"f1_conclusion": 0.
|
534 |
-
"f1_decree": 0.
|
535 |
-
"f1_issue": 0.
|
536 |
-
"
|
|
|
537 |
"f1_facts": 0.21621621621621623,
|
538 |
-
"f1_procedural history": 0.
|
539 |
-
"
|
540 |
-
"
|
541 |
-
"f1_macro_ci_high": 0.3464281083472716,
|
542 |
"score_name": "f1_micro",
|
543 |
-
"score": 0.
|
544 |
-
"score_ci_high": 0.
|
545 |
-
"score_ci_low": 0.
|
546 |
"num_of_instances": 200,
|
547 |
-
"accuracy": 0.
|
548 |
-
"accuracy_ci_low": 0.
|
549 |
-
"accuracy_ci_high": 0.
|
550 |
-
"f1_micro": 0.
|
551 |
-
"f1_micro_ci_low": 0.
|
552 |
-
"f1_micro_ci_high": 0.
|
553 |
},
|
554 |
"legalbench_international_citizenship_questions": {
|
555 |
-
"f1_macro": 0.
|
556 |
-
"f1_yes": 0.
|
557 |
-
"f1_no": 0.
|
558 |
-
"f1_macro_ci_low": 0.
|
559 |
-
"f1_macro_ci_high": 0.
|
560 |
"score_name": "f1_micro",
|
561 |
-
"score": 0.
|
562 |
-
"score_ci_high": 0.
|
563 |
-
"score_ci_low": 0.
|
564 |
"num_of_instances": 200,
|
565 |
-
"accuracy": 0.
|
566 |
-
"accuracy_ci_low": 0.
|
567 |
-
"accuracy_ci_high": 0.
|
568 |
-
"f1_micro": 0.
|
569 |
-
"f1_micro_ci_low": 0.
|
570 |
-
"f1_micro_ci_high": 0.
|
571 |
},
|
572 |
"legalbench_proa": {
|
573 |
-
"f1_macro": 0.
|
574 |
-
"f1_yes": 0.
|
575 |
"f1_no": 0.8461538461538461,
|
576 |
-
"f1_macro_ci_low": 0.
|
577 |
-
"f1_macro_ci_high": 0.
|
578 |
"score_name": "f1_micro",
|
579 |
-
"score": 0.
|
580 |
-
"score_ci_high": 0.
|
581 |
-
"score_ci_low": 0.
|
582 |
"num_of_instances": 85,
|
583 |
-
"accuracy": 0.
|
584 |
-
"accuracy_ci_low": 0.
|
585 |
-
"accuracy_ci_high": 0.
|
586 |
-
"f1_micro": 0.
|
587 |
-
"f1_micro_ci_low": 0.
|
588 |
-
"f1_micro_ci_high": 0.
|
589 |
},
|
590 |
-
"score": 0.
|
591 |
"score_name": "subsets_mean",
|
592 |
"num_of_instances": 770
|
593 |
},
|
594 |
"news_classification": {
|
595 |
"20_newsgroups_short": {
|
596 |
-
"f1_macro": 0.
|
597 |
-
"f1_cars": 0.
|
598 |
-
"f1_pc hardware": 0.
|
599 |
-
"f1_windows x": 0.
|
600 |
-
"
|
601 |
-
"
|
602 |
-
"f1_religion": 0.
|
603 |
-
"f1_medicine": 0.
|
604 |
-
"
|
605 |
"f1_microsoft windows": 0.39436619718309857,
|
606 |
-
"f1_middle east": 0.
|
607 |
-
"
|
608 |
-
"
|
609 |
-
"
|
610 |
-
"
|
611 |
-
"
|
612 |
"f1_space": 0.5569620253164557,
|
613 |
-
"f1_cryptography": 0.
|
614 |
-
"f1_baseball": 0.
|
615 |
"f1_hockey": 0.859504132231405,
|
616 |
-
"f1_electronics": 0.
|
617 |
-
"f1_macro_ci_low": 0.
|
618 |
-
"f1_macro_ci_high": 0.
|
619 |
"score_name": "f1_micro",
|
620 |
-
"score": 0.
|
621 |
-
"score_ci_high": 0.
|
622 |
-
"score_ci_low": 0.
|
623 |
"num_of_instances": 1000,
|
624 |
-
"accuracy": 0.
|
625 |
-
"accuracy_ci_low": 0.
|
626 |
-
"accuracy_ci_high": 0.
|
627 |
-
"f1_micro": 0.
|
628 |
-
"f1_micro_ci_low": 0.
|
629 |
-
"f1_micro_ci_high": 0.
|
630 |
},
|
631 |
-
"score": 0.
|
632 |
"score_name": "subsets_mean",
|
633 |
"num_of_instances": 1000
|
634 |
},
|
635 |
"product_help": {
|
636 |
"cfpb_product_2023": {
|
637 |
-
"f1_macro": 0.
|
638 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
639 |
-
"f1_credit card or prepaid card": 0.
|
640 |
-
"
|
641 |
-
"
|
642 |
-
"
|
643 |
-
"
|
644 |
-
"
|
645 |
-
"
|
646 |
-
"
|
647 |
-
"f1_macro_ci_low": 0.
|
648 |
-
"f1_macro_ci_high": 0.
|
649 |
"score_name": "f1_micro",
|
650 |
-
"score": 0.
|
651 |
-
"score_ci_high": 0.
|
652 |
-
"score_ci_low": 0.
|
653 |
"num_of_instances": 1000,
|
654 |
-
"accuracy": 0.
|
655 |
-
"accuracy_ci_low": 0.
|
656 |
-
"accuracy_ci_high": 0.
|
657 |
-
"f1_micro": 0.
|
658 |
-
"f1_micro_ci_low": 0.
|
659 |
-
"f1_micro_ci_high": 0.
|
660 |
},
|
661 |
"cfpb_product_watsonx": {
|
662 |
-
"f1_macro": 0.
|
663 |
-
"f1_mortgages and loans": 0.
|
664 |
-
"f1_credit card": 0.
|
665 |
-
"f1_debt collection": 0.
|
666 |
-
"
|
667 |
-
"
|
668 |
-
"f1_macro_ci_low": 0.
|
669 |
-
"f1_macro_ci_high": 0.
|
670 |
"score_name": "f1_micro",
|
671 |
-
"score": 0.
|
672 |
-
"score_ci_high": 0.
|
673 |
-
"score_ci_low": 0.
|
674 |
"num_of_instances": 500,
|
675 |
-
"accuracy": 0.
|
676 |
-
"accuracy_ci_low": 0.
|
677 |
-
"accuracy_ci_high": 0.
|
678 |
-
"f1_micro": 0.
|
679 |
-
"f1_micro_ci_low": 0.
|
680 |
-
"f1_micro_ci_high": 0.
|
681 |
},
|
682 |
-
"score": 0.
|
683 |
"score_name": "subsets_mean",
|
684 |
"num_of_instances": 1500
|
685 |
},
|
686 |
"qa_finance": {
|
687 |
"fin_qa": {
|
688 |
"num_of_instances": 1000,
|
689 |
-
"execution_accuracy": 0.
|
690 |
-
"program_accuracy": 0.
|
691 |
-
"score": 0.
|
692 |
"score_name": "program_accuracy",
|
693 |
-
"execution_accuracy_ci_low": 0.
|
694 |
-
"execution_accuracy_ci_high": 0.
|
695 |
-
"program_accuracy_ci_low": 0.
|
696 |
-
"program_accuracy_ci_high": 0.
|
697 |
-
"score_ci_low": 0.
|
698 |
-
"score_ci_high": 0.
|
699 |
},
|
700 |
-
"score": 0.
|
701 |
"score_name": "subsets_mean",
|
702 |
"num_of_instances": 1000
|
703 |
},
|
704 |
"rag_general": {
|
705 |
"rag_response_generation_clapnq": {
|
706 |
-
"precision": 0.
|
707 |
-
"recall": 0.
|
708 |
-
"f1": 0.
|
709 |
-
"precision_ci_low": 0.
|
710 |
-
"precision_ci_high": 0.
|
711 |
-
"recall_ci_low": 0.
|
712 |
-
"recall_ci_high": 0.
|
713 |
-
"f1_ci_low": 0.
|
714 |
-
"f1_ci_high": 0.
|
715 |
"score_name": "f1",
|
716 |
-
"score": 0.
|
717 |
-
"score_ci_high": 0.
|
718 |
-
"score_ci_low": 0.
|
719 |
"num_of_instances": 600,
|
720 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
721 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
722 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
723 |
-
"faithfullness_f1_token_overlap": 0.
|
724 |
-
"faithfullness_recall_token_overlap": 0.
|
725 |
-
"faithfullness_precision_token_overlap": 0.
|
726 |
-
"correctness_f1_token_overlap": 0.
|
727 |
-
"correctness_recall_token_overlap": 0.
|
728 |
-
"correctness_precision_token_overlap": 0.
|
729 |
},
|
730 |
-
"score": 0.
|
731 |
"score_name": "subsets_mean",
|
732 |
"num_of_instances": 600
|
733 |
},
|
734 |
"reasoning": {
|
735 |
"hellaswag": {
|
736 |
-
"accuracy": 0.
|
737 |
-
"accuracy_ci_low": 0.
|
738 |
-
"accuracy_ci_high": 0.
|
739 |
"score_name": "accuracy",
|
740 |
-
"score": 0.
|
741 |
-
"score_ci_high": 0.
|
742 |
-
"score_ci_low": 0.
|
743 |
"num_of_instances": 1000
|
744 |
},
|
745 |
"openbook_qa": {
|
746 |
-
"accuracy": 0.
|
747 |
-
"accuracy_ci_low": 0.
|
748 |
-
"accuracy_ci_high": 0.
|
749 |
"score_name": "accuracy",
|
750 |
-
"score": 0.
|
751 |
-
"score_ci_high": 0.
|
752 |
-
"score_ci_low": 0.
|
753 |
"num_of_instances": 500
|
754 |
},
|
755 |
-
"score": 0.
|
756 |
"score_name": "subsets_mean",
|
757 |
"num_of_instances": 1500
|
758 |
},
|
@@ -762,66 +743,66 @@
|
|
762 |
"severity_high": 0.0,
|
763 |
"severity_medium": 0.0,
|
764 |
"severity_low": 100.0,
|
765 |
-
"category_harmful_info": 0.
|
766 |
-
"category_deception": 0.
|
767 |
-
"category_explicit_content": 0.
|
768 |
-
"category_violence": 0.
|
769 |
-
"category_discrimination": 0.
|
770 |
-
"category_substance_abuse": 0.
|
771 |
-
"category_pii": 0.
|
772 |
-
"safety": 0.
|
773 |
-
"safety_ci_low": 0.
|
774 |
-
"safety_ci_high": 0.
|
775 |
"score_name": "safety",
|
776 |
-
"score": 0.
|
777 |
-
"score_ci_high": 0.
|
778 |
-
"score_ci_low": 0.
|
779 |
"num_of_instances": 100
|
780 |
},
|
781 |
-
"score": 0.
|
782 |
"score_name": "subsets_mean",
|
783 |
"num_of_instances": 100
|
784 |
},
|
785 |
"summarization": {
|
786 |
"billsum_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 528,
|
788 |
-
"
|
789 |
-
"
|
790 |
-
"rougeL": 0.2846394886664465,
|
791 |
-
"score": 0.2846394886664465,
|
792 |
"score_name": "rougeL",
|
793 |
-
"
|
794 |
-
"
|
795 |
-
"
|
796 |
-
"
|
797 |
-
"
|
798 |
-
"
|
799 |
-
"
|
800 |
-
"
|
801 |
-
"
|
802 |
-
"
|
803 |
-
"
|
|
|
|
|
804 |
},
|
805 |
"tldr_document_filtered_to_6000_chars": {
|
806 |
"num_of_instances": 1000,
|
807 |
-
"
|
808 |
-
"
|
809 |
-
"rougeL": 0.07976722752092735,
|
810 |
-
"score": 0.07976722752092735,
|
811 |
"score_name": "rougeL",
|
812 |
-
"
|
813 |
-
"
|
814 |
-
"
|
815 |
-
"
|
816 |
-
"
|
817 |
-
"
|
818 |
-
"
|
819 |
-
"
|
820 |
-
"
|
821 |
-
"
|
822 |
-
"
|
|
|
|
|
823 |
},
|
824 |
-
"score": 0.
|
825 |
"score_name": "subsets_mean",
|
826 |
"num_of_instances": 1528
|
827 |
},
|
@@ -829,473 +810,473 @@
|
|
829 |
"mt_flores_101_ara_eng": {
|
830 |
"num_of_instances": 66,
|
831 |
"counts": [
|
832 |
-
|
833 |
-
|
834 |
-
|
835 |
-
|
836 |
],
|
837 |
"totals": [
|
838 |
-
|
839 |
-
|
840 |
-
|
841 |
-
|
842 |
],
|
843 |
"precisions": [
|
844 |
-
0.
|
845 |
-
0.
|
846 |
-
0.
|
847 |
-
0.
|
848 |
],
|
849 |
"bp": 1.0,
|
850 |
-
"sys_len":
|
851 |
"ref_len": 1734,
|
852 |
-
"sacrebleu": 0.
|
853 |
-
"score": 0.
|
854 |
"score_name": "sacrebleu",
|
855 |
-
"score_ci_low": 0.
|
856 |
-
"score_ci_high": 0.
|
857 |
-
"sacrebleu_ci_low": 0.
|
858 |
-
"sacrebleu_ci_high": 0.
|
859 |
},
|
860 |
"mt_flores_101_deu_eng": {
|
861 |
"num_of_instances": 66,
|
862 |
"counts": [
|
863 |
-
|
864 |
-
|
865 |
-
|
866 |
-
|
867 |
],
|
868 |
"totals": [
|
869 |
-
|
870 |
-
|
871 |
-
|
872 |
-
|
873 |
],
|
874 |
"precisions": [
|
875 |
-
0.
|
876 |
-
0.
|
877 |
-
0.
|
878 |
-
0.
|
879 |
],
|
880 |
"bp": 1.0,
|
881 |
-
"sys_len":
|
882 |
"ref_len": 1734,
|
883 |
-
"sacrebleu": 0.
|
884 |
-
"score": 0.
|
885 |
"score_name": "sacrebleu",
|
886 |
-
"score_ci_low": 0.
|
887 |
-
"score_ci_high": 0.
|
888 |
-
"sacrebleu_ci_low": 0.
|
889 |
-
"sacrebleu_ci_high": 0.
|
890 |
},
|
891 |
"mt_flores_101_eng_ara": {
|
892 |
"num_of_instances": 66,
|
893 |
"counts": [
|
894 |
-
|
895 |
-
|
896 |
-
|
897 |
-
|
898 |
],
|
899 |
"totals": [
|
900 |
-
|
901 |
-
|
902 |
-
|
903 |
-
|
904 |
],
|
905 |
"precisions": [
|
906 |
-
0.
|
907 |
-
0.
|
908 |
-
0.
|
909 |
-
0.
|
910 |
],
|
911 |
"bp": 1.0,
|
912 |
-
"sys_len":
|
913 |
"ref_len": 1589,
|
914 |
-
"sacrebleu": 0.
|
915 |
-
"score": 0.
|
916 |
"score_name": "sacrebleu",
|
917 |
-
"score_ci_low": 0.
|
918 |
-
"score_ci_high": 0.
|
919 |
-
"sacrebleu_ci_low": 0.
|
920 |
-
"sacrebleu_ci_high": 0.
|
921 |
},
|
922 |
"mt_flores_101_eng_deu": {
|
923 |
"num_of_instances": 66,
|
924 |
"counts": [
|
925 |
-
|
926 |
-
|
927 |
-
|
928 |
-
|
929 |
],
|
930 |
"totals": [
|
931 |
-
|
932 |
-
|
933 |
-
|
934 |
-
|
935 |
],
|
936 |
"precisions": [
|
937 |
-
0.
|
938 |
-
0.
|
939 |
-
0.
|
940 |
-
0.
|
941 |
],
|
942 |
"bp": 1.0,
|
943 |
-
"sys_len":
|
944 |
"ref_len": 1835,
|
945 |
-
"sacrebleu": 0.
|
946 |
-
"score": 0.
|
947 |
"score_name": "sacrebleu",
|
948 |
-
"score_ci_low": 0.
|
949 |
-
"score_ci_high": 0.
|
950 |
-
"sacrebleu_ci_low": 0.
|
951 |
-
"sacrebleu_ci_high": 0.
|
952 |
},
|
953 |
"mt_flores_101_eng_fra": {
|
954 |
"num_of_instances": 66,
|
955 |
"counts": [
|
956 |
-
|
957 |
-
|
958 |
-
|
959 |
-
|
960 |
],
|
961 |
"totals": [
|
962 |
-
|
963 |
-
|
964 |
-
|
965 |
-
|
966 |
],
|
967 |
"precisions": [
|
968 |
-
0.
|
969 |
-
0.
|
970 |
-
0.
|
971 |
-
0.
|
972 |
],
|
973 |
"bp": 1.0,
|
974 |
-
"sys_len":
|
975 |
"ref_len": 2068,
|
976 |
-
"sacrebleu": 0.
|
977 |
-
"score": 0.
|
978 |
"score_name": "sacrebleu",
|
979 |
-
"score_ci_low": 0.
|
980 |
-
"score_ci_high": 0.
|
981 |
-
"sacrebleu_ci_low": 0.
|
982 |
-
"sacrebleu_ci_high": 0.
|
983 |
},
|
984 |
"mt_flores_101_eng_kor": {
|
985 |
"num_of_instances": 66,
|
986 |
"counts": [
|
987 |
-
|
988 |
-
|
989 |
-
|
990 |
-
|
991 |
],
|
992 |
"totals": [
|
993 |
-
|
994 |
-
|
995 |
-
|
996 |
-
|
997 |
],
|
998 |
"precisions": [
|
999 |
-
0.
|
1000 |
-
0.
|
1001 |
-
0.
|
1002 |
-
0.
|
1003 |
],
|
1004 |
"bp": 1.0,
|
1005 |
-
"sys_len":
|
1006 |
"ref_len": 2235,
|
1007 |
-
"sacrebleu": 0.
|
1008 |
-
"score": 0.
|
1009 |
"score_name": "sacrebleu",
|
1010 |
-
"score_ci_low": 0.
|
1011 |
-
"score_ci_high": 0.
|
1012 |
-
"sacrebleu_ci_low": 0.
|
1013 |
-
"sacrebleu_ci_high": 0.
|
1014 |
},
|
1015 |
"mt_flores_101_eng_por": {
|
1016 |
"num_of_instances": 66,
|
1017 |
"counts": [
|
1018 |
-
|
1019 |
-
|
1020 |
-
|
1021 |
-
|
1022 |
],
|
1023 |
"totals": [
|
1024 |
-
|
1025 |
-
|
1026 |
-
|
1027 |
-
|
1028 |
],
|
1029 |
"precisions": [
|
1030 |
-
0.
|
1031 |
-
0.
|
1032 |
-
0.
|
1033 |
-
0.
|
1034 |
],
|
1035 |
"bp": 1.0,
|
1036 |
-
"sys_len":
|
1037 |
"ref_len": 1916,
|
1038 |
-
"sacrebleu": 0.
|
1039 |
-
"score": 0.
|
1040 |
"score_name": "sacrebleu",
|
1041 |
-
"score_ci_low": 0.
|
1042 |
-
"score_ci_high": 0.
|
1043 |
-
"sacrebleu_ci_low": 0.
|
1044 |
-
"sacrebleu_ci_high": 0.
|
1045 |
},
|
1046 |
"mt_flores_101_eng_ron": {
|
1047 |
"num_of_instances": 66,
|
1048 |
"counts": [
|
1049 |
-
|
1050 |
-
|
1051 |
-
|
1052 |
-
|
1053 |
],
|
1054 |
"totals": [
|
1055 |
-
|
1056 |
-
|
1057 |
-
|
1058 |
-
|
1059 |
],
|
1060 |
"precisions": [
|
1061 |
-
0.
|
1062 |
-
0.
|
1063 |
-
0.
|
1064 |
-
0.
|
1065 |
],
|
1066 |
"bp": 1.0,
|
1067 |
-
"sys_len":
|
1068 |
"ref_len": 1949,
|
1069 |
-
"sacrebleu": 0.
|
1070 |
-
"score": 0.
|
1071 |
"score_name": "sacrebleu",
|
1072 |
-
"score_ci_low": 0.
|
1073 |
-
"score_ci_high": 0.
|
1074 |
-
"sacrebleu_ci_low": 0.
|
1075 |
-
"sacrebleu_ci_high": 0.
|
1076 |
},
|
1077 |
"mt_flores_101_eng_spa": {
|
1078 |
"num_of_instances": 66,
|
1079 |
"counts": [
|
1080 |
-
|
1081 |
-
|
1082 |
-
|
1083 |
-
|
1084 |
],
|
1085 |
"totals": [
|
1086 |
-
|
1087 |
-
|
1088 |
-
|
1089 |
-
|
1090 |
],
|
1091 |
"precisions": [
|
1092 |
-
0.
|
1093 |
-
0.
|
1094 |
-
0.
|
1095 |
-
0.
|
1096 |
],
|
1097 |
"bp": 1.0,
|
1098 |
-
"sys_len":
|
1099 |
"ref_len": 2098,
|
1100 |
-
"sacrebleu": 0.
|
1101 |
-
"score": 0.
|
1102 |
"score_name": "sacrebleu",
|
1103 |
-
"score_ci_low": 0.
|
1104 |
-
"score_ci_high": 0.
|
1105 |
-
"sacrebleu_ci_low": 0.
|
1106 |
-
"sacrebleu_ci_high": 0.
|
1107 |
},
|
1108 |
"mt_flores_101_fra_eng": {
|
1109 |
"num_of_instances": 66,
|
1110 |
"counts": [
|
1111 |
-
|
1112 |
-
|
1113 |
-
|
1114 |
-
|
1115 |
],
|
1116 |
"totals": [
|
1117 |
-
|
1118 |
-
|
1119 |
-
|
1120 |
-
|
1121 |
],
|
1122 |
"precisions": [
|
1123 |
-
0.
|
1124 |
-
0.
|
1125 |
-
0.
|
1126 |
-
0.
|
1127 |
],
|
1128 |
"bp": 1.0,
|
1129 |
-
"sys_len":
|
1130 |
"ref_len": 1734,
|
1131 |
-
"sacrebleu": 0.
|
1132 |
-
"score": 0.
|
1133 |
"score_name": "sacrebleu",
|
1134 |
-
"score_ci_low": 0.
|
1135 |
-
"score_ci_high": 0.
|
1136 |
-
"sacrebleu_ci_low": 0.
|
1137 |
-
"sacrebleu_ci_high": 0.
|
1138 |
},
|
1139 |
"mt_flores_101_jpn_eng": {
|
1140 |
"num_of_instances": 66,
|
1141 |
"counts": [
|
1142 |
-
|
1143 |
-
|
1144 |
-
|
1145 |
-
|
1146 |
],
|
1147 |
"totals": [
|
1148 |
-
|
1149 |
-
|
1150 |
-
|
1151 |
-
|
1152 |
],
|
1153 |
"precisions": [
|
1154 |
-
0.
|
1155 |
-
0.
|
1156 |
-
0.
|
1157 |
-
0.
|
1158 |
],
|
1159 |
"bp": 1.0,
|
1160 |
-
"sys_len":
|
1161 |
"ref_len": 1734,
|
1162 |
-
"sacrebleu": 0.
|
1163 |
-
"score": 0.
|
1164 |
"score_name": "sacrebleu",
|
1165 |
-
"score_ci_low": 0.
|
1166 |
-
"score_ci_high": 0.
|
1167 |
-
"sacrebleu_ci_low": 0.
|
1168 |
-
"sacrebleu_ci_high": 0.
|
1169 |
},
|
1170 |
"mt_flores_101_kor_eng": {
|
1171 |
"num_of_instances": 66,
|
1172 |
"counts": [
|
1173 |
-
|
1174 |
-
|
1175 |
-
|
1176 |
127
|
1177 |
],
|
1178 |
"totals": [
|
1179 |
-
|
1180 |
-
|
1181 |
-
|
1182 |
-
|
1183 |
],
|
1184 |
"precisions": [
|
1185 |
-
0.
|
1186 |
-
0.
|
1187 |
-
0.
|
1188 |
-
0.
|
1189 |
],
|
1190 |
"bp": 1.0,
|
1191 |
-
"sys_len":
|
1192 |
"ref_len": 1734,
|
1193 |
-
"sacrebleu": 0.
|
1194 |
-
"score": 0.
|
1195 |
"score_name": "sacrebleu",
|
1196 |
-
"score_ci_low": 0.
|
1197 |
-
"score_ci_high": 0.
|
1198 |
-
"sacrebleu_ci_low": 0.
|
1199 |
-
"sacrebleu_ci_high": 0.
|
1200 |
},
|
1201 |
"mt_flores_101_por_eng": {
|
1202 |
"num_of_instances": 66,
|
1203 |
"counts": [
|
1204 |
-
|
1205 |
-
|
1206 |
-
|
1207 |
-
|
1208 |
],
|
1209 |
"totals": [
|
1210 |
-
|
1211 |
-
|
1212 |
-
|
1213 |
-
|
1214 |
],
|
1215 |
"precisions": [
|
1216 |
-
0.
|
1217 |
-
0.
|
1218 |
-
0.
|
1219 |
-
0.
|
1220 |
],
|
1221 |
"bp": 1.0,
|
1222 |
-
"sys_len":
|
1223 |
"ref_len": 1734,
|
1224 |
-
"sacrebleu": 0.
|
1225 |
-
"score": 0.
|
1226 |
"score_name": "sacrebleu",
|
1227 |
-
"score_ci_low": 0.
|
1228 |
-
"score_ci_high": 0.
|
1229 |
-
"sacrebleu_ci_low": 0.
|
1230 |
-
"sacrebleu_ci_high": 0.
|
1231 |
},
|
1232 |
"mt_flores_101_ron_eng": {
|
1233 |
"num_of_instances": 66,
|
1234 |
"counts": [
|
1235 |
-
|
1236 |
-
|
1237 |
-
|
1238 |
-
|
1239 |
],
|
1240 |
"totals": [
|
1241 |
-
|
1242 |
-
|
1243 |
-
|
1244 |
-
|
1245 |
],
|
1246 |
"precisions": [
|
1247 |
-
0.
|
1248 |
-
0.
|
1249 |
-
0.
|
1250 |
-
0.
|
1251 |
],
|
1252 |
"bp": 1.0,
|
1253 |
-
"sys_len":
|
1254 |
"ref_len": 1734,
|
1255 |
-
"sacrebleu": 0.
|
1256 |
-
"score": 0.
|
1257 |
"score_name": "sacrebleu",
|
1258 |
-
"score_ci_low": 0.
|
1259 |
-
"score_ci_high": 0.
|
1260 |
-
"sacrebleu_ci_low": 0.
|
1261 |
-
"sacrebleu_ci_high": 0.
|
1262 |
},
|
1263 |
"mt_flores_101_spa_eng": {
|
1264 |
"num_of_instances": 66,
|
1265 |
"counts": [
|
1266 |
-
|
1267 |
-
|
1268 |
-
|
1269 |
202
|
1270 |
],
|
1271 |
"totals": [
|
1272 |
-
|
1273 |
-
|
1274 |
-
|
1275 |
-
|
1276 |
],
|
1277 |
"precisions": [
|
1278 |
-
0.
|
1279 |
-
0.
|
1280 |
-
0.
|
1281 |
-
0.
|
1282 |
],
|
1283 |
"bp": 1.0,
|
1284 |
-
"sys_len":
|
1285 |
"ref_len": 1734,
|
1286 |
-
"sacrebleu": 0.
|
1287 |
-
"score": 0.
|
1288 |
"score_name": "sacrebleu",
|
1289 |
-
"score_ci_low": 0.
|
1290 |
-
"score_ci_high": 0.
|
1291 |
-
"sacrebleu_ci_low": 0.
|
1292 |
-
"sacrebleu_ci_high": 0.
|
1293 |
},
|
1294 |
-
"score": 0.
|
1295 |
"score_name": "subsets_mean",
|
1296 |
"num_of_instances": 990
|
1297 |
},
|
1298 |
-
"score": 0.
|
1299 |
"score_name": "subsets_mean",
|
1300 |
"num_of_instances": 12472
|
1301 |
}
|
|
|
1 |
{
|
2 |
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-19T15:21:49.633185Z",
|
4 |
"command_line_invocation": [
|
5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
"--tasks",
|
7 |
"benchmarks.bluebench",
|
8 |
"--model",
|
9 |
"cross_provider",
|
10 |
"--model_args",
|
11 |
+
"model_name=watsonx/ibm/granite-3-3-8b-instruct,max_tokens=256",
|
12 |
"--output_path",
|
13 |
"./results/bluebench",
|
14 |
"--log_samples",
|
|
|
42 |
"cache_dir": null
|
43 |
},
|
44 |
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
|
46 |
+
"python_version": "3.10.18",
|
47 |
"system": "Linux",
|
48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
49 |
"installed_packages": {
|
50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
51 |
+
"triton": "3.3.1",
|
52 |
+
"nltk": "3.9.1",
|
53 |
+
"anyio": "4.9.0",
|
54 |
+
"absl-py": "2.3.0",
|
55 |
+
"tiktoken": "0.9.0",
|
56 |
+
"charset-normalizer": "3.4.2",
|
57 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
58 |
+
"sympy": "1.14.0",
|
59 |
"mecab-ko": "1.0.1",
|
60 |
+
"litellm": "1.72.6.post1",
|
61 |
+
"httpcore": "1.0.9",
|
62 |
+
"Jinja2": "3.1.6",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
"jsonschema-specifications": "2025.4.1",
|
64 |
+
"pydantic_core": "2.33.2",
|
65 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
66 |
+
"yarl": "1.20.1",
|
67 |
+
"openai": "1.88.0",
|
68 |
+
"portalocker": "3.2.0",
|
69 |
+
"pandas": "2.3.0",
|
70 |
+
"multiprocess": "0.70.16",
|
71 |
+
"jsonschema": "4.24.0",
|
72 |
+
"unitxt": "1.24.0",
|
73 |
"nvidia-nvjitlink-cu12": "12.6.85",
|
74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
75 |
+
"pydantic": "2.11.7",
|
76 |
+
"async-timeout": "5.0.1",
|
77 |
+
"annotated-types": "0.7.0",
|
|
78 |
"rouge_score": "0.1.2",
|
79 |
+
"contourpy": "1.3.2",
|
80 |
"aiosignal": "1.3.2",
|
81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
82 |
+
"pillow": "11.2.1",
|
83 |
+
"six": "1.17.0",
|
84 |
+
"diskcache": "5.6.3",
|
85 |
+
"tqdm": "4.67.1",
|
86 |
+
"pyarrow": "20.0.0",
|
87 |
"h11": "0.16.0",
|
88 |
+
"zipp": "3.19.2",
|
89 |
+
"tzdata": "2025.2",
|
90 |
+
"bert-score": "0.3.13",
|
91 |
+
"setuptools": "80.9.0",
|
92 |
+
"referencing": "0.36.2",
|
93 |
+
"sacrebleu": "2.5.1",
|
94 |
+
"filelock": "3.18.0",
|
95 |
+
"urllib3": "2.5.0",
|
96 |
+
"scipy": "1.15.3",
|
97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
98 |
"kiwisolver": "1.4.8",
|
99 |
+
"networkx": "3.4.2",
|
100 |
+
"typing-inspection": "0.4.1",
|
101 |
+
"lxml": "5.4.0",
|
102 |
+
"sniffio": "1.3.1",
|
103 |
+
"scikit-learn": "1.7.0",
|
104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
105 |
"pip": "25.1.1",
|
106 |
+
"fonttools": "4.58.4",
|
107 |
+
"transformers": "4.52.4",
|
|
|
108 |
"datasets": "3.6.0",
|
109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
110 |
+
"cycler": "0.12.1",
|
111 |
+
"evaluate": "0.4.3",
|
|
112 |
"distro": "1.9.0",
|
113 |
+
"idna": "3.10",
|
114 |
+
"MarkupSafe": "3.0.2",
|
115 |
+
"frozenlist": "1.7.0",
|
116 |
+
"pyparsing": "3.2.3",
|
117 |
+
"jiter": "0.10.0",
|
118 |
+
"importlib_metadata": "8.0.0",
|
119 |
+
"packaging": "24.2",
|
120 |
+
"psutil": "7.0.0",
|
121 |
+
"mecab-ko-dic": "1.0.0",
|
122 |
+
"joblib": "1.5.1",
|
123 |
+
"fsspec": "2025.3.0",
|
124 |
+
"dill": "0.3.8",
|
125 |
"tokenizers": "0.21.1",
|
126 |
+
"wheel": "0.45.1",
|
127 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
128 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
129 |
+
"hf-xet": "1.1.4",
|
130 |
+
"propcache": "0.3.2",
|
131 |
+
"numpy": "2.2.6",
|
132 |
+
"mpmath": "1.3.0",
|
133 |
+
"multidict": "6.5.0",
|
134 |
+
"conllu": "6.0.0",
|
135 |
"safetensors": "0.5.3",
|
136 |
+
"requests": "2.32.4",
|
|
137 |
"regex": "2024.11.6",
|
138 |
+
"aiohttp": "3.12.13",
|
139 |
+
"tabulate": "0.9.0",
|
140 |
+
"certifi": "2025.6.15",
|
141 |
+
"accelerate": "1.8.0",
|
142 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
143 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
144 |
+
"click": "8.2.1",
|
145 |
+
"typing_extensions": "4.12.2",
|
146 |
+
"attrs": "25.3.0",
|
147 |
+
"exceptiongroup": "1.3.0",
|
148 |
+
"tenacity": "9.1.2",
|
149 |
+
"pytz": "2025.2",
|
150 |
+
"aiohappyeyeballs": "2.6.1",
|
151 |
+
"python-dateutil": "2.9.0.post0",
|
152 |
+
"torch": "2.7.1",
|
153 |
+
"python-dotenv": "1.1.0",
|
154 |
+
"httpx": "0.28.1",
|
155 |
+
"matplotlib": "3.10.3",
|
156 |
+
"xxhash": "3.5.0",
|
157 |
+
"PyYAML": "6.0.2",
|
158 |
+
"huggingface-hub": "0.33.0",
|
159 |
+
"colorama": "0.4.6",
|
160 |
+
"rpds-py": "0.25.1",
|
161 |
+
"threadpoolctl": "3.6.0",
|
162 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
163 |
"jaraco.collections": "5.1.0",
|
164 |
+
"tomli": "2.0.1",
|
165 |
+
"backports.tarfile": "1.2.0",
|
166 |
+
"jaraco.context": "5.3.0",
|
167 |
"typeguard": "4.3.0",
|
168 |
+
"autocommand": "2.2.2",
|
169 |
"jaraco.text": "3.12.1",
|
|
170 |
"more-itertools": "10.3.0",
|
171 |
+
"platformdirs": "4.2.2",
|
172 |
+
"inflect": "7.3.1",
|
173 |
+
"jaraco.functools": "4.0.1"
|
174 |
}
|
175 |
},
|
176 |
"results": {
|
177 |
"bias": {
|
178 |
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.5555555555555556,
|
180 |
+
"accuracy_ci_low": 0.45555555555555555,
|
181 |
+
"accuracy_ci_high": 0.6555555555555556,
|
182 |
"score_name": "accuracy",
|
183 |
+
"score": 0.5555555555555556,
|
184 |
+
"score_ci_high": 0.6555555555555556,
|
185 |
+
"score_ci_low": 0.45555555555555555,
|
186 |
"num_of_instances": 90
|
187 |
},
|
188 |
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.6222222222222222,
|
190 |
+
"accuracy_ci_low": 0.5222222222222223,
|
191 |
+
"accuracy_ci_high": 0.7222222222222222,
|
192 |
"score_name": "accuracy",
|
193 |
+
"score": 0.6222222222222222,
|
194 |
+
"score_ci_high": 0.7222222222222222,
|
195 |
+
"score_ci_low": 0.5222222222222223,
|
196 |
"num_of_instances": 90
|
197 |
},
|
198 |
"safety_bbq_gender_identity": {
|
|
|
206 |
"num_of_instances": 90
|
207 |
},
|
208 |
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.6333333333333333,
|
210 |
+
"accuracy_ci_low": 0.5333333333333333,
|
211 |
+
"accuracy_ci_high": 0.7333333333333333,
|
212 |
"score_name": "accuracy",
|
213 |
+
"score": 0.6333333333333333,
|
214 |
+
"score_ci_high": 0.7333333333333333,
|
215 |
+
"score_ci_low": 0.5333333333333333,
|
216 |
"num_of_instances": 90
|
217 |
},
|
218 |
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.6555555555555556,
|
220 |
+
"accuracy_ci_low": 0.5555555555555556,
|
221 |
+
"accuracy_ci_high": 0.7539633744548231,
|
222 |
"score_name": "accuracy",
|
223 |
+
"score": 0.6555555555555556,
|
224 |
+
"score_ci_high": 0.7539633744548231,
|
225 |
+
"score_ci_low": 0.5555555555555556,
|
226 |
"num_of_instances": 90
|
227 |
},
|
228 |
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.9333333333333333,
|
230 |
+
"accuracy_ci_low": 0.8666666666666667,
|
231 |
+
"accuracy_ci_high": 0.9777777777777777,
|
232 |
"score_name": "accuracy",
|
233 |
+
"score": 0.9333333333333333,
|
234 |
+
"score_ci_high": 0.9777777777777777,
|
235 |
+
"score_ci_low": 0.8666666666666667,
|
236 |
"num_of_instances": 90
|
237 |
},
|
238 |
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.8888888888888888,
|
240 |
+
"accuracy_ci_low": 0.8222222222222222,
|
241 |
"accuracy_ci_high": 0.9444444444444444,
|
242 |
"score_name": "accuracy",
|
243 |
+
"score": 0.8888888888888888,
|
244 |
"score_ci_high": 0.9444444444444444,
|
245 |
+
"score_ci_low": 0.8222222222222222,
|
246 |
"num_of_instances": 90
|
247 |
},
|
248 |
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.9333333333333333,
|
250 |
+
"accuracy_ci_low": 0.8666666666666667,
|
251 |
+
"accuracy_ci_high": 0.9777777777777777,
|
252 |
"score_name": "accuracy",
|
253 |
+
"score": 0.9333333333333333,
|
254 |
+
"score_ci_high": 0.9777777777777777,
|
255 |
+
"score_ci_low": 0.8666666666666667,
|
256 |
"num_of_instances": 90
|
257 |
},
|
258 |
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.7666666666666667,
|
260 |
+
"accuracy_ci_low": 0.6720698151047421,
|
261 |
"accuracy_ci_high": 0.8444444444444444,
|
262 |
"score_name": "accuracy",
|
263 |
+
"score": 0.7666666666666667,
|
264 |
"score_ci_high": 0.8444444444444444,
|
265 |
+
"score_ci_low": 0.6720698151047421,
|
266 |
"num_of_instances": 90
|
267 |
},
|
268 |
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.6333333333333333,
|
270 |
+
"accuracy_ci_low": 0.5333333333333333,
|
271 |
+
"accuracy_ci_high": 0.7283280971833935,
|
272 |
"score_name": "accuracy",
|
273 |
+
"score": 0.6333333333333333,
|
274 |
+
"score_ci_high": 0.7283280971833935,
|
275 |
+
"score_ci_low": 0.5333333333333333,
|
276 |
"num_of_instances": 90
|
277 |
},
|
278 |
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.7666666666666667,
|
280 |
+
"accuracy_ci_low": 0.6666666666666666,
|
281 |
+
"accuracy_ci_high": 0.8444444444444444,
|
282 |
"score_name": "accuracy",
|
283 |
+
"score": 0.7666666666666667,
|
284 |
+
"score_ci_high": 0.8444444444444444,
|
285 |
+
"score_ci_low": 0.6666666666666666,
|
286 |
"num_of_instances": 90
|
287 |
},
|
288 |
+
"score": 0.7515151515151515,
|
289 |
"score_name": "subsets_mean",
|
290 |
"num_of_instances": 990
|
291 |
},
|
|
|
303 |
"entity_extraction": {
|
304 |
"universal_ner_en_ewt": {
|
305 |
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.5102639296187683,
|
307 |
+
"f1_Organization": 0.3381294964028777,
|
308 |
+
"f1_Location": 0.35652173913043483,
|
309 |
+
"f1_macro": 0.40163838838402693,
|
310 |
+
"recall_macro": 0.3240210323686792,
|
311 |
+
"precision_macro": 0.530656067251462,
|
312 |
+
"in_classes_support": 0.5625,
|
313 |
+
"f1_micro": 0.31789282470481384,
|
314 |
+
"recall_micro": 0.3333333333333333,
|
315 |
+
"precision_micro": 0.3038194444444444,
|
316 |
+
"score": 0.31789282470481384,
|
317 |
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.26482961534023236,
|
319 |
+
"score_ci_high": 0.37029988780714157,
|
320 |
+
"f1_micro_ci_low": 0.26482961534023236,
|
321 |
+
"f1_micro_ci_high": 0.37029988780714157
|
322 |
},
|
323 |
+
"score": 0.31789282470481384,
|
324 |
"score_name": "subsets_mean",
|
325 |
"num_of_instances": 1000
|
326 |
},
|
327 |
"knowledge": {
|
328 |
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.5211267605633803,
|
330 |
+
"accuracy_ci_low": 0.4084507042253521,
|
331 |
+
"accuracy_ci_high": 0.6338028169014085,
|
332 |
"score_name": "accuracy",
|
333 |
+
"score": 0.5211267605633803,
|
334 |
+
"score_ci_high": 0.6338028169014085,
|
335 |
+
"score_ci_low": 0.4084507042253521,
|
336 |
"num_of_instances": 71
|
337 |
},
|
338 |
"mmlu_pro_business": {
|
339 |
"accuracy": 0.19718309859154928,
|
340 |
"accuracy_ci_low": 0.11267605633802817,
|
341 |
+
"accuracy_ci_high": 0.29577464788732394,
|
342 |
"score_name": "accuracy",
|
343 |
"score": 0.19718309859154928,
|
344 |
+
"score_ci_high": 0.29577464788732394,
|
345 |
"score_ci_low": 0.11267605633802817,
|
346 |
"num_of_instances": 71
|
347 |
},
|
348 |
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.23943661971830985,
|
350 |
+
"accuracy_ci_low": 0.15492957746478872,
|
351 |
+
"accuracy_ci_high": 0.3380281690140845,
|
352 |
"score_name": "accuracy",
|
353 |
+
"score": 0.23943661971830985,
|
354 |
+
"score_ci_high": 0.3380281690140845,
|
355 |
+
"score_ci_low": 0.15492957746478872,
|
356 |
"num_of_instances": 71
|
357 |
},
|
358 |
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.43661971830985913,
|
360 |
+
"accuracy_ci_low": 0.323943661971831,
|
361 |
+
"accuracy_ci_high": 0.5492957746478874,
|
362 |
+
"score_name": "accuracy",
|
363 |
+
"score": 0.43661971830985913,
|
364 |
+
"score_ci_high": 0.5492957746478874,
|
365 |
+
"score_ci_low": 0.323943661971831,
|
366 |
+
"num_of_instances": 71
|
367 |
+
},
|
368 |
+
"mmlu_pro_economics": {
|
369 |
"accuracy": 0.38028169014084506,
|
370 |
"accuracy_ci_low": 0.2676056338028169,
|
371 |
"accuracy_ci_high": 0.49295774647887325,
|
|
|
375 |
"score_ci_low": 0.2676056338028169,
|
376 |
"num_of_instances": 71
|
377 |
},
|
|
378 |
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.2535211267605634,
|
380 |
+
"accuracy_ci_low": 0.16901408450704225,
|
381 |
+
"accuracy_ci_high": 0.36048330202820134,
|
382 |
"score_name": "accuracy",
|
383 |
+
"score": 0.2535211267605634,
|
384 |
+
"score_ci_high": 0.36048330202820134,
|
385 |
+
"score_ci_low": 0.16901408450704225,
|
386 |
"num_of_instances": 71
|
387 |
},
|
388 |
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.36619718309859156,
|
390 |
+
"accuracy_ci_low": 0.2535211267605634,
|
391 |
+
"accuracy_ci_high": 0.4788732394366197,
|
392 |
"score_name": "accuracy",
|
393 |
+
"score": 0.36619718309859156,
|
394 |
+
"score_ci_high": 0.4788732394366197,
|
395 |
+
"score_ci_low": 0.2535211267605634,
|
396 |
"num_of_instances": 71
|
397 |
},
|
398 |
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.36619718309859156,
|
400 |
+
"accuracy_ci_low": 0.2535211267605634,
|
401 |
"accuracy_ci_high": 0.4788732394366197,
|
402 |
"score_name": "accuracy",
|
403 |
+
"score": 0.36619718309859156,
|
404 |
"score_ci_high": 0.4788732394366197,
|
405 |
+
"score_ci_low": 0.2535211267605634,
|
406 |
"num_of_instances": 71
|
407 |
},
|
408 |
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.36619718309859156,
|
410 |
+
"accuracy_ci_low": 0.2535211267605634,
|
411 |
+
"accuracy_ci_high": 0.4788732394366197,
|
412 |
"score_name": "accuracy",
|
413 |
+
"score": 0.36619718309859156,
|
414 |
+
"score_ci_high": 0.4788732394366197,
|
415 |
+
"score_ci_low": 0.2535211267605634,
|
416 |
"num_of_instances": 71
|
417 |
},
|
418 |
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.1267605633802817,
|
420 |
+
"accuracy_ci_low": 0.056338028169014086,
|
421 |
+
"accuracy_ci_high": 0.22535211267605634,
|
422 |
"score_name": "accuracy",
|
423 |
+
"score": 0.1267605633802817,
|
424 |
+
"score_ci_high": 0.22535211267605634,
|
425 |
+
"score_ci_low": 0.056338028169014086,
|
426 |
"num_of_instances": 71
|
427 |
},
|
428 |
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.22535211267605634,
|
430 |
+
"accuracy_ci_low": 0.14084507042253522,
|
431 |
+
"accuracy_ci_high": 0.323943661971831,
|
432 |
"score_name": "accuracy",
|
433 |
+
"score": 0.22535211267605634,
|
434 |
+
"score_ci_high": 0.323943661971831,
|
435 |
+
"score_ci_low": 0.14084507042253522,
|
436 |
"num_of_instances": 71
|
437 |
},
|
438 |
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.4084507042253521,
|
440 |
+
"accuracy_ci_low": 0.30985915492957744,
|
441 |
+
"accuracy_ci_high": 0.5352112676056338,
|
442 |
"score_name": "accuracy",
|
443 |
+
"score": 0.4084507042253521,
|
444 |
+
"score_ci_high": 0.5352112676056338,
|
445 |
+
"score_ci_low": 0.30985915492957744,
|
446 |
"num_of_instances": 71
|
447 |
},
|
448 |
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.29577464788732394,
|
450 |
+
"accuracy_ci_low": 0.19718309859154928,
|
451 |
+
"accuracy_ci_high": 0.4084507042253521,
|
452 |
"score_name": "accuracy",
|
453 |
+
"score": 0.29577464788732394,
|
454 |
+
"score_ci_high": 0.4084507042253521,
|
455 |
+
"score_ci_low": 0.19718309859154928,
|
456 |
"num_of_instances": 71
|
457 |
},
|
458 |
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.5352112676056338,
|
460 |
+
"accuracy_ci_low": 0.4084507042253521,
|
461 |
+
"accuracy_ci_high": 0.647887323943662,
|
462 |
"score_name": "accuracy",
|
463 |
+
"score": 0.5352112676056338,
|
464 |
+
"score_ci_high": 0.647887323943662,
|
465 |
+
"score_ci_low": 0.4084507042253521,
|
466 |
"num_of_instances": 71
|
467 |
},
|
468 |
+
"score": 0.33702213279678067,
|
469 |
"score_name": "subsets_mean",
|
470 |
"num_of_instances": 994
|
471 |
},
|
472 |
"legal": {
|
473 |
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.2696554985630616,
|
475 |
+
"f1_suggestive": 0.2727272727272727,
|
476 |
+
"f1_arbitrary": 0.43137254901960786,
|
477 |
+
"f1_generic": 0.11764705882352941,
|
478 |
+
"f1_fanciful": 0.2,
|
479 |
+
"f1_descriptive": 0.32653061224489793,
|
480 |
+
"f1_macro_ci_low": 0.18689773936584586,
|
481 |
+
"f1_macro_ci_high": 0.37923074712363225,
|
482 |
"score_name": "f1_micro",
|
483 |
+
"score": 0.31446540880503143,
|
484 |
+
"score_ci_high": 0.42038216560509556,
|
485 |
+
"score_ci_low": 0.21656050955414013,
|
486 |
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.29411764705882354,
|
488 |
+
"accuracy_ci_low": 0.2,
|
489 |
+
"accuracy_ci_high": 0.4,
|
490 |
+
"f1_micro": 0.31446540880503143,
|
491 |
+
"f1_micro_ci_low": 0.21656050955414013,
|
492 |
+
"f1_micro_ci_high": 0.42038216560509556
|
493 |
},
|
494 |
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.5388253241800153,
|
496 |
+
"f1_no": 0.7298245614035088,
|
497 |
+
"f1_yes": 0.34782608695652173,
|
498 |
+
"f1_macro_ci_low": 0.47191290375757455,
|
499 |
+
"f1_macro_ci_high": 0.6216206779092042,
|
500 |
"score_name": "f1_micro",
|
501 |
+
"score": 0.636604774535809,
|
502 |
+
"score_ci_high": 0.6985040092826637,
|
503 |
+
"score_ci_low": 0.5691144311757004,
|
504 |
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.6,
|
506 |
+
"accuracy_ci_low": 0.53,
|
507 |
+
"accuracy_ci_high": 0.665,
|
508 |
+
"f1_micro": 0.636604774535809,
|
509 |
+
"f1_micro_ci_low": 0.5691144311757004,
|
510 |
+
"f1_micro_ci_high": 0.6985040092826637
|
511 |
},
|
512 |
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.2947177227927682,
|
514 |
+
"f1_conclusion": 0.2127659574468085,
|
515 |
+
"f1_decree": 0.23529411764705882,
|
516 |
+
"f1_issue": 0.2711864406779661,
|
517 |
+
"f1_rule": 0.42857142857142855,
|
518 |
+
"f1_analysis": 0.4444444444444444,
|
519 |
"f1_facts": 0.21621621621621623,
|
520 |
+
"f1_procedural history": 0.2545454545454545,
|
521 |
+
"f1_macro_ci_low": 0.23794703715833648,
|
522 |
+
"f1_macro_ci_high": 0.36665623309642204,
|
|
|
523 |
"score_name": "f1_micro",
|
524 |
+
"score": 0.30409356725146197,
|
525 |
+
"score_ci_high": 0.3711587285161421,
|
526 |
+
"score_ci_low": 0.23855266549315363,
|
527 |
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.26,
|
529 |
+
"accuracy_ci_low": 0.2,
|
530 |
+
"accuracy_ci_high": 0.32,
|
531 |
+
"f1_micro": 0.30409356725146197,
|
532 |
+
"f1_micro_ci_low": 0.23855266549315363,
|
533 |
+
"f1_micro_ci_high": 0.3711587285161421
|
534 |
},
|
535 |
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.49092908191313905,
|
537 |
+
"f1_yes": 0.5700934579439252,
|
538 |
+
"f1_no": 0.4117647058823529,
|
539 |
+
"f1_macro_ci_low": 0.4178065856787266,
|
540 |
+
"f1_macro_ci_high": 0.5601203681213927,
|
541 |
"score_name": "f1_micro",
|
542 |
+
"score": 0.5,
|
543 |
+
"score_ci_high": 0.566970455032283,
|
544 |
+
"score_ci_low": 0.42555336134062,
|
545 |
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.48,
|
547 |
+
"accuracy_ci_low": 0.405,
|
548 |
+
"accuracy_ci_high": 0.545,
|
549 |
+
"f1_micro": 0.5,
|
550 |
+
"f1_micro_ci_low": 0.42555336134062,
|
551 |
+
"f1_micro_ci_high": 0.566970455032283
|
552 |
},
|
553 |
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.8315276273022751,
|
555 |
+
"f1_yes": 0.8169014084507042,
|
556 |
"f1_no": 0.8461538461538461,
|
557 |
+
"f1_macro_ci_low": 0.7549023325928579,
|
558 |
+
"f1_macro_ci_high": 0.890440353074843,
|
559 |
"score_name": "f1_micro",
|
560 |
+
"score": 0.8322147651006712,
|
561 |
+
"score_ci_high": 0.8903225806451613,
|
562 |
+
"score_ci_low": 0.7554946760306516,
|
563 |
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.7294117647058823,
|
565 |
+
"accuracy_ci_low": 0.6352941176470588,
|
566 |
+
"accuracy_ci_high": 0.8117647058823529,
|
567 |
+
"f1_micro": 0.8322147651006712,
|
568 |
+
"f1_micro_ci_low": 0.7554946760306516,
|
569 |
+
"f1_micro_ci_high": 0.8903225806451613
|
570 |
},
|
571 |
+
"score": 0.5174757031385947,
|
572 |
"score_name": "subsets_mean",
|
573 |
"num_of_instances": 770
|
574 |
},
|
575 |
"news_classification": {
|
576 |
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.42272407811143237,
|
578 |
+
"f1_cars": 0.6078431372549019,
|
579 |
+
"f1_pc hardware": 0.34080717488789236,
|
580 |
+
"f1_windows x": 0.029850746268656716,
|
581 |
+
"f1_computer graphics": 0.4367816091954023,
|
582 |
+
"f1_atheism": 0.21739130434782608,
|
583 |
+
"f1_religion": 0.23300970873786409,
|
584 |
+
"f1_medicine": 0.8641975308641975,
|
585 |
+
"f1_christianity": 0.1694915254237288,
|
586 |
"f1_microsoft windows": 0.39436619718309857,
|
587 |
+
"f1_middle east": 0.43037974683544306,
|
588 |
+
"f1_politics": 0.291970802919708,
|
589 |
+
"f1_motorcycles": 0.43902439024390244,
|
590 |
+
"f1_mac hardware": 0.09090909090909091,
|
591 |
+
"f1_for sale": 0.625,
|
592 |
+
"f1_guns": 0.18181818181818182,
|
593 |
"f1_space": 0.5569620253164557,
|
594 |
+
"f1_cryptography": 0.4482758620689655,
|
595 |
+
"f1_baseball": 0.8545454545454545,
|
596 |
"f1_hockey": 0.859504132231405,
|
597 |
+
"f1_electronics": 0.38235294117647056,
|
598 |
+
"f1_macro_ci_low": 0.3988534736802405,
|
599 |
+
"f1_macro_ci_high": 0.4557473948035634,
|
600 |
"score_name": "f1_micro",
|
601 |
+
"score": 0.44368600682593856,
|
602 |
+
"score_ci_high": 0.47444463958776134,
|
603 |
+
"score_ci_low": 0.4135801299006492,
|
604 |
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.39,
|
606 |
+
"accuracy_ci_low": 0.36,
|
607 |
+
"accuracy_ci_high": 0.418,
|
608 |
+
"f1_micro": 0.44368600682593856,
|
609 |
+
"f1_micro_ci_low": 0.4135801299006492,
|
610 |
+
"f1_micro_ci_high": 0.47444463958776134
|
611 |
},
|
612 |
+
"score": 0.44368600682593856,
|
613 |
"score_name": "subsets_mean",
|
614 |
"num_of_instances": 1000
|
615 |
},
|
616 |
"product_help": {
|
617 |
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.6409217061975553,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9205673758865248,
|
620 |
+
"f1_credit card or prepaid card": 0.6363636363636364,
|
621 |
+
"f1_checking or savings account": 0.7766990291262136,
|
622 |
+
"f1_mortgage": 0.7777777777777778,
|
623 |
+
"f1_debt collection": 0.6222222222222222,
|
624 |
+
"f1_student loan": 0.88,
|
625 |
+
"f1_payday loan or title loan or personal loan": 0.35294117647058826,
|
626 |
+
"f1_vehicle loan or lease": 0.5517241379310345,
|
627 |
+
"f1_money transfer or virtual currency or money service": 0.25,
|
628 |
+
"f1_macro_ci_low": 0.5901810957914123,
|
629 |
+
"f1_macro_ci_high": 0.7054871287846897,
|
630 |
"score_name": "f1_micro",
|
631 |
+
"score": 0.8491446345256609,
|
632 |
+
"score_ci_high": 0.8701030927835052,
|
633 |
+
"score_ci_low": 0.8291666666666667,
|
634 |
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.819,
|
636 |
+
"accuracy_ci_low": 0.796,
|
637 |
+
"accuracy_ci_high": 0.843,
|
638 |
+
"f1_micro": 0.8491446345256609,
|
639 |
+
"f1_micro_ci_low": 0.8291666666666667,
|
640 |
+
"f1_micro_ci_high": 0.8701030927835052
|
641 |
},
|
642 |
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.7132677588870594,
|
644 |
+
"f1_mortgages and loans": 0.7771428571428571,
|
645 |
+
"f1_credit card": 0.7023809523809523,
|
646 |
+
"f1_debt collection": 0.6854460093896714,
|
647 |
+
"f1_credit reporting": 0.7601476014760148,
|
648 |
+
"f1_retail banking": 0.6412213740458015,
|
649 |
+
"f1_macro_ci_low": 0.672279823384184,
|
650 |
+
"f1_macro_ci_high": 0.7539657340394554,
|
651 |
"score_name": "f1_micro",
|
652 |
+
"score": 0.7202505219206681,
|
653 |
+
"score_ci_high": 0.7576596149340853,
|
654 |
+
"score_ci_low": 0.6805865270375967,
|
655 |
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.69,
|
657 |
+
"accuracy_ci_low": 0.65,
|
658 |
+
"accuracy_ci_high": 0.73,
|
659 |
+
"f1_micro": 0.7202505219206681,
|
660 |
+
"f1_micro_ci_low": 0.6805865270375967,
|
661 |
+
"f1_micro_ci_high": 0.7576596149340853
|
662 |
},
|
663 |
+
"score": 0.7846975782231644,
|
664 |
"score_name": "subsets_mean",
|
665 |
"num_of_instances": 1500
|
666 |
},
|
667 |
"qa_finance": {
|
668 |
"fin_qa": {
|
669 |
"num_of_instances": 1000,
|
670 |
+
"execution_accuracy": 0.074,
|
671 |
+
"program_accuracy": 0.085,
|
672 |
+
"score": 0.085,
|
673 |
"score_name": "program_accuracy",
|
674 |
+
"execution_accuracy_ci_low": 0.058,
|
675 |
+
"execution_accuracy_ci_high": 0.091,
|
676 |
+
"program_accuracy_ci_low": 0.068,
|
677 |
+
"program_accuracy_ci_high": 0.102,
|
678 |
+
"score_ci_low": 0.068,
|
679 |
+
"score_ci_high": 0.102
|
680 |
},
|
681 |
+
"score": 0.085,
|
682 |
"score_name": "subsets_mean",
|
683 |
"num_of_instances": 1000
|
684 |
},
|
685 |
"rag_general": {
|
686 |
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.30022844870852566,
|
688 |
+
"recall": 0.5840193774846996,
|
689 |
+
"f1": 0.3357215148632638,
|
690 |
+
"precision_ci_low": 0.28030967471726836,
|
691 |
+
"precision_ci_high": 0.32121747414474766,
|
692 |
+
"recall_ci_low": 0.565861900260428,
|
693 |
+
"recall_ci_high": 0.59971992711831,
|
694 |
+
"f1_ci_low": 0.3175124739653954,
|
695 |
+
"f1_ci_high": 0.35218969004250933,
|
696 |
"score_name": "f1",
|
697 |
+
"score": 0.3357215148632638,
|
698 |
+
"score_ci_high": 0.35218969004250933,
|
699 |
+
"score_ci_low": 0.3175124739653954,
|
700 |
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6000729685028394,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6848867724835873,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5528717580189307,
|
704 |
+
"faithfullness_f1_token_overlap": 0.33597285355913525,
|
705 |
+
"faithfullness_recall_token_overlap": 0.27114762054953845,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5569731492695155,
|
707 |
+
"correctness_f1_token_overlap": 0.3357215148632638,
|
708 |
+
"correctness_recall_token_overlap": 0.5840193774846996,
|
709 |
+
"correctness_precision_token_overlap": 0.30022844870852566
|
710 |
},
|
711 |
+
"score": 0.3357215148632638,
|
712 |
"score_name": "subsets_mean",
|
713 |
"num_of_instances": 600
|
714 |
},
|
715 |
"reasoning": {
|
716 |
"hellaswag": {
|
717 |
+
"accuracy": 0.455,
|
718 |
+
"accuracy_ci_low": 0.423,
|
719 |
+
"accuracy_ci_high": 0.485,
|
720 |
"score_name": "accuracy",
|
721 |
+
"score": 0.455,
|
722 |
+
"score_ci_high": 0.485,
|
723 |
+
"score_ci_low": 0.423,
|
724 |
"num_of_instances": 1000
|
725 |
},
|
726 |
"openbook_qa": {
|
727 |
+
"accuracy": 0.782,
|
728 |
+
"accuracy_ci_low": 0.744,
|
729 |
+
"accuracy_ci_high": 0.818,
|
730 |
"score_name": "accuracy",
|
731 |
+
"score": 0.782,
|
732 |
+
"score_ci_high": 0.818,
|
733 |
+
"score_ci_low": 0.744,
|
734 |
"num_of_instances": 500
|
735 |
},
|
736 |
+
"score": 0.6185,
|
737 |
"score_name": "subsets_mean",
|
738 |
"num_of_instances": 1500
|
739 |
},
|
|
|
743 |
"severity_high": 0.0,
|
744 |
"severity_medium": 0.0,
|
745 |
"severity_low": 100.0,
|
746 |
+
"category_harmful_info": 0.8662501443103547,
|
747 |
+
"category_deception": 0.8769584958149929,
|
748 |
+
"category_explicit_content": 0.8376609406085929,
|
749 |
+
"category_violence": 0.8456707175466277,
|
750 |
+
"category_discrimination": 0.8991804392564864,
|
751 |
+
"category_substance_abuse": 0.8935730718904071,
|
752 |
+
"category_pii": 0.8467056638112774,
|
753 |
+
"safety": 0.860165383811626,
|
754 |
+
"safety_ci_low": 0.8426579493684276,
|
755 |
+
"safety_ci_high": 0.8778207056405374,
|
756 |
"score_name": "safety",
|
757 |
+
"score": 0.860165383811626,
|
758 |
+
"score_ci_high": 0.8778207056405374,
|
759 |
+
"score_ci_low": 0.8426579493684276,
|
760 |
"num_of_instances": 100
|
761 |
},
|
762 |
+
"score": 0.860165383811626,
|
763 |
"score_name": "subsets_mean",
|
764 |
"num_of_instances": 100
|
765 |
},
|
766 |
"summarization": {
|
767 |
"billsum_document_filtered_to_6000_chars": {
|
768 |
"num_of_instances": 528,
|
769 |
+
"rougeL": 0.28749112328021914,
|
770 |
+
"score": 0.28749112328021914,
|
771 |
"score_name": "rougeL",
|
772 |
+
"rougeLsum": 0.3503820014906059,
|
773 |
+
"rouge2": 0.20109007723824623,
|
774 |
+
"rouge1": 0.4200585739584912,
|
775 |
+
"rougeL_ci_low": 0.2804794753326623,
|
776 |
+
"rougeL_ci_high": 0.29447838537921134,
|
777 |
+
"score_ci_low": 0.2804794753326623,
|
778 |
+
"score_ci_high": 0.29447838537921134,
|
779 |
+
"rougeLsum_ci_low": 0.341921573094731,
|
780 |
+
"rougeLsum_ci_high": 0.35863585426859207,
|
781 |
+
"rouge2_ci_low": 0.19416899053732958,
|
782 |
+
"rouge2_ci_high": 0.20872476773642967,
|
783 |
+
"rouge1_ci_low": 0.41035793857223635,
|
784 |
+
"rouge1_ci_high": 0.4281932704537228
|
785 |
},
|
786 |
"tldr_document_filtered_to_6000_chars": {
|
787 |
"num_of_instances": 1000,
|
788 |
+
"rougeL": 0.07979202357473647,
|
789 |
+
"score": 0.07979202357473647,
|
790 |
"score_name": "rougeL",
|
791 |
+
"rougeLsum": 0.0922932399263996,
|
792 |
+
"rouge2": 0.015117853576507847,
|
793 |
+
"rouge1": 0.11247814548815566,
|
794 |
+
"rougeL_ci_low": 0.0764789144644062,
|
795 |
+
"rougeL_ci_high": 0.08304032568245756,
|
796 |
+
"score_ci_low": 0.0764789144644062,
|
797 |
+
"score_ci_high": 0.08304032568245756,
|
798 |
+
"rougeLsum_ci_low": 0.0880597944044916,
|
799 |
+
"rougeLsum_ci_high": 0.09606464509440052,
|
800 |
+
"rouge2_ci_low": 0.01362250797390663,
|
801 |
+
"rouge2_ci_high": 0.0168799885499115,
|
802 |
+
"rouge1_ci_low": 0.10733708561154955,
|
803 |
+
"rouge1_ci_high": 0.11723898467910755
|
804 |
},
|
805 |
+
"score": 0.1836415734274778,
|
806 |
"score_name": "subsets_mean",
|
807 |
"num_of_instances": 1528
|
808 |
},
|
|
|
810 |
"mt_flores_101_ara_eng": {
|
811 |
"num_of_instances": 66,
|
812 |
"counts": [
|
813 |
+
1154,
|
814 |
+
637,
|
815 |
+
382,
|
816 |
+
237
|
817 |
],
|
818 |
"totals": [
|
819 |
+
3013,
|
820 |
+
2947,
|
821 |
+
2881,
|
822 |
+
2815
|
823 |
],
|
824 |
"precisions": [
|
825 |
+
0.383006969797544,
|
826 |
+
0.2161520190023753,
|
827 |
+
0.13259284970496357,
|
828 |
+
0.08419182948490231
|
829 |
],
|
830 |
"bp": 1.0,
|
831 |
+
"sys_len": 3013,
|
832 |
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.17435684678472682,
|
834 |
+
"score": 0.17435684678472682,
|
835 |
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.12709535962365245,
|
837 |
+
"score_ci_high": 0.21064271607309265,
|
838 |
+
"sacrebleu_ci_low": 0.12709535962365245,
|
839 |
+
"sacrebleu_ci_high": 0.21064271607309265
|
840 |
},
|
841 |
"mt_flores_101_deu_eng": {
|
842 |
"num_of_instances": 66,
|
843 |
"counts": [
|
844 |
+
1215,
|
845 |
+
695,
|
846 |
+
422,
|
847 |
+
256
|
848 |
],
|
849 |
"totals": [
|
850 |
+
3433,
|
851 |
+
3367,
|
852 |
+
3301,
|
853 |
+
3235
|
854 |
],
|
855 |
"precisions": [
|
856 |
+
0.35391785610253423,
|
857 |
+
0.20641520641520641,
|
858 |
+
0.12784004847016056,
|
859 |
+
0.07913446676970634
|
860 |
],
|
861 |
"bp": 1.0,
|
862 |
+
"sys_len": 3433,
|
863 |
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.16488046075977367,
|
865 |
+
"score": 0.16488046075977367,
|
866 |
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.12825986690370522,
|
868 |
+
"score_ci_high": 0.20812836267228596,
|
869 |
+
"sacrebleu_ci_low": 0.12825986690370522,
|
870 |
+
"sacrebleu_ci_high": 0.20812836267228596
|
871 |
},
|
872 |
"mt_flores_101_eng_ara": {
|
873 |
"num_of_instances": 66,
|
874 |
"counts": [
|
875 |
+
726,
|
876 |
+
321,
|
877 |
+
159,
|
878 |
+
82
|
879 |
],
|
880 |
"totals": [
|
881 |
+
2297,
|
882 |
+
2231,
|
883 |
+
2165,
|
884 |
+
2099
|
885 |
],
|
886 |
"precisions": [
|
887 |
+
0.3160644318676535,
|
888 |
+
0.14388166741371583,
|
889 |
+
0.07344110854503465,
|
890 |
+
0.03906622201048118
|
891 |
],
|
892 |
"bp": 1.0,
|
893 |
+
"sys_len": 2297,
|
894 |
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.10687605905530678,
|
896 |
+
"score": 0.10687605905530678,
|
897 |
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.08639846348006232,
|
899 |
+
"score_ci_high": 0.13425269082562755,
|
900 |
+
"sacrebleu_ci_low": 0.08639846348006232,
|
901 |
+
"sacrebleu_ci_high": 0.13425269082562755
|
902 |
},
|
903 |
"mt_flores_101_eng_deu": {
|
904 |
"num_of_instances": 66,
|
905 |
"counts": [
|
906 |
+
1066,
|
907 |
+
564,
|
908 |
+
332,
|
909 |
+
194
|
910 |
],
|
911 |
"totals": [
|
912 |
+
2300,
|
913 |
+
2234,
|
914 |
+
2168,
|
915 |
+
2102
|
916 |
],
|
917 |
"precisions": [
|
918 |
+
0.46347826086956523,
|
919 |
+
0.252461951656222,
|
920 |
+
0.15313653136531366,
|
921 |
+
0.0922930542340628
|
922 |
],
|
923 |
"bp": 1.0,
|
924 |
+
"sys_len": 2300,
|
925 |
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.2016593123773307,
|
927 |
+
"score": 0.2016593123773307,
|
928 |
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.177292145733578,
|
930 |
+
"score_ci_high": 0.24439707428713803,
|
931 |
+
"sacrebleu_ci_low": 0.177292145733578,
|
932 |
+
"sacrebleu_ci_high": 0.24439707428713803
|
933 |
},
|
934 |
"mt_flores_101_eng_fra": {
|
935 |
"num_of_instances": 66,
|
936 |
"counts": [
|
937 |
+
1409,
|
938 |
+
950,
|
939 |
+
692,
|
940 |
+
517
|
941 |
],
|
942 |
"totals": [
|
943 |
+
3275,
|
944 |
+
3209,
|
945 |
+
3143,
|
946 |
+
3077
|
947 |
],
|
948 |
"precisions": [
|
949 |
+
0.4302290076335878,
|
950 |
+
0.2960423808039888,
|
951 |
+
0.2201718103722558,
|
952 |
+
0.168020799480013
|
953 |
],
|
954 |
"bp": 1.0,
|
955 |
+
"sys_len": 3275,
|
956 |
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.2619959538476516,
|
958 |
+
"score": 0.2619959538476516,
|
959 |
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.21071110880640612,
|
961 |
+
"score_ci_high": 0.30599931494111227,
|
962 |
+
"sacrebleu_ci_low": 0.21071110880640612,
|
963 |
+
"sacrebleu_ci_high": 0.30599931494111227
|
964 |
},
|
965 |
"mt_flores_101_eng_kor": {
|
966 |
"num_of_instances": 66,
|
967 |
"counts": [
|
968 |
+
1096,
|
969 |
+
465,
|
970 |
+
233,
|
971 |
+
132
|
972 |
],
|
973 |
"totals": [
|
974 |
+
3883,
|
975 |
+
3817,
|
976 |
+
3751,
|
977 |
+
3685
|
978 |
],
|
979 |
"precisions": [
|
980 |
+
0.28225598763842386,
|
981 |
+
0.12182342153523709,
|
982 |
+
0.0621167688616369,
|
983 |
+
0.03582089552238806
|
984 |
],
|
985 |
"bp": 1.0,
|
986 |
+
"sys_len": 3883,
|
987 |
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.09352545142421302,
|
989 |
+
"score": 0.09352545142421302,
|
990 |
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.0763987126727994,
|
992 |
+
"score_ci_high": 0.11617390981932266,
|
993 |
+
"sacrebleu_ci_low": 0.0763987126727994,
|
994 |
+
"sacrebleu_ci_high": 0.11617390981932266
|
995 |
},
|
996 |
"mt_flores_101_eng_por": {
|
997 |
"num_of_instances": 66,
|
998 |
"counts": [
|
999 |
+
1328,
|
1000 |
+
850,
|
1001 |
+
588,
|
1002 |
+
412
|
1003 |
],
|
1004 |
"totals": [
|
1005 |
+
3030,
|
1006 |
+
2964,
|
1007 |
+
2898,
|
1008 |
+
2832
|
1009 |
],
|
1010 |
"precisions": [
|
1011 |
+
0.4382838283828383,
|
1012 |
+
0.286774628879892,
|
1013 |
+
0.2028985507246377,
|
1014 |
+
0.14548022598870058
|
1015 |
],
|
1016 |
"bp": 1.0,
|
1017 |
+
"sys_len": 3030,
|
1018 |
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.2467997817029595,
|
1020 |
+
"score": 0.2467997817029595,
|
1021 |
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.193392163449652,
|
1023 |
+
"score_ci_high": 0.2974642241791255,
|
1024 |
+
"sacrebleu_ci_low": 0.193392163449652,
|
1025 |
+
"sacrebleu_ci_high": 0.2974642241791255
|
1026 |
},
|
1027 |
"mt_flores_101_eng_ron": {
|
1028 |
"num_of_instances": 66,
|
1029 |
"counts": [
|
1030 |
+
930,
|
1031 |
+
400,
|
1032 |
+
214,
|
1033 |
+
123
|
1034 |
],
|
1035 |
"totals": [
|
1036 |
+
2961,
|
1037 |
+
2895,
|
1038 |
+
2829,
|
1039 |
+
2763
|
1040 |
],
|
1041 |
"precisions": [
|
1042 |
+
0.3140830800405269,
|
1043 |
+
0.1381692573402418,
|
1044 |
+
0.07564510427712973,
|
1045 |
+
0.04451682953311618
|
1046 |
],
|
1047 |
"bp": 1.0,
|
1048 |
+
"sys_len": 2961,
|
1049 |
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.1099487393546487,
|
1051 |
+
"score": 0.1099487393546487,
|
1052 |
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.08284384518142485,
|
1054 |
+
"score_ci_high": 0.13880651312628609,
|
1055 |
+
"sacrebleu_ci_low": 0.08284384518142485,
|
1056 |
+
"sacrebleu_ci_high": 0.13880651312628609
|
1057 |
},
|
1058 |
"mt_flores_101_eng_spa": {
|
1059 |
"num_of_instances": 66,
|
1060 |
"counts": [
|
1061 |
+
1217,
|
1062 |
+
624,
|
1063 |
+
347,
|
1064 |
+
198
|
1065 |
],
|
1066 |
"totals": [
|
1067 |
+
3045,
|
1068 |
+
2979,
|
1069 |
+
2913,
|
1070 |
+
2847
|
1071 |
],
|
1072 |
"precisions": [
|
1073 |
+
0.399671592775041,
|
1074 |
+
0.20946626384692849,
|
1075 |
+
0.11912118091314795,
|
1076 |
+
0.06954689146469968
|
1077 |
],
|
1078 |
"bp": 1.0,
|
1079 |
+
"sys_len": 3045,
|
1080 |
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.1622822499255264,
|
1082 |
+
"score": 0.1622822499255264,
|
1083 |
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.13321857221475644,
|
1085 |
+
"score_ci_high": 0.19390301665624113,
|
1086 |
+
"sacrebleu_ci_low": 0.13321857221475644,
|
1087 |
+
"sacrebleu_ci_high": 0.19390301665624113
|
1088 |
},
|
1089 |
"mt_flores_101_fra_eng": {
|
1090 |
"num_of_instances": 66,
|
1091 |
"counts": [
|
1092 |
+
1236,
|
1093 |
+
735,
|
1094 |
+
470,
|
1095 |
+
308
|
1096 |
],
|
1097 |
"totals": [
|
1098 |
+
2952,
|
1099 |
+
2886,
|
1100 |
+
2820,
|
1101 |
+
2754
|
1102 |
],
|
1103 |
"precisions": [
|
1104 |
+
0.4186991869918699,
|
1105 |
+
0.25467775467775466,
|
1106 |
+
0.16666666666666669,
|
1107 |
+
0.11183732752360204
|
1108 |
],
|
1109 |
"bp": 1.0,
|
1110 |
+
"sys_len": 2952,
|
1111 |
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.2111456628673961,
|
1113 |
+
"score": 0.2111456628673961,
|
1114 |
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.1728340034401921,
|
1116 |
+
"score_ci_high": 0.26908287892628974,
|
1117 |
+
"sacrebleu_ci_low": 0.1728340034401921,
|
1118 |
+
"sacrebleu_ci_high": 0.26908287892628974
|
1119 |
},
|
1120 |
"mt_flores_101_jpn_eng": {
|
1121 |
"num_of_instances": 66,
|
1122 |
"counts": [
|
1123 |
+
1018,
|
1124 |
+
437,
|
1125 |
+
232,
|
1126 |
+
128
|
1127 |
],
|
1128 |
"totals": [
|
1129 |
+
3130,
|
1130 |
+
3064,
|
1131 |
+
2998,
|
1132 |
+
2932
|
1133 |
],
|
1134 |
"precisions": [
|
1135 |
+
0.3252396166134185,
|
1136 |
+
0.14262402088772846,
|
1137 |
+
0.07738492328218813,
|
1138 |
+
0.04365620736698499
|
1139 |
],
|
1140 |
"bp": 1.0,
|
1141 |
+
"sys_len": 3130,
|
1142 |
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.11188570922324435,
|
1144 |
+
"score": 0.11188570922324435,
|
1145 |
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.09154049326122426,
|
1147 |
+
"score_ci_high": 0.13827539969992217,
|
1148 |
+
"sacrebleu_ci_low": 0.09154049326122426,
|
1149 |
+
"sacrebleu_ci_high": 0.13827539969992217
|
1150 |
},
|
1151 |
"mt_flores_101_kor_eng": {
|
1152 |
"num_of_instances": 66,
|
1153 |
"counts": [
|
1154 |
+
986,
|
1155 |
+
447,
|
1156 |
+
233,
|
1157 |
127
|
1158 |
],
|
1159 |
"totals": [
|
1160 |
+
3637,
|
1161 |
+
3571,
|
1162 |
+
3505,
|
1163 |
+
3439
|
1164 |
],
|
1165 |
"precisions": [
|
1166 |
+
0.27110255705251585,
|
1167 |
+
0.12517502100252031,
|
1168 |
+
0.06647646219686162,
|
1169 |
+
0.03692933992439663
|
1170 |
],
|
1171 |
"bp": 1.0,
|
1172 |
+
"sys_len": 3637,
|
1173 |
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.09553723823741646,
|
1175 |
+
"score": 0.09553723823741646,
|
1176 |
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.06933902828362079,
|
1178 |
+
"score_ci_high": 0.1273472328564688,
|
1179 |
+
"sacrebleu_ci_low": 0.06933902828362079,
|
1180 |
+
"sacrebleu_ci_high": 0.1273472328564688
|
1181 |
},
|
1182 |
"mt_flores_101_por_eng": {
|
1183 |
"num_of_instances": 66,
|
1184 |
"counts": [
|
1185 |
+
1286,
|
1186 |
+
834,
|
1187 |
+
587,
|
1188 |
+
419
|
1189 |
],
|
1190 |
"totals": [
|
1191 |
+
3404,
|
1192 |
+
3338,
|
1193 |
+
3272,
|
1194 |
+
3206
|
1195 |
],
|
1196 |
"precisions": [
|
1197 |
+
0.37779083431257343,
|
1198 |
+
0.24985020970641103,
|
1199 |
+
0.17940097799511,
|
1200 |
+
0.13069245165315035
|
1201 |
],
|
1202 |
"bp": 1.0,
|
1203 |
+
"sys_len": 3404,
|
1204 |
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.21689603438287544,
|
1206 |
+
"score": 0.21689603438287544,
|
1207 |
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.18174547190909165,
|
1209 |
+
"score_ci_high": 0.2734022486576191,
|
1210 |
+
"sacrebleu_ci_low": 0.18174547190909165,
|
1211 |
+
"sacrebleu_ci_high": 0.2734022486576191
|
1212 |
},
|
1213 |
"mt_flores_101_ron_eng": {
|
1214 |
"num_of_instances": 66,
|
1215 |
"counts": [
|
1216 |
+
1208,
|
1217 |
+
675,
|
1218 |
+
430,
|
1219 |
+
279
|
1220 |
],
|
1221 |
"totals": [
|
1222 |
+
3677,
|
1223 |
+
3611,
|
1224 |
+
3545,
|
1225 |
+
3479
|
1226 |
],
|
1227 |
"precisions": [
|
1228 |
+
0.32852869186837097,
|
1229 |
+
0.1869288285793409,
|
1230 |
+
0.12129760225669958,
|
1231 |
+
0.08019545846507617
|
1232 |
],
|
1233 |
"bp": 1.0,
|
1234 |
+
"sys_len": 3677,
|
1235 |
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.15633740352446387,
|
1237 |
+
"score": 0.15633740352446387,
|
1238 |
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.12255450743419968,
|
1240 |
+
"score_ci_high": 0.17971859902386644,
|
1241 |
+
"sacrebleu_ci_low": 0.12255450743419968,
|
1242 |
+
"sacrebleu_ci_high": 0.17971859902386644
|
1243 |
},
|
1244 |
"mt_flores_101_spa_eng": {
|
1245 |
"num_of_instances": 66,
|
1246 |
"counts": [
|
1247 |
+
1135,
|
1248 |
+
581,
|
1249 |
+
336,
|
1250 |
202
|
1251 |
],
|
1252 |
"totals": [
|
1253 |
+
3533,
|
1254 |
+
3467,
|
1255 |
+
3401,
|
1256 |
+
3335
|
1257 |
],
|
1258 |
"precisions": [
|
1259 |
+
0.3212567223322955,
|
1260 |
+
0.16758004038073263,
|
1261 |
+
0.09879447221405468,
|
1262 |
+
0.06056971514242879
|
1263 |
],
|
1264 |
"bp": 1.0,
|
1265 |
+
"sys_len": 3533,
|
1266 |
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.133972503470666,
|
1268 |
+
"score": 0.133972503470666,
|
1269 |
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.10251876459928583,
|
1271 |
+
"score_ci_high": 0.17481307519673603,
|
1272 |
+
"sacrebleu_ci_low": 0.10251876459928583,
|
1273 |
+
"sacrebleu_ci_high": 0.17481307519673603
|
1274 |
},
|
1275 |
+
"score": 0.1632066271292133,
|
1276 |
"score_name": "subsets_mean",
|
1277 |
"num_of_instances": 990
|
1278 |
},
|
1279 |
+
"score": 0.4537326535720019,
|
1280 |
"score_name": "subsets_mean",
|
1281 |
"num_of_instances": 12472
|
1282 |
}
|
results/bluebench/2025-06-19T15-57-45_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
|
1 |
+
{
|
2 |
+
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-19T19:57:39.981261Z",
|
4 |
+
"command_line_invocation": [
|
5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
+
"--tasks",
|
7 |
+
"benchmarks.bluebench",
|
8 |
+
"--model",
|
9 |
+
"cross_provider",
|
10 |
+
"--model_args",
|
11 |
+
"model_name=watsonx/ibm/granite-3-2-8b-instruct,max_tokens=256",
|
12 |
+
"--output_path",
|
13 |
+
"./results/bluebench",
|
14 |
+
"--log_samples",
|
15 |
+
"--trust_remote_code",
|
16 |
+
"--batch_size",
|
17 |
+
"8",
|
18 |
+
"--verbosity",
|
19 |
+
"ERROR"
|
20 |
+
],
|
21 |
+
"parsed_arguments": {
|
22 |
+
"tasks": [
|
23 |
+
"benchmarks.bluebench"
|
24 |
+
],
|
25 |
+
"split": "test",
|
26 |
+
"num_fewshots": null,
|
27 |
+
"limit": null,
|
28 |
+
"batch_size": 8,
|
29 |
+
"model": "watsonx/ibm/granite-3-2-8b-instruct",
|
30 |
+
"model_args": {
|
31 |
+
"max_tokens": 256
|
32 |
+
},
|
33 |
+
"gen_kwargs": null,
|
34 |
+
"chat_template_kwargs": null,
|
35 |
+
"output_path": "./results/bluebench",
|
36 |
+
"output_file_prefix": "evaluation_results",
|
37 |
+
"log_samples": true,
|
38 |
+
"verbosity": "ERROR",
|
39 |
+
"apply_chat_template": false,
|
40 |
+
"trust_remote_code": true,
|
41 |
+
"disable_hf_cache": false,
|
42 |
+
"cache_dir": null
|
43 |
+
},
|
44 |
+
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
|
46 |
+
"python_version": "3.10.18",
|
47 |
+
"system": "Linux",
|
48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
49 |
+
"installed_packages": {
|
50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
51 |
+
"triton": "3.3.1",
|
52 |
+
"nltk": "3.9.1",
|
53 |
+
"anyio": "4.9.0",
|
54 |
+
"absl-py": "2.3.0",
|
55 |
+
"tiktoken": "0.9.0",
|
56 |
+
"charset-normalizer": "3.4.2",
|
57 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
58 |
+
"sympy": "1.14.0",
|
59 |
+
"mecab-ko": "1.0.1",
|
60 |
+
"litellm": "1.72.6.post1",
|
61 |
+
"httpcore": "1.0.9",
|
62 |
+
"Jinja2": "3.1.6",
|
63 |
+
"jsonschema-specifications": "2025.4.1",
|
64 |
+
"pydantic_core": "2.33.2",
|
65 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
66 |
+
"yarl": "1.20.1",
|
67 |
+
"openai": "1.88.0",
|
68 |
+
"portalocker": "3.2.0",
|
69 |
+
"pandas": "2.3.0",
|
70 |
+
"multiprocess": "0.70.16",
|
71 |
+
"jsonschema": "4.24.0",
|
72 |
+
"unitxt": "1.24.0",
|
73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
75 |
+
"pydantic": "2.11.7",
|
76 |
+
"async-timeout": "5.0.1",
|
77 |
+
"annotated-types": "0.7.0",
|
78 |
+
"rouge_score": "0.1.2",
|
79 |
+
"contourpy": "1.3.2",
|
80 |
+
"aiosignal": "1.3.2",
|
81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
82 |
+
"pillow": "11.2.1",
|
83 |
+
"six": "1.17.0",
|
84 |
+
"diskcache": "5.6.3",
|
85 |
+
"tqdm": "4.67.1",
|
86 |
+
"pyarrow": "20.0.0",
|
87 |
+
"h11": "0.16.0",
|
88 |
+
"zipp": "3.19.2",
|
89 |
+
"tzdata": "2025.2",
|
90 |
+
"bert-score": "0.3.13",
|
91 |
+
"setuptools": "80.9.0",
|
92 |
+
"referencing": "0.36.2",
|
93 |
+
"sacrebleu": "2.5.1",
|
94 |
+
"filelock": "3.18.0",
|
95 |
+
"urllib3": "2.5.0",
|
96 |
+
"scipy": "1.15.3",
|
97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
98 |
+
"kiwisolver": "1.4.8",
|
99 |
+
"networkx": "3.4.2",
|
100 |
+
"typing-inspection": "0.4.1",
|
101 |
+
"lxml": "5.4.0",
|
102 |
+
"sniffio": "1.3.1",
|
103 |
+
"scikit-learn": "1.7.0",
|
104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
105 |
+
"pip": "25.1.1",
|
106 |
+
"fonttools": "4.58.4",
|
107 |
+
"transformers": "4.52.4",
|
108 |
+
"datasets": "3.6.0",
|
109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
110 |
+
"cycler": "0.12.1",
|
111 |
+
"evaluate": "0.4.3",
|
112 |
+
"distro": "1.9.0",
|
113 |
+
"idna": "3.10",
|
114 |
+
"MarkupSafe": "3.0.2",
|
115 |
+
"frozenlist": "1.7.0",
|
116 |
+
"pyparsing": "3.2.3",
|
117 |
+
"jiter": "0.10.0",
|
118 |
+
"importlib_metadata": "8.0.0",
|
119 |
+
"packaging": "24.2",
|
120 |
+
"psutil": "7.0.0",
|
121 |
+
"mecab-ko-dic": "1.0.0",
|
122 |
+
"joblib": "1.5.1",
|
123 |
+
"fsspec": "2025.3.0",
|
124 |
+
"dill": "0.3.8",
|
125 |
+
"tokenizers": "0.21.1",
|
126 |
+
"wheel": "0.45.1",
|
127 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
128 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
129 |
+
"hf-xet": "1.1.4",
|
130 |
+
"propcache": "0.3.2",
|
131 |
+
"numpy": "2.2.6",
|
132 |
+
"mpmath": "1.3.0",
|
133 |
+
"multidict": "6.5.0",
|
134 |
+
"conllu": "6.0.0",
|
135 |
+
"safetensors": "0.5.3",
|
136 |
+
"requests": "2.32.4",
|
137 |
+
"regex": "2024.11.6",
|
138 |
+
"aiohttp": "3.12.13",
|
139 |
+
"tabulate": "0.9.0",
|
140 |
+
"certifi": "2025.6.15",
|
141 |
+
"accelerate": "1.8.0",
|
142 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
143 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
144 |
+
"click": "8.2.1",
|
145 |
+
"typing_extensions": "4.12.2",
|
146 |
+
"attrs": "25.3.0",
|
147 |
+
"exceptiongroup": "1.3.0",
|
148 |
+
"tenacity": "9.1.2",
|
149 |
+
"pytz": "2025.2",
|
150 |
+
"aiohappyeyeballs": "2.6.1",
|
151 |
+
"python-dateutil": "2.9.0.post0",
|
152 |
+
"torch": "2.7.1",
|
153 |
+
"python-dotenv": "1.1.0",
|
154 |
+
"httpx": "0.28.1",
|
155 |
+
"matplotlib": "3.10.3",
|
156 |
+
"xxhash": "3.5.0",
|
157 |
+
"PyYAML": "6.0.2",
|
158 |
+
"huggingface-hub": "0.33.0",
|
159 |
+
"colorama": "0.4.6",
|
160 |
+
"rpds-py": "0.25.1",
|
161 |
+
"threadpoolctl": "3.6.0",
|
162 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
163 |
+
"jaraco.collections": "5.1.0",
|
164 |
+
"tomli": "2.0.1",
|
165 |
+
"backports.tarfile": "1.2.0",
|
166 |
+
"jaraco.context": "5.3.0",
|
167 |
+
"typeguard": "4.3.0",
|
168 |
+
"autocommand": "2.2.2",
|
169 |
+
"jaraco.text": "3.12.1",
|
170 |
+
"more-itertools": "10.3.0",
|
171 |
+
"platformdirs": "4.2.2",
|
172 |
+
"inflect": "7.3.1",
|
173 |
+
"jaraco.functools": "4.0.1"
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"results": {
|
177 |
+
"bias": {
|
178 |
+
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.6444444444444445,
|
180 |
+
"accuracy_ci_low": 0.5444444444444444,
|
181 |
+
"accuracy_ci_high": 0.7444444444444445,
|
182 |
+
"score_name": "accuracy",
|
183 |
+
"score": 0.6444444444444445,
|
184 |
+
"score_ci_high": 0.7444444444444445,
|
185 |
+
"score_ci_low": 0.5444444444444444,
|
186 |
+
"num_of_instances": 90
|
187 |
+
},
|
188 |
+
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.7222222222222222,
|
190 |
+
"accuracy_ci_low": 0.6111111111111112,
|
191 |
+
"accuracy_ci_high": 0.8,
|
192 |
+
"score_name": "accuracy",
|
193 |
+
"score": 0.7222222222222222,
|
194 |
+
"score_ci_high": 0.8,
|
195 |
+
"score_ci_low": 0.6111111111111112,
|
196 |
+
"num_of_instances": 90
|
197 |
+
},
|
198 |
+
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.9111111111111111,
|
200 |
+
"accuracy_ci_low": 0.8333333333333334,
|
201 |
+
"accuracy_ci_high": 0.9555555555555556,
|
202 |
+
"score_name": "accuracy",
|
203 |
+
"score": 0.9111111111111111,
|
204 |
+
"score_ci_high": 0.9555555555555556,
|
205 |
+
"score_ci_low": 0.8333333333333334,
|
206 |
+
"num_of_instances": 90
|
207 |
+
},
|
208 |
+
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.7111111111111111,
|
210 |
+
"accuracy_ci_low": 0.6111111111111112,
|
211 |
+
"accuracy_ci_high": 0.8,
|
212 |
+
"score_name": "accuracy",
|
213 |
+
"score": 0.7111111111111111,
|
214 |
+
"score_ci_high": 0.8,
|
215 |
+
"score_ci_low": 0.6111111111111112,
|
216 |
+
"num_of_instances": 90
|
217 |
+
},
|
218 |
+
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.8333333333333334,
|
220 |
+
"accuracy_ci_low": 0.7444444444444445,
|
221 |
+
"accuracy_ci_high": 0.9,
|
222 |
+
"score_name": "accuracy",
|
223 |
+
"score": 0.8333333333333334,
|
224 |
+
"score_ci_high": 0.9,
|
225 |
+
"score_ci_low": 0.7444444444444445,
|
226 |
+
"num_of_instances": 90
|
227 |
+
},
|
228 |
+
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.9777777777777777,
|
230 |
+
"accuracy_ci_low": 0.9333333333333333,
|
231 |
+
"accuracy_ci_high": 1.0,
|
232 |
+
"score_name": "accuracy",
|
233 |
+
"score": 0.9777777777777777,
|
234 |
+
"score_ci_high": 1.0,
|
235 |
+
"score_ci_low": 0.9333333333333333,
|
236 |
+
"num_of_instances": 90
|
237 |
+
},
|
238 |
+
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.9333333333333333,
|
240 |
+
"accuracy_ci_low": 0.8666666666666667,
|
241 |
+
"accuracy_ci_high": 0.9777777777777777,
|
242 |
+
"score_name": "accuracy",
|
243 |
+
"score": 0.9333333333333333,
|
244 |
+
"score_ci_high": 0.9777777777777777,
|
245 |
+
"score_ci_low": 0.8666666666666667,
|
246 |
+
"num_of_instances": 90
|
247 |
+
},
|
248 |
+
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.9444444444444444,
|
250 |
+
"accuracy_ci_low": 0.8777777777777778,
|
251 |
+
"accuracy_ci_high": 0.9777777777777777,
|
252 |
+
"score_name": "accuracy",
|
253 |
+
"score": 0.9444444444444444,
|
254 |
+
"score_ci_high": 0.9777777777777777,
|
255 |
+
"score_ci_low": 0.8777777777777778,
|
256 |
+
"num_of_instances": 90
|
257 |
+
},
|
258 |
+
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.8111111111111111,
|
260 |
+
"accuracy_ci_low": 0.7116197011994875,
|
261 |
+
"accuracy_ci_high": 0.8888888888888888,
|
262 |
+
"score_name": "accuracy",
|
263 |
+
"score": 0.8111111111111111,
|
264 |
+
"score_ci_high": 0.8888888888888888,
|
265 |
+
"score_ci_low": 0.7116197011994875,
|
266 |
+
"num_of_instances": 90
|
267 |
+
},
|
268 |
+
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.6888888888888889,
|
270 |
+
"accuracy_ci_low": 0.5777777777777777,
|
271 |
+
"accuracy_ci_high": 0.7777777777777778,
|
272 |
+
"score_name": "accuracy",
|
273 |
+
"score": 0.6888888888888889,
|
274 |
+
"score_ci_high": 0.7777777777777778,
|
275 |
+
"score_ci_low": 0.5777777777777777,
|
276 |
+
"num_of_instances": 90
|
277 |
+
},
|
278 |
+
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.8,
|
280 |
+
"accuracy_ci_low": 0.7111111111111111,
|
281 |
+
"accuracy_ci_high": 0.8666666666666667,
|
282 |
+
"score_name": "accuracy",
|
283 |
+
"score": 0.8,
|
284 |
+
"score_ci_high": 0.8666666666666667,
|
285 |
+
"score_ci_low": 0.7111111111111111,
|
286 |
+
"num_of_instances": 90
|
287 |
+
},
|
288 |
+
"score": 0.8161616161616162,
|
289 |
+
"score_name": "subsets_mean",
|
290 |
+
"num_of_instances": 990
|
291 |
+
},
|
292 |
+
"chatbot_abilities": {
|
293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
+
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.5,
|
296 |
+
"score": 0.5,
|
297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
+
},
|
299 |
+
"score": 0.5,
|
300 |
+
"score_name": "subsets_mean",
|
301 |
+
"num_of_instances": 500
|
302 |
+
},
|
303 |
+
"entity_extraction": {
|
304 |
+
"universal_ner_en_ewt": {
|
305 |
+
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.592375366568915,
|
307 |
+
"f1_Location": 0.3668122270742358,
|
308 |
+
"f1_Organization": 0.45367412140575075,
|
309 |
+
"f1_macro": 0.4709539050163005,
|
310 |
+
"recall_macro": 0.3969630056026483,
|
311 |
+
"precision_macro": 0.5946970285442043,
|
312 |
+
"in_classes_support": 0.7649572649572649,
|
313 |
+
"f1_micro": 0.4310171198388721,
|
314 |
+
"recall_micro": 0.4076190476190476,
|
315 |
+
"precision_micro": 0.45726495726495725,
|
316 |
+
"score": 0.4310171198388721,
|
317 |
+
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.36016345918404075,
|
319 |
+
"score_ci_high": 0.48021577272630167,
|
320 |
+
"f1_micro_ci_low": 0.36016345918404075,
|
321 |
+
"f1_micro_ci_high": 0.48021577272630167
|
322 |
+
},
|
323 |
+
"score": 0.4310171198388721,
|
324 |
+
"score_name": "subsets_mean",
|
325 |
+
"num_of_instances": 1000
|
326 |
+
},
|
327 |
+
"knowledge": {
|
328 |
+
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.5352112676056338,
|
330 |
+
"accuracy_ci_low": 0.43661971830985913,
|
331 |
+
"accuracy_ci_high": 0.647887323943662,
|
332 |
+
"score_name": "accuracy",
|
333 |
+
"score": 0.5352112676056338,
|
334 |
+
"score_ci_high": 0.647887323943662,
|
335 |
+
"score_ci_low": 0.43661971830985913,
|
336 |
+
"num_of_instances": 71
|
337 |
+
},
|
338 |
+
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.22535211267605634,
|
340 |
+
"accuracy_ci_low": 0.14084507042253522,
|
341 |
+
"accuracy_ci_high": 0.323943661971831,
|
342 |
+
"score_name": "accuracy",
|
343 |
+
"score": 0.22535211267605634,
|
344 |
+
"score_ci_high": 0.323943661971831,
|
345 |
+
"score_ci_low": 0.14084507042253522,
|
346 |
+
"num_of_instances": 71
|
347 |
+
},
|
348 |
+
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.18309859154929578,
|
350 |
+
"accuracy_ci_low": 0.10639771966263252,
|
351 |
+
"accuracy_ci_high": 0.29577464788732394,
|
352 |
+
"score_name": "accuracy",
|
353 |
+
"score": 0.18309859154929578,
|
354 |
+
"score_ci_high": 0.29577464788732394,
|
355 |
+
"score_ci_low": 0.10639771966263252,
|
356 |
+
"num_of_instances": 71
|
357 |
+
},
|
358 |
+
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.323943661971831,
|
360 |
+
"accuracy_ci_low": 0.2112676056338028,
|
361 |
+
"accuracy_ci_high": 0.43661971830985913,
|
362 |
+
"score_name": "accuracy",
|
363 |
+
"score": 0.323943661971831,
|
364 |
+
"score_ci_high": 0.43661971830985913,
|
365 |
+
"score_ci_low": 0.2112676056338028,
|
366 |
+
"num_of_instances": 71
|
367 |
+
},
|
368 |
+
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.4507042253521127,
|
370 |
+
"accuracy_ci_low": 0.323943661971831,
|
371 |
+
"accuracy_ci_high": 0.5664724235461314,
|
372 |
+
"score_name": "accuracy",
|
373 |
+
"score": 0.4507042253521127,
|
374 |
+
"score_ci_high": 0.5664724235461314,
|
375 |
+
"score_ci_low": 0.323943661971831,
|
376 |
+
"num_of_instances": 71
|
377 |
+
},
|
378 |
+
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.36619718309859156,
|
380 |
+
"accuracy_ci_low": 0.2535211267605634,
|
381 |
+
"accuracy_ci_high": 0.4788732394366197,
|
382 |
+
"score_name": "accuracy",
|
383 |
+
"score": 0.36619718309859156,
|
384 |
+
"score_ci_high": 0.4788732394366197,
|
385 |
+
"score_ci_low": 0.2535211267605634,
|
386 |
+
"num_of_instances": 71
|
387 |
+
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.3380281690140845,
|
390 |
+
"accuracy_ci_low": 0.22535211267605634,
|
391 |
+
"accuracy_ci_high": 0.4647887323943662,
|
392 |
+
"score_name": "accuracy",
|
393 |
+
"score": 0.3380281690140845,
|
394 |
+
"score_ci_high": 0.4647887323943662,
|
395 |
+
"score_ci_low": 0.22535211267605634,
|
396 |
+
"num_of_instances": 71
|
397 |
+
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.4084507042253521,
|
400 |
+
"accuracy_ci_low": 0.29577464788732394,
|
401 |
+
"accuracy_ci_high": 0.5211267605633803,
|
402 |
+
"score_name": "accuracy",
|
403 |
+
"score": 0.4084507042253521,
|
404 |
+
"score_ci_high": 0.5211267605633803,
|
405 |
+
"score_ci_low": 0.29577464788732394,
|
406 |
+
"num_of_instances": 71
|
407 |
+
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.3380281690140845,
|
410 |
+
"accuracy_ci_low": 0.23943661971830985,
|
411 |
+
"accuracy_ci_high": 0.4647887323943662,
|
412 |
+
"score_name": "accuracy",
|
413 |
+
"score": 0.3380281690140845,
|
414 |
+
"score_ci_high": 0.4647887323943662,
|
415 |
+
"score_ci_low": 0.23943661971830985,
|
416 |
+
"num_of_instances": 71
|
417 |
+
},
|
418 |
+
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.11267605633802817,
|
420 |
+
"accuracy_ci_low": 0.056338028169014086,
|
421 |
+
"accuracy_ci_high": 0.19757759490217996,
|
422 |
+
"score_name": "accuracy",
|
423 |
+
"score": 0.11267605633802817,
|
424 |
+
"score_ci_high": 0.19757759490217996,
|
425 |
+
"score_ci_low": 0.056338028169014086,
|
426 |
+
"num_of_instances": 71
|
427 |
+
},
|
428 |
+
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.22535211267605634,
|
430 |
+
"accuracy_ci_low": 0.14084507042253522,
|
431 |
+
"accuracy_ci_high": 0.323943661971831,
|
432 |
+
"score_name": "accuracy",
|
433 |
+
"score": 0.22535211267605634,
|
434 |
+
"score_ci_high": 0.323943661971831,
|
435 |
+
"score_ci_low": 0.14084507042253522,
|
436 |
+
"num_of_instances": 71
|
437 |
+
},
|
438 |
+
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.4225352112676056,
|
440 |
+
"accuracy_ci_low": 0.30985915492957744,
|
441 |
+
"accuracy_ci_high": 0.5352112676056338,
|
442 |
+
"score_name": "accuracy",
|
443 |
+
"score": 0.4225352112676056,
|
444 |
+
"score_ci_high": 0.5352112676056338,
|
445 |
+
"score_ci_low": 0.30985915492957744,
|
446 |
+
"num_of_instances": 71
|
447 |
+
},
|
448 |
+
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.22535211267605634,
|
450 |
+
"accuracy_ci_low": 0.14084507042253522,
|
451 |
+
"accuracy_ci_high": 0.3380281690140845,
|
452 |
+
"score_name": "accuracy",
|
453 |
+
"score": 0.22535211267605634,
|
454 |
+
"score_ci_high": 0.3380281690140845,
|
455 |
+
"score_ci_low": 0.14084507042253522,
|
456 |
+
"num_of_instances": 71
|
457 |
+
},
|
458 |
+
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.5211267605633803,
|
460 |
+
"accuracy_ci_low": 0.4084507042253521,
|
461 |
+
"accuracy_ci_high": 0.6338028169014085,
|
462 |
+
"score_name": "accuracy",
|
463 |
+
"score": 0.5211267605633803,
|
464 |
+
"score_ci_high": 0.6338028169014085,
|
465 |
+
"score_ci_low": 0.4084507042253521,
|
466 |
+
"num_of_instances": 71
|
467 |
+
},
|
468 |
+
"score": 0.33400402414486924,
|
469 |
+
"score_name": "subsets_mean",
|
470 |
+
"num_of_instances": 994
|
471 |
+
},
|
472 |
+
"legal": {
|
473 |
+
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.2827795486010496,
|
475 |
+
"f1_suggestive": 0.08333333333333333,
|
476 |
+
"f1_descriptive": 0.4444444444444444,
|
477 |
+
"f1_generic": 0.11764705882352941,
|
478 |
+
"f1_fanciful": 0.4827586206896552,
|
479 |
+
"f1_arbitrary": 0.2857142857142857,
|
480 |
+
"f1_macro_ci_low": 0.20381678012471904,
|
481 |
+
"f1_macro_ci_high": 0.38601597944875415,
|
482 |
+
"score_name": "f1_micro",
|
483 |
+
"score": 0.3253012048192771,
|
484 |
+
"score_ci_high": 0.42168674698795183,
|
485 |
+
"score_ci_low": 0.21686746987951808,
|
486 |
+
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.3176470588235294,
|
488 |
+
"accuracy_ci_low": 0.21176470588235294,
|
489 |
+
"accuracy_ci_high": 0.4117647058823529,
|
490 |
+
"f1_micro": 0.3253012048192771,
|
491 |
+
"f1_micro_ci_low": 0.21686746987951808,
|
492 |
+
"f1_micro_ci_high": 0.42168674698795183
|
493 |
+
},
|
494 |
+
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.5842293906810035,
|
496 |
+
"f1_no": 0.8129032258064516,
|
497 |
+
"f1_yes": 0.35555555555555557,
|
498 |
+
"f1_macro_ci_low": 0.5123650296064088,
|
499 |
+
"f1_macro_ci_high": 0.6612083568605307,
|
500 |
+
"score_name": "f1_micro",
|
501 |
+
"score": 0.71,
|
502 |
+
"score_ci_high": 0.765,
|
503 |
+
"score_ci_low": 0.64,
|
504 |
+
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.71,
|
506 |
+
"accuracy_ci_low": 0.64,
|
507 |
+
"accuracy_ci_high": 0.765,
|
508 |
+
"f1_micro": 0.71,
|
509 |
+
"f1_micro_ci_low": 0.64,
|
510 |
+
"f1_micro_ci_high": 0.765
|
511 |
+
},
|
512 |
+
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.23684055980437102,
|
514 |
+
"f1_conclusion": 0.12,
|
515 |
+
"f1_issue": 0.2682926829268293,
|
516 |
+
"f1_decree": 0.17647058823529413,
|
517 |
+
"f1_rule": 0.4155844155844156,
|
518 |
+
"f1_analysis": 0.2608695652173913,
|
519 |
+
"f1_facts": 0.16666666666666666,
|
520 |
+
"f1_procedural history": 0.25,
|
521 |
+
"f1_macro_ci_low": 0.18399933651413464,
|
522 |
+
"f1_macro_ci_high": 0.3012128675188143,
|
523 |
+
"score_name": "f1_micro",
|
524 |
+
"score": 0.2570694087403599,
|
525 |
+
"score_ci_high": 0.31910866448170155,
|
526 |
+
"score_ci_low": 0.19563743957580057,
|
527 |
+
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.25,
|
529 |
+
"accuracy_ci_low": 0.19,
|
530 |
+
"accuracy_ci_high": 0.31,
|
531 |
+
"f1_micro": 0.2570694087403599,
|
532 |
+
"f1_micro_ci_low": 0.19563743957580057,
|
533 |
+
"f1_micro_ci_high": 0.31910866448170155
|
534 |
+
},
|
535 |
+
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.45179063360881544,
|
537 |
+
"f1_yes": 0.5702479338842975,
|
538 |
+
"f1_no": 0.3333333333333333,
|
539 |
+
"f1_macro_ci_low": 0.3881370275424478,
|
540 |
+
"f1_macro_ci_high": 0.5208583506164292,
|
541 |
+
"score_name": "f1_micro",
|
542 |
+
"score": 0.47738693467336685,
|
543 |
+
"score_ci_high": 0.5454545454545454,
|
544 |
+
"score_ci_low": 0.41102756892230574,
|
545 |
+
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.475,
|
547 |
+
"accuracy_ci_low": 0.4062357598667403,
|
548 |
+
"accuracy_ci_high": 0.54,
|
549 |
+
"f1_micro": 0.47738693467336685,
|
550 |
+
"f1_micro_ci_low": 0.41102756892230574,
|
551 |
+
"f1_micro_ci_high": 0.5454545454545454
|
552 |
+
},
|
553 |
+
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.7797888386123679,
|
555 |
+
"f1_yes": 0.7647058823529411,
|
556 |
+
"f1_no": 0.7948717948717948,
|
557 |
+
"f1_macro_ci_low": 0.686770027516329,
|
558 |
+
"f1_macro_ci_high": 0.847201812396528,
|
559 |
+
"score_name": "f1_micro",
|
560 |
+
"score": 0.7808219178082192,
|
561 |
+
"score_ci_high": 0.847682119205298,
|
562 |
+
"score_ci_low": 0.6846573729523644,
|
563 |
+
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.6705882352941176,
|
565 |
+
"accuracy_ci_low": 0.5647058823529412,
|
566 |
+
"accuracy_ci_high": 0.7529411764705882,
|
567 |
+
"f1_micro": 0.7808219178082192,
|
568 |
+
"f1_micro_ci_low": 0.6846573729523644,
|
569 |
+
"f1_micro_ci_high": 0.847682119205298
|
570 |
+
},
|
571 |
+
"score": 0.5101158932082446,
|
572 |
+
"score_name": "subsets_mean",
|
573 |
+
"num_of_instances": 770
|
574 |
+
},
|
575 |
+
"news_classification": {
|
576 |
+
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.4855115011669257,
|
578 |
+
"f1_cars": 0.8,
|
579 |
+
"f1_windows x": 0.05555555555555555,
|
580 |
+
"f1_atheism": 0.17777777777777778,
|
581 |
+
"f1_cryptography": 0.4444444444444444,
|
582 |
+
"f1_religion": 0.23404255319148937,
|
583 |
+
"f1_medicine": 0.8,
|
584 |
+
"f1_christianity": 0.36619718309859156,
|
585 |
+
"f1_computer graphics": 0.3652173913043478,
|
586 |
+
"f1_microsoft windows": 0.19047619047619047,
|
587 |
+
"f1_middle east": 0.4675324675324675,
|
588 |
+
"f1_motorcycles": 0.693069306930693,
|
589 |
+
"f1_politics": 0.313953488372093,
|
590 |
+
"f1_pc hardware": 0.4292682926829268,
|
591 |
+
"f1_mac hardware": 0.2972972972972973,
|
592 |
+
"f1_for sale": 0.7058823529411765,
|
593 |
+
"f1_guns": 0.34375,
|
594 |
+
"f1_space": 0.6888888888888889,
|
595 |
+
"f1_baseball": 0.8909090909090909,
|
596 |
+
"f1_hockey": 0.8709677419354839,
|
597 |
+
"f1_electronics": 0.575,
|
598 |
+
"f1_macro_ci_low": 0.4606519053067645,
|
599 |
+
"f1_macro_ci_high": 0.5114904866418184,
|
600 |
+
"score_name": "f1_micro",
|
601 |
+
"score": 0.5034666666666666,
|
602 |
+
"score_ci_high": 0.5288163691152058,
|
603 |
+
"score_ci_low": 0.4713054725252697,
|
604 |
+
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.472,
|
606 |
+
"accuracy_ci_low": 0.44038730175462776,
|
607 |
+
"accuracy_ci_high": 0.497,
|
608 |
+
"f1_micro": 0.5034666666666666,
|
609 |
+
"f1_micro_ci_low": 0.4713054725252697,
|
610 |
+
"f1_micro_ci_high": 0.5288163691152058
|
611 |
+
},
|
612 |
+
"score": 0.5034666666666666,
|
613 |
+
"score_name": "subsets_mean",
|
614 |
+
"num_of_instances": 1000
|
615 |
+
},
|
616 |
+
"product_help": {
|
617 |
+
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.607364388794758,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9122807017543859,
|
620 |
+
"f1_credit card or prepaid card": 0.6666666666666666,
|
621 |
+
"f1_debt collection": 0.6075949367088608,
|
622 |
+
"f1_checking or savings account": 0.673469387755102,
|
623 |
+
"f1_money transfer or virtual currency or money service": 0.5777777777777777,
|
624 |
+
"f1_vehicle loan or lease": 0.37037037037037035,
|
625 |
+
"f1_mortgage": 0.6666666666666666,
|
626 |
+
"f1_payday loan or title loan or personal loan": 0.2222222222222222,
|
627 |
+
"f1_student loan": 0.7692307692307693,
|
628 |
+
"f1_macro_ci_low": 0.5603416421881502,
|
629 |
+
"f1_macro_ci_high": 0.6682100489708924,
|
630 |
+
"score_name": "f1_micro",
|
631 |
+
"score": 0.8273716951788491,
|
632 |
+
"score_ci_high": 0.8505803933787175,
|
633 |
+
"score_ci_low": 0.8031586690475525,
|
634 |
+
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.798,
|
636 |
+
"accuracy_ci_low": 0.7700141366334644,
|
637 |
+
"accuracy_ci_high": 0.8228416338853977,
|
638 |
+
"f1_micro": 0.8273716951788491,
|
639 |
+
"f1_micro_ci_low": 0.8031586690475525,
|
640 |
+
"f1_micro_ci_high": 0.8505803933787175
|
641 |
+
},
|
642 |
+
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.6739108272125272,
|
644 |
+
"f1_mortgages and loans": 0.7719298245614035,
|
645 |
+
"f1_credit card": 0.7403314917127072,
|
646 |
+
"f1_retail banking": 0.5797101449275363,
|
647 |
+
"f1_debt collection": 0.5686274509803921,
|
648 |
+
"f1_credit reporting": 0.7089552238805971,
|
649 |
+
"f1_macro_ci_low": 0.6324404602189574,
|
650 |
+
"f1_macro_ci_high": 0.7147768248953918,
|
651 |
+
"score_name": "f1_micro",
|
652 |
+
"score": 0.6777546777546778,
|
653 |
+
"score_ci_high": 0.716590388897516,
|
654 |
+
"score_ci_low": 0.6352085235971857,
|
655 |
+
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.652,
|
657 |
+
"accuracy_ci_low": 0.608,
|
658 |
+
"accuracy_ci_high": 0.6909013646716825,
|
659 |
+
"f1_micro": 0.6777546777546778,
|
660 |
+
"f1_micro_ci_low": 0.6352085235971857,
|
661 |
+
"f1_micro_ci_high": 0.716590388897516
|
662 |
+
},
|
663 |
+
"score": 0.7525631864667635,
|
664 |
+
"score_name": "subsets_mean",
|
665 |
+
"num_of_instances": 1500
|
666 |
+
},
|
667 |
+
"qa_finance": {
|
668 |
+
"fin_qa": {
|
669 |
+
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.136,
|
671 |
+
"score": 0.136,
|
672 |
+
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.112,
|
674 |
+
"program_accuracy_ci_low": 0.114,
|
675 |
+
"program_accuracy_ci_high": 0.15532916889351497,
|
676 |
+
"score_ci_low": 0.114,
|
677 |
+
"score_ci_high": 0.15532916889351497,
|
678 |
+
"execution_accuracy_ci_low": 0.093,
|
679 |
+
"execution_accuracy_ci_high": 0.132
|
680 |
+
},
|
681 |
+
"score": 0.136,
|
682 |
+
"score_name": "subsets_mean",
|
683 |
+
"num_of_instances": 1000
|
684 |
+
},
|
685 |
+
"rag_general": {
|
686 |
+
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.34340385694506587,
|
688 |
+
"recall": 0.5730464338072634,
|
689 |
+
"f1": 0.3745794974329677,
|
690 |
+
"precision_ci_low": 0.3234187185304744,
|
691 |
+
"precision_ci_high": 0.3650589335945776,
|
692 |
+
"recall_ci_low": 0.5557745525912291,
|
693 |
+
"recall_ci_high": 0.5891018666307999,
|
694 |
+
"f1_ci_low": 0.3573010951324523,
|
695 |
+
"f1_ci_high": 0.3923997026360963,
|
696 |
+
"score_name": "f1",
|
697 |
+
"score": 0.3745794974329677,
|
698 |
+
"score_ci_high": 0.3923997026360963,
|
699 |
+
"score_ci_low": 0.3573010951324523,
|
700 |
+
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6226274134715398,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6971947036186854,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5803041330973308,
|
704 |
+
"faithfullness_f1_token_overlap": 0.333895617947972,
|
705 |
+
"faithfullness_recall_token_overlap": 0.2631184151415058,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5911531265193607,
|
707 |
+
"correctness_f1_token_overlap": 0.3745794974329677,
|
708 |
+
"correctness_recall_token_overlap": 0.5730464338072634,
|
709 |
+
"correctness_precision_token_overlap": 0.34340385694506587
|
710 |
+
},
|
711 |
+
"score": 0.3745794974329677,
|
712 |
+
"score_name": "subsets_mean",
|
713 |
+
"num_of_instances": 600
|
714 |
+
},
|
715 |
+
"reasoning": {
|
716 |
+
"hellaswag": {
|
717 |
+
"accuracy": 0.484,
|
718 |
+
"accuracy_ci_low": 0.454,
|
719 |
+
"accuracy_ci_high": 0.514,
|
720 |
+
"score_name": "accuracy",
|
721 |
+
"score": 0.484,
|
722 |
+
"score_ci_high": 0.514,
|
723 |
+
"score_ci_low": 0.454,
|
724 |
+
"num_of_instances": 1000
|
725 |
+
},
|
726 |
+
"openbook_qa": {
|
727 |
+
"accuracy": 0.806,
|
728 |
+
"accuracy_ci_low": 0.772,
|
729 |
+
"accuracy_ci_high": 0.84,
|
730 |
+
"score_name": "accuracy",
|
731 |
+
"score": 0.806,
|
732 |
+
"score_ci_high": 0.84,
|
733 |
+
"score_ci_low": 0.772,
|
734 |
+
"num_of_instances": 500
|
735 |
+
},
|
736 |
+
"score": 0.645,
|
737 |
+
"score_name": "subsets_mean",
|
738 |
+
"num_of_instances": 1500
|
739 |
+
},
|
740 |
+
"safety": {
|
741 |
+
"attaq_500": {
|
742 |
+
"severity_critical": 2.0,
|
743 |
+
"severity_high": 1.0,
|
744 |
+
"severity_medium": 1.0,
|
745 |
+
"severity_low": 96.0,
|
746 |
+
"category_harmful_info": 0.8320170236041858,
|
747 |
+
"category_deception": 0.80982066540405,
|
748 |
+
"category_explicit_content": 0.8411968264615898,
|
749 |
+
"category_violence": 0.8222103803300314,
|
750 |
+
"category_discrimination": 0.851427495597847,
|
751 |
+
"category_substance_abuse": 0.923489530881246,
|
752 |
+
"category_pii": 0.8410816744521812,
|
753 |
+
"safety": 0.8333571752993802,
|
754 |
+
"safety_ci_low": 0.8004974256643814,
|
755 |
+
"safety_ci_high": 0.8560529591545912,
|
756 |
+
"score_name": "safety",
|
757 |
+
"score": 0.8333571752993802,
|
758 |
+
"score_ci_high": 0.8560529591545912,
|
759 |
+
"score_ci_low": 0.8004974256643814,
|
760 |
+
"num_of_instances": 100
|
761 |
+
},
|
762 |
+
"score": 0.8333571752993802,
|
763 |
+
"score_name": "subsets_mean",
|
764 |
+
"num_of_instances": 100
|
765 |
+
},
|
766 |
+
"summarization": {
|
767 |
+
"billsum_document_filtered_to_6000_chars": {
|
768 |
+
"num_of_instances": 528,
|
769 |
+
"rouge2": 0.20644440587446686,
|
770 |
+
"rouge1": 0.4218186965603418,
|
771 |
+
"rougeL": 0.29171488107135435,
|
772 |
+
"score": 0.29171488107135435,
|
773 |
+
"score_name": "rougeL",
|
774 |
+
"rougeLsum": 0.3573848941644133,
|
775 |
+
"rouge2_ci_low": 0.19970855600689244,
|
776 |
+
"rouge2_ci_high": 0.2141764351715554,
|
777 |
+
"rouge1_ci_low": 0.41248489848485753,
|
778 |
+
"rouge1_ci_high": 0.4306004852492735,
|
779 |
+
"rougeL_ci_low": 0.284372337658834,
|
780 |
+
"rougeL_ci_high": 0.29907980889509783,
|
781 |
+
"score_ci_low": 0.284372337658834,
|
782 |
+
"score_ci_high": 0.29907980889509783,
|
783 |
+
"rougeLsum_ci_low": 0.3492659326802685,
|
784 |
+
"rougeLsum_ci_high": 0.36590481273391734
|
785 |
+
},
|
786 |
+
"tldr_document_filtered_to_6000_chars": {
|
787 |
+
"num_of_instances": 1000,
|
788 |
+
"rouge2": 0.0155586126994404,
|
789 |
+
"rouge1": 0.11530035033575219,
|
790 |
+
"rougeL": 0.0819857457679891,
|
791 |
+
"score": 0.0819857457679891,
|
792 |
+
"score_name": "rougeL",
|
793 |
+
"rougeLsum": 0.09494455096055868,
|
794 |
+
"rouge2_ci_low": 0.013676000237548778,
|
795 |
+
"rouge2_ci_high": 0.01748477110760906,
|
796 |
+
"rouge1_ci_low": 0.10949633575516456,
|
797 |
+
"rouge1_ci_high": 0.12012750847071728,
|
798 |
+
"rougeL_ci_low": 0.07832639561199897,
|
799 |
+
"rougeL_ci_high": 0.08543803609753609,
|
800 |
+
"score_ci_low": 0.07832639561199897,
|
801 |
+
"score_ci_high": 0.08543803609753609,
|
802 |
+
"rougeLsum_ci_low": 0.09027374868536467,
|
803 |
+
"rougeLsum_ci_high": 0.0990496989831643
|
804 |
+
},
|
805 |
+
"score": 0.18685031341967173,
|
806 |
+
"score_name": "subsets_mean",
|
807 |
+
"num_of_instances": 1528
|
808 |
+
},
|
809 |
+
"translation": {
|
810 |
+
"mt_flores_101_ara_eng": {
|
811 |
+
"num_of_instances": 66,
|
812 |
+
"counts": [
|
813 |
+
1129,
|
814 |
+
620,
|
815 |
+
369,
|
816 |
+
232
|
817 |
+
],
|
818 |
+
"totals": [
|
819 |
+
1854,
|
820 |
+
1788,
|
821 |
+
1722,
|
822 |
+
1656
|
823 |
+
],
|
824 |
+
"precisions": [
|
825 |
+
0.6089536138079827,
|
826 |
+
0.34675615212527966,
|
827 |
+
0.21428571428571427,
|
828 |
+
0.14009661835748793
|
829 |
+
],
|
830 |
+
"bp": 1.0,
|
831 |
+
"sys_len": 1854,
|
832 |
+
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.28216771071430846,
|
834 |
+
"score": 0.28216771071430846,
|
835 |
+
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.2361776507614392,
|
837 |
+
"score_ci_high": 0.31854760158610573,
|
838 |
+
"sacrebleu_ci_low": 0.2361776507614392,
|
839 |
+
"sacrebleu_ci_high": 0.31854760158610573
|
840 |
+
},
|
841 |
+
"mt_flores_101_deu_eng": {
|
842 |
+
"num_of_instances": 66,
|
843 |
+
"counts": [
|
844 |
+
1222,
|
845 |
+
719,
|
846 |
+
458,
|
847 |
+
298
|
848 |
+
],
|
849 |
+
"totals": [
|
850 |
+
1795,
|
851 |
+
1729,
|
852 |
+
1663,
|
853 |
+
1597
|
854 |
+
],
|
855 |
+
"precisions": [
|
856 |
+
0.6807799442896936,
|
857 |
+
0.4158473105841527,
|
858 |
+
0.27540589296452195,
|
859 |
+
0.18659987476518475
|
860 |
+
],
|
861 |
+
"bp": 1.0,
|
862 |
+
"sys_len": 1795,
|
863 |
+
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.34730121824258303,
|
865 |
+
"score": 0.34730121824258303,
|
866 |
+
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.29893861972188673,
|
868 |
+
"score_ci_high": 0.3929549500270372,
|
869 |
+
"sacrebleu_ci_low": 0.29893861972188673,
|
870 |
+
"sacrebleu_ci_high": 0.3929549500270372
|
871 |
+
},
|
872 |
+
"mt_flores_101_eng_ara": {
|
873 |
+
"num_of_instances": 66,
|
874 |
+
"counts": [
|
875 |
+
640,
|
876 |
+
243,
|
877 |
+
115,
|
878 |
+
51
|
879 |
+
],
|
880 |
+
"totals": [
|
881 |
+
2303,
|
882 |
+
2237,
|
883 |
+
2171,
|
884 |
+
2105
|
885 |
+
],
|
886 |
+
"precisions": [
|
887 |
+
0.2778983933999131,
|
888 |
+
0.1086276262852034,
|
889 |
+
0.05297098111469369,
|
890 |
+
0.024228028503562947
|
891 |
+
],
|
892 |
+
"bp": 1.0,
|
893 |
+
"sys_len": 2303,
|
894 |
+
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.07889429589395064,
|
896 |
+
"score": 0.07889429589395064,
|
897 |
+
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.05783007476819273,
|
899 |
+
"score_ci_high": 0.11334032544493618,
|
900 |
+
"sacrebleu_ci_low": 0.05783007476819273,
|
901 |
+
"sacrebleu_ci_high": 0.11334032544493618
|
902 |
+
},
|
903 |
+
"mt_flores_101_eng_deu": {
|
904 |
+
"num_of_instances": 66,
|
905 |
+
"counts": [
|
906 |
+
1100,
|
907 |
+
591,
|
908 |
+
353,
|
909 |
+
222
|
910 |
+
],
|
911 |
+
"totals": [
|
912 |
+
1847,
|
913 |
+
1781,
|
914 |
+
1715,
|
915 |
+
1649
|
916 |
+
],
|
917 |
+
"precisions": [
|
918 |
+
0.5955603681645912,
|
919 |
+
0.3318360471645143,
|
920 |
+
0.20583090379008745,
|
921 |
+
0.13462704669496664
|
922 |
+
],
|
923 |
+
"bp": 1.0,
|
924 |
+
"sys_len": 1847,
|
925 |
+
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.27203392188147313,
|
927 |
+
"score": 0.27203392188147313,
|
928 |
+
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.23527738150795005,
|
930 |
+
"score_ci_high": 0.3139246690058723,
|
931 |
+
"sacrebleu_ci_low": 0.23527738150795005,
|
932 |
+
"sacrebleu_ci_high": 0.3139246690058723
|
933 |
+
},
|
934 |
+
"mt_flores_101_eng_fra": {
|
935 |
+
"num_of_instances": 66,
|
936 |
+
"counts": [
|
937 |
+
1379,
|
938 |
+
931,
|
939 |
+
680,
|
940 |
+
508
|
941 |
+
],
|
942 |
+
"totals": [
|
943 |
+
2006,
|
944 |
+
1940,
|
945 |
+
1874,
|
946 |
+
1808
|
947 |
+
],
|
948 |
+
"precisions": [
|
949 |
+
0.6874376869391824,
|
950 |
+
0.4798969072164948,
|
951 |
+
0.3628601921024546,
|
952 |
+
0.2809734513274336
|
953 |
+
],
|
954 |
+
"bp": 0.9695654687972447,
|
955 |
+
"sys_len": 2006,
|
956 |
+
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.4152155549652011,
|
958 |
+
"score": 0.4152155549652011,
|
959 |
+
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.3836282048944182,
|
961 |
+
"score_ci_high": 0.4533991390034356,
|
962 |
+
"sacrebleu_ci_low": 0.3836282048944182,
|
963 |
+
"sacrebleu_ci_high": 0.4533991390034356
|
964 |
+
},
|
965 |
+
"mt_flores_101_eng_kor": {
|
966 |
+
"num_of_instances": 66,
|
967 |
+
"counts": [
|
968 |
+
1035,
|
969 |
+
422,
|
970 |
+
219,
|
971 |
+
125
|
972 |
+
],
|
973 |
+
"totals": [
|
974 |
+
3325,
|
975 |
+
3259,
|
976 |
+
3193,
|
977 |
+
3127
|
978 |
+
],
|
979 |
+
"precisions": [
|
980 |
+
0.3112781954887218,
|
981 |
+
0.12948757287511506,
|
982 |
+
0.0685875352333229,
|
983 |
+
0.03997441637352094
|
984 |
+
],
|
985 |
+
"bp": 1.0,
|
986 |
+
"sys_len": 3325,
|
987 |
+
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.10253001707008509,
|
989 |
+
"score": 0.10253001707008509,
|
990 |
+
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.08168831530386227,
|
992 |
+
"score_ci_high": 0.1371689608842245,
|
993 |
+
"sacrebleu_ci_low": 0.08168831530386227,
|
994 |
+
"sacrebleu_ci_high": 0.1371689608842245
|
995 |
+
},
|
996 |
+
"mt_flores_101_eng_por": {
|
997 |
+
"num_of_instances": 66,
|
998 |
+
"counts": [
|
999 |
+
1365,
|
1000 |
+
935,
|
1001 |
+
696,
|
1002 |
+
519
|
1003 |
+
],
|
1004 |
+
"totals": [
|
1005 |
+
1887,
|
1006 |
+
1821,
|
1007 |
+
1755,
|
1008 |
+
1689
|
1009 |
+
],
|
1010 |
+
"precisions": [
|
1011 |
+
0.7233704292527823,
|
1012 |
+
0.513454146073586,
|
1013 |
+
0.3965811965811966,
|
1014 |
+
0.3072824156305506
|
1015 |
+
],
|
1016 |
+
"bp": 0.9847491803389177,
|
1017 |
+
"sys_len": 1887,
|
1018 |
+
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.45421208890996323,
|
1020 |
+
"score": 0.45421208890996323,
|
1021 |
+
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.402877439448594,
|
1023 |
+
"score_ci_high": 0.49512376744317715,
|
1024 |
+
"sacrebleu_ci_low": 0.402877439448594,
|
1025 |
+
"sacrebleu_ci_high": 0.49512376744317715
|
1026 |
+
},
|
1027 |
+
"mt_flores_101_eng_ron": {
|
1028 |
+
"num_of_instances": 66,
|
1029 |
+
"counts": [
|
1030 |
+
930,
|
1031 |
+
427,
|
1032 |
+
223,
|
1033 |
+
122
|
1034 |
+
],
|
1035 |
+
"totals": [
|
1036 |
+
1966,
|
1037 |
+
1900,
|
1038 |
+
1834,
|
1039 |
+
1768
|
1040 |
+
],
|
1041 |
+
"precisions": [
|
1042 |
+
0.47304170905391657,
|
1043 |
+
0.22473684210526315,
|
1044 |
+
0.12159214830970556,
|
1045 |
+
0.06900452488687783
|
1046 |
+
],
|
1047 |
+
"bp": 1.0,
|
1048 |
+
"sys_len": 1966,
|
1049 |
+
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.17281809069385612,
|
1051 |
+
"score": 0.17281809069385612,
|
1052 |
+
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.1498451419191345,
|
1054 |
+
"score_ci_high": 0.20240539093526114,
|
1055 |
+
"sacrebleu_ci_low": 0.1498451419191345,
|
1056 |
+
"sacrebleu_ci_high": 0.20240539093526114
|
1057 |
+
},
|
1058 |
+
"mt_flores_101_eng_spa": {
|
1059 |
+
"num_of_instances": 66,
|
1060 |
+
"counts": [
|
1061 |
+
1186,
|
1062 |
+
603,
|
1063 |
+
337,
|
1064 |
+
189
|
1065 |
+
],
|
1066 |
+
"totals": [
|
1067 |
+
1982,
|
1068 |
+
1916,
|
1069 |
+
1850,
|
1070 |
+
1784
|
1071 |
+
],
|
1072 |
+
"precisions": [
|
1073 |
+
0.5983854692230071,
|
1074 |
+
0.31471816283924847,
|
1075 |
+
0.1821621621621622,
|
1076 |
+
0.10594170403587444
|
1077 |
+
],
|
1078 |
+
"bp": 0.9431530195225803,
|
1079 |
+
"sys_len": 1982,
|
1080 |
+
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.23157365627652982,
|
1082 |
+
"score": 0.23157365627652982,
|
1083 |
+
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.20035664766275735,
|
1085 |
+
"score_ci_high": 0.25766181006532113,
|
1086 |
+
"sacrebleu_ci_low": 0.20035664766275735,
|
1087 |
+
"sacrebleu_ci_high": 0.25766181006532113
|
1088 |
+
},
|
1089 |
+
"mt_flores_101_fra_eng": {
|
1090 |
+
"num_of_instances": 66,
|
1091 |
+
"counts": [
|
1092 |
+
1263,
|
1093 |
+
786,
|
1094 |
+
517,
|
1095 |
+
354
|
1096 |
+
],
|
1097 |
+
"totals": [
|
1098 |
+
1831,
|
1099 |
+
1765,
|
1100 |
+
1699,
|
1101 |
+
1633
|
1102 |
+
],
|
1103 |
+
"precisions": [
|
1104 |
+
0.6897870016384489,
|
1105 |
+
0.44532577903682724,
|
1106 |
+
0.3042966450853443,
|
1107 |
+
0.21677893447642377
|
1108 |
+
],
|
1109 |
+
"bp": 1.0,
|
1110 |
+
"sys_len": 1831,
|
1111 |
+
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.3772912827525828,
|
1113 |
+
"score": 0.3772912827525828,
|
1114 |
+
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.3318073525019781,
|
1116 |
+
"score_ci_high": 0.42160864308969,
|
1117 |
+
"sacrebleu_ci_low": 0.3318073525019781,
|
1118 |
+
"sacrebleu_ci_high": 0.42160864308969
|
1119 |
+
},
|
1120 |
+
"mt_flores_101_jpn_eng": {
|
1121 |
+
"num_of_instances": 66,
|
1122 |
+
"counts": [
|
1123 |
+
998,
|
1124 |
+
440,
|
1125 |
+
238,
|
1126 |
+
140
|
1127 |
+
],
|
1128 |
+
"totals": [
|
1129 |
+
1869,
|
1130 |
+
1803,
|
1131 |
+
1737,
|
1132 |
+
1671
|
1133 |
+
],
|
1134 |
+
"precisions": [
|
1135 |
+
0.5339753879079722,
|
1136 |
+
0.24403771491957849,
|
1137 |
+
0.13701784686240645,
|
1138 |
+
0.08378216636744465
|
1139 |
+
],
|
1140 |
+
"bp": 1.0,
|
1141 |
+
"sys_len": 1869,
|
1142 |
+
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.1966648424448395,
|
1144 |
+
"score": 0.1966648424448395,
|
1145 |
+
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.16525947632394583,
|
1147 |
+
"score_ci_high": 0.23654666880731012,
|
1148 |
+
"sacrebleu_ci_low": 0.16525947632394583,
|
1149 |
+
"sacrebleu_ci_high": 0.23654666880731012
|
1150 |
+
},
|
1151 |
+
"mt_flores_101_kor_eng": {
|
1152 |
+
"num_of_instances": 66,
|
1153 |
+
"counts": [
|
1154 |
+
952,
|
1155 |
+
450,
|
1156 |
+
247,
|
1157 |
+
136
|
1158 |
+
],
|
1159 |
+
"totals": [
|
1160 |
+
1808,
|
1161 |
+
1742,
|
1162 |
+
1676,
|
1163 |
+
1610
|
1164 |
+
],
|
1165 |
+
"precisions": [
|
1166 |
+
0.5265486725663717,
|
1167 |
+
0.25832376578645233,
|
1168 |
+
0.1473747016706444,
|
1169 |
+
0.084472049689441
|
1170 |
+
],
|
1171 |
+
"bp": 1.0,
|
1172 |
+
"sys_len": 1808,
|
1173 |
+
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.2028545317121833,
|
1175 |
+
"score": 0.2028545317121833,
|
1176 |
+
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.17683650189535868,
|
1178 |
+
"score_ci_high": 0.25600334996477725,
|
1179 |
+
"sacrebleu_ci_low": 0.17683650189535868,
|
1180 |
+
"sacrebleu_ci_high": 0.25600334996477725
|
1181 |
+
},
|
1182 |
+
"mt_flores_101_por_eng": {
|
1183 |
+
"num_of_instances": 66,
|
1184 |
+
"counts": [
|
1185 |
+
1256,
|
1186 |
+
812,
|
1187 |
+
560,
|
1188 |
+
392
|
1189 |
+
],
|
1190 |
+
"totals": [
|
1191 |
+
1782,
|
1192 |
+
1716,
|
1193 |
+
1650,
|
1194 |
+
1584
|
1195 |
+
],
|
1196 |
+
"precisions": [
|
1197 |
+
0.7048260381593715,
|
1198 |
+
0.4731934731934732,
|
1199 |
+
0.33939393939393936,
|
1200 |
+
0.2474747474747475
|
1201 |
+
],
|
1202 |
+
"bp": 1.0,
|
1203 |
+
"sys_len": 1782,
|
1204 |
+
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.409108887747912,
|
1206 |
+
"score": 0.409108887747912,
|
1207 |
+
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.36492758260668207,
|
1209 |
+
"score_ci_high": 0.46703692233170646,
|
1210 |
+
"sacrebleu_ci_low": 0.36492758260668207,
|
1211 |
+
"sacrebleu_ci_high": 0.46703692233170646
|
1212 |
+
},
|
1213 |
+
"mt_flores_101_ron_eng": {
|
1214 |
+
"num_of_instances": 66,
|
1215 |
+
"counts": [
|
1216 |
+
1197,
|
1217 |
+
729,
|
1218 |
+
477,
|
1219 |
+
310
|
1220 |
+
],
|
1221 |
+
"totals": [
|
1222 |
+
1815,
|
1223 |
+
1749,
|
1224 |
+
1683,
|
1225 |
+
1617
|
1226 |
+
],
|
1227 |
+
"precisions": [
|
1228 |
+
0.659504132231405,
|
1229 |
+
0.41680960548885077,
|
1230 |
+
0.28342245989304815,
|
1231 |
+
0.191713048855906
|
1232 |
+
],
|
1233 |
+
"bp": 1.0,
|
1234 |
+
"sys_len": 1815,
|
1235 |
+
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.34959104085020015,
|
1237 |
+
"score": 0.34959104085020015,
|
1238 |
+
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.30660403979630557,
|
1240 |
+
"score_ci_high": 0.3928177080574808,
|
1241 |
+
"sacrebleu_ci_low": 0.30660403979630557,
|
1242 |
+
"sacrebleu_ci_high": 0.3928177080574808
|
1243 |
+
},
|
1244 |
+
"mt_flores_101_spa_eng": {
|
1245 |
+
"num_of_instances": 66,
|
1246 |
+
"counts": [
|
1247 |
+
1110,
|
1248 |
+
579,
|
1249 |
+
330,
|
1250 |
+
191
|
1251 |
+
],
|
1252 |
+
"totals": [
|
1253 |
+
1811,
|
1254 |
+
1745,
|
1255 |
+
1679,
|
1256 |
+
1613
|
1257 |
+
],
|
1258 |
+
"precisions": [
|
1259 |
+
0.6129210381004969,
|
1260 |
+
0.33180515759312323,
|
1261 |
+
0.19654556283502087,
|
1262 |
+
0.11841289522628642
|
1263 |
+
],
|
1264 |
+
"bp": 1.0,
|
1265 |
+
"sys_len": 1811,
|
1266 |
+
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.2622934684900747,
|
1268 |
+
"score": 0.2622934684900747,
|
1269 |
+
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.22911977694045185,
|
1271 |
+
"score_ci_high": 0.30518353214635846,
|
1272 |
+
"sacrebleu_ci_low": 0.22911977694045185,
|
1273 |
+
"sacrebleu_ci_high": 0.30518353214635846
|
+            },
+            "score": 0.27697004057638286,
+            "score_name": "subsets_mean",
+            "num_of_instances": 990
+        },
+        "score": 0.484621964093495,
+        "score_name": "subsets_mean",
+        "num_of_instances": 12472
+    }
+}
results/bluebench/2025-06-19T16-09-06_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
+{
+    "environment_info": {
+        "timestamp_utc": "2025-06-19T20:09:01.492000Z",
+        "command_line_invocation": [
+            "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
+            "--tasks",
+            "benchmarks.bluebench",
+            "--model",
+            "cross_provider",
+            "--model_args",
+            "model_name=watsonx/ibm/granite-3-2b-instruct,max_tokens=256",
+            "--output_path",
+            "./results/bluebench",
+            "--log_samples",
+            "--trust_remote_code",
+            "--batch_size",
+            "8",
+            "--verbosity",
+            "ERROR"
+        ],
+        "parsed_arguments": {
+            "tasks": [
+                "benchmarks.bluebench"
+            ],
+            "split": "test",
+            "num_fewshots": null,
+            "limit": null,
+            "batch_size": 8,
+            "model": "watsonx/ibm/granite-3-2b-instruct",
+            "model_args": {
+                "max_tokens": 256
+            },
+            "gen_kwargs": null,
+            "chat_template_kwargs": null,
+            "output_path": "./results/bluebench",
+            "output_file_prefix": "evaluation_results",
+            "log_samples": true,
+            "verbosity": "ERROR",
+            "apply_chat_template": false,
+            "trust_remote_code": true,
+            "disable_hf_cache": false,
+            "cache_dir": null
+        },
+        "unitxt_version": "1.24.0",
+        "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
+        "python_version": "3.10.18",
+        "system": "Linux",
+        "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49 |
+
"installed_packages": {
|
50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
51 |
+
"triton": "3.3.1",
|
52 |
+
"nltk": "3.9.1",
|
53 |
+
"anyio": "4.9.0",
|
54 |
+
"absl-py": "2.3.0",
|
55 |
+
"tiktoken": "0.9.0",
|
56 |
+
"charset-normalizer": "3.4.2",
|
57 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
58 |
+
"sympy": "1.14.0",
|
59 |
+
"mecab-ko": "1.0.1",
|
60 |
+
"litellm": "1.72.6.post1",
|
61 |
+
"httpcore": "1.0.9",
|
62 |
+
"Jinja2": "3.1.6",
|
63 |
+
"jsonschema-specifications": "2025.4.1",
|
64 |
+
"pydantic_core": "2.33.2",
|
65 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
66 |
+
"yarl": "1.20.1",
|
67 |
+
"openai": "1.88.0",
|
68 |
+
"portalocker": "3.2.0",
|
69 |
+
"pandas": "2.3.0",
|
70 |
+
"multiprocess": "0.70.16",
|
71 |
+
"jsonschema": "4.24.0",
|
72 |
+
"unitxt": "1.24.0",
|
73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
75 |
+
"pydantic": "2.11.7",
|
76 |
+
"async-timeout": "5.0.1",
|
77 |
+
"annotated-types": "0.7.0",
|
78 |
+
"rouge_score": "0.1.2",
|
79 |
+
"contourpy": "1.3.2",
|
80 |
+
"aiosignal": "1.3.2",
|
81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
82 |
+
"pillow": "11.2.1",
|
83 |
+
"six": "1.17.0",
|
84 |
+
"diskcache": "5.6.3",
|
85 |
+
"tqdm": "4.67.1",
|
86 |
+
"pyarrow": "20.0.0",
|
87 |
+
"h11": "0.16.0",
|
88 |
+
"zipp": "3.19.2",
|
89 |
+
"tzdata": "2025.2",
|
90 |
+
"bert-score": "0.3.13",
|
91 |
+
"setuptools": "80.9.0",
|
92 |
+
"referencing": "0.36.2",
|
93 |
+
"sacrebleu": "2.5.1",
|
94 |
+
"filelock": "3.18.0",
|
95 |
+
"urllib3": "2.5.0",
|
96 |
+
"scipy": "1.15.3",
|
97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
98 |
+
"kiwisolver": "1.4.8",
|
99 |
+
"networkx": "3.4.2",
|
100 |
+
"typing-inspection": "0.4.1",
|
101 |
+
"lxml": "5.4.0",
|
102 |
+
"sniffio": "1.3.1",
|
103 |
+
"scikit-learn": "1.7.0",
|
104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
105 |
+
"pip": "25.1.1",
|
106 |
+
"fonttools": "4.58.4",
|
107 |
+
"transformers": "4.52.4",
|
108 |
+
"datasets": "3.6.0",
|
109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
110 |
+
"cycler": "0.12.1",
|
111 |
+
"evaluate": "0.4.3",
|
112 |
+
"distro": "1.9.0",
|
113 |
+
"idna": "3.10",
|
114 |
+
"MarkupSafe": "3.0.2",
|
115 |
+
"frozenlist": "1.7.0",
|
116 |
+
"pyparsing": "3.2.3",
|
117 |
+
"jiter": "0.10.0",
|
118 |
+
"importlib_metadata": "8.0.0",
|
119 |
+
"packaging": "24.2",
|
120 |
+
"psutil": "7.0.0",
|
121 |
+
"mecab-ko-dic": "1.0.0",
|
122 |
+
"joblib": "1.5.1",
|
123 |
+
"fsspec": "2025.3.0",
|
124 |
+
"dill": "0.3.8",
|
125 |
+
"tokenizers": "0.21.1",
|
126 |
+
"wheel": "0.45.1",
|
127 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
128 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
129 |
+
"hf-xet": "1.1.4",
|
130 |
+
"propcache": "0.3.2",
|
131 |
+
"numpy": "2.2.6",
|
132 |
+
"mpmath": "1.3.0",
|
133 |
+
"multidict": "6.5.0",
|
134 |
+
"conllu": "6.0.0",
|
135 |
+
"safetensors": "0.5.3",
|
136 |
+
"requests": "2.32.4",
|
137 |
+
"regex": "2024.11.6",
|
138 |
+
"aiohttp": "3.12.13",
|
139 |
+
"tabulate": "0.9.0",
|
140 |
+
"certifi": "2025.6.15",
|
141 |
+
"accelerate": "1.8.0",
|
142 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
143 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
144 |
+
"click": "8.2.1",
|
145 |
+
"typing_extensions": "4.12.2",
|
146 |
+
"attrs": "25.3.0",
|
147 |
+
"exceptiongroup": "1.3.0",
|
148 |
+
"tenacity": "9.1.2",
|
149 |
+
"pytz": "2025.2",
|
150 |
+
"aiohappyeyeballs": "2.6.1",
|
151 |
+
"python-dateutil": "2.9.0.post0",
|
152 |
+
"torch": "2.7.1",
|
153 |
+
"python-dotenv": "1.1.0",
|
154 |
+
"httpx": "0.28.1",
|
155 |
+
"matplotlib": "3.10.3",
|
156 |
+
"xxhash": "3.5.0",
|
157 |
+
"PyYAML": "6.0.2",
|
158 |
+
"huggingface-hub": "0.33.0",
|
159 |
+
"colorama": "0.4.6",
|
160 |
+
"rpds-py": "0.25.1",
|
161 |
+
"threadpoolctl": "3.6.0",
|
162 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
163 |
+
"jaraco.collections": "5.1.0",
|
164 |
+
"tomli": "2.0.1",
|
165 |
+
"backports.tarfile": "1.2.0",
|
166 |
+
"jaraco.context": "5.3.0",
|
167 |
+
"typeguard": "4.3.0",
|
168 |
+
"autocommand": "2.2.2",
|
169 |
+
"jaraco.text": "3.12.1",
|
170 |
+
"more-itertools": "10.3.0",
|
171 |
+
"platformdirs": "4.2.2",
|
172 |
+
"inflect": "7.3.1",
|
173 |
+
"jaraco.functools": "4.0.1"
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"results": {
|
177 |
+
"bias": {
|
178 |
+
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.5777777777777777,
|
180 |
+
"accuracy_ci_low": 0.4777777777777778,
|
181 |
+
"accuracy_ci_high": 0.6777777777777778,
|
182 |
+
"score_name": "accuracy",
|
183 |
+
"score": 0.5777777777777777,
|
184 |
+
"score_ci_high": 0.6777777777777778,
|
185 |
+
"score_ci_low": 0.4777777777777778,
|
186 |
+
"num_of_instances": 90
|
187 |
+
},
|
188 |
+
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.6777777777777778,
|
190 |
+
"accuracy_ci_low": 0.5777777777777777,
|
191 |
+
"accuracy_ci_high": 0.7666666666666667,
|
192 |
+
"score_name": "accuracy",
|
193 |
+
"score": 0.6777777777777778,
|
194 |
+
"score_ci_high": 0.7666666666666667,
|
195 |
+
"score_ci_low": 0.5777777777777777,
|
196 |
+
"num_of_instances": 90
|
197 |
+
},
|
198 |
+
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.8111111111111111,
|
200 |
+
"accuracy_ci_low": 0.7222222222222222,
|
201 |
+
"accuracy_ci_high": 0.8777777777777778,
|
202 |
+
"score_name": "accuracy",
|
203 |
+
"score": 0.8111111111111111,
|
204 |
+
"score_ci_high": 0.8777777777777778,
|
205 |
+
"score_ci_low": 0.7222222222222222,
|
206 |
+
"num_of_instances": 90
|
207 |
+
},
|
208 |
+
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.43333333333333335,
|
210 |
+
"accuracy_ci_low": 0.3333333333333333,
|
211 |
+
"accuracy_ci_high": 0.5333333333333333,
|
212 |
+
"score_name": "accuracy",
|
213 |
+
"score": 0.43333333333333335,
|
214 |
+
"score_ci_high": 0.5333333333333333,
|
215 |
+
"score_ci_low": 0.3333333333333333,
|
216 |
+
"num_of_instances": 90
|
217 |
+
},
|
218 |
+
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.5888888888888889,
|
220 |
+
"accuracy_ci_low": 0.48197626978907726,
|
221 |
+
"accuracy_ci_high": 0.6888888888888889,
|
222 |
+
"score_name": "accuracy",
|
223 |
+
"score": 0.5888888888888889,
|
224 |
+
"score_ci_high": 0.6888888888888889,
|
225 |
+
"score_ci_low": 0.48197626978907726,
|
226 |
+
"num_of_instances": 90
|
227 |
+
},
|
228 |
+
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.8777777777777778,
|
230 |
+
"accuracy_ci_low": 0.8,
|
231 |
+
"accuracy_ci_high": 0.9333333333333333,
|
232 |
+
"score_name": "accuracy",
|
233 |
+
"score": 0.8777777777777778,
|
234 |
+
"score_ci_high": 0.9333333333333333,
|
235 |
+
"score_ci_low": 0.8,
|
236 |
+
"num_of_instances": 90
|
237 |
+
},
|
238 |
+
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.7444444444444445,
|
240 |
+
"accuracy_ci_low": 0.6444444444444445,
|
241 |
+
"accuracy_ci_high": 0.8333333333333334,
|
242 |
+
"score_name": "accuracy",
|
243 |
+
"score": 0.7444444444444445,
|
244 |
+
"score_ci_high": 0.8333333333333334,
|
245 |
+
"score_ci_low": 0.6444444444444445,
|
246 |
+
"num_of_instances": 90
|
247 |
+
},
|
248 |
+
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.6222222222222222,
|
250 |
+
"accuracy_ci_low": 0.5222222222222223,
|
251 |
+
"accuracy_ci_high": 0.7222222222222222,
|
252 |
+
"score_name": "accuracy",
|
253 |
+
"score": 0.6222222222222222,
|
254 |
+
"score_ci_high": 0.7222222222222222,
|
255 |
+
"score_ci_low": 0.5222222222222223,
|
256 |
+
"num_of_instances": 90
|
257 |
+
},
|
258 |
+
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.6,
|
260 |
+
"accuracy_ci_low": 0.5,
|
261 |
+
"accuracy_ci_high": 0.7,
|
262 |
+
"score_name": "accuracy",
|
263 |
+
"score": 0.6,
|
264 |
+
"score_ci_high": 0.7,
|
265 |
+
"score_ci_low": 0.5,
|
266 |
+
"num_of_instances": 90
|
267 |
+
},
|
268 |
+
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.6333333333333333,
|
270 |
+
"accuracy_ci_low": 0.5333333333333333,
|
271 |
+
"accuracy_ci_high": 0.7333333333333333,
|
272 |
+
"score_name": "accuracy",
|
273 |
+
"score": 0.6333333333333333,
|
274 |
+
"score_ci_high": 0.7333333333333333,
|
275 |
+
"score_ci_low": 0.5333333333333333,
|
276 |
+
"num_of_instances": 90
|
277 |
+
},
|
278 |
+
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.7,
|
280 |
+
"accuracy_ci_low": 0.5969530984549517,
|
281 |
+
"accuracy_ci_high": 0.7798809350059414,
|
282 |
+
"score_name": "accuracy",
|
283 |
+
"score": 0.7,
|
284 |
+
"score_ci_high": 0.7798809350059414,
|
285 |
+
"score_ci_low": 0.5969530984549517,
|
286 |
+
"num_of_instances": 90
|
287 |
+
},
|
288 |
+
"score": 0.6606060606060606,
|
289 |
+
"score_name": "subsets_mean",
|
290 |
+
"num_of_instances": 990
|
291 |
+
},
|
292 |
+
"chatbot_abilities": {
|
293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
+
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.5,
|
296 |
+
"score": 0.5,
|
297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
+
},
|
299 |
+
"score": 0.5,
|
300 |
+
"score_name": "subsets_mean",
|
301 |
+
"num_of_instances": 500
|
302 |
+
},
|
303 |
+
"entity_extraction": {
|
304 |
+
"universal_ner_en_ewt": {
|
305 |
+
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.4156626506024097,
|
307 |
+
"f1_Organization": 0.31372549019607837,
|
308 |
+
"f1_Location": 0.23140495867768596,
|
309 |
+
"f1_macro": 0.32026436649205803,
|
310 |
+
"recall_macro": 0.2686052593300962,
|
311 |
+
"precision_macro": 0.40524414740424186,
|
312 |
+
"in_classes_support": 0.6173913043478261,
|
313 |
+
"f1_micro": 0.26363636363636367,
|
314 |
+
"recall_micro": 0.2761904761904762,
|
315 |
+
"precision_micro": 0.25217391304347825,
|
316 |
+
"score": 0.26363636363636367,
|
317 |
+
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.22691460915409117,
|
319 |
+
"score_ci_high": 0.3036479298321143,
|
320 |
+
"f1_micro_ci_low": 0.22691460915409117,
|
321 |
+
"f1_micro_ci_high": 0.3036479298321143
|
322 |
+
},
|
323 |
+
"score": 0.26363636363636367,
|
324 |
+
"score_name": "subsets_mean",
|
325 |
+
"num_of_instances": 1000
|
326 |
+
},
|
327 |
+
"knowledge": {
|
328 |
+
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.43661971830985913,
|
330 |
+
"accuracy_ci_low": 0.323943661971831,
|
331 |
+
"accuracy_ci_high": 0.5633802816901409,
|
332 |
+
"score_name": "accuracy",
|
333 |
+
"score": 0.43661971830985913,
|
334 |
+
"score_ci_high": 0.5633802816901409,
|
335 |
+
"score_ci_low": 0.323943661971831,
|
336 |
+
"num_of_instances": 71
|
337 |
+
},
|
338 |
+
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.2112676056338028,
|
340 |
+
"accuracy_ci_low": 0.1267605633802817,
|
341 |
+
"accuracy_ci_high": 0.30985915492957744,
|
342 |
+
"score_name": "accuracy",
|
343 |
+
"score": 0.2112676056338028,
|
344 |
+
"score_ci_high": 0.30985915492957744,
|
345 |
+
"score_ci_low": 0.1267605633802817,
|
346 |
+
"num_of_instances": 71
|
347 |
+
},
|
348 |
+
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.23943661971830985,
|
350 |
+
"accuracy_ci_low": 0.14084507042253522,
|
351 |
+
"accuracy_ci_high": 0.3380281690140845,
|
352 |
+
"score_name": "accuracy",
|
353 |
+
"score": 0.23943661971830985,
|
354 |
+
"score_ci_high": 0.3380281690140845,
|
355 |
+
"score_ci_low": 0.14084507042253522,
|
356 |
+
"num_of_instances": 71
|
357 |
+
},
|
358 |
+
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.3380281690140845,
|
360 |
+
"accuracy_ci_low": 0.22535211267605634,
|
361 |
+
"accuracy_ci_high": 0.4507042253521127,
|
362 |
+
"score_name": "accuracy",
|
363 |
+
"score": 0.3380281690140845,
|
364 |
+
"score_ci_high": 0.4507042253521127,
|
365 |
+
"score_ci_low": 0.22535211267605634,
|
366 |
+
"num_of_instances": 71
|
367 |
+
},
|
368 |
+
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.38028169014084506,
|
370 |
+
"accuracy_ci_low": 0.28169014084507044,
|
371 |
+
"accuracy_ci_high": 0.5070422535211268,
|
372 |
+
"score_name": "accuracy",
|
373 |
+
"score": 0.38028169014084506,
|
374 |
+
"score_ci_high": 0.5070422535211268,
|
375 |
+
"score_ci_low": 0.28169014084507044,
|
376 |
+
"num_of_instances": 71
|
377 |
+
},
|
378 |
+
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.2112676056338028,
|
380 |
+
"accuracy_ci_low": 0.1267605633802817,
|
381 |
+
"accuracy_ci_high": 0.30985915492957744,
|
382 |
+
"score_name": "accuracy",
|
383 |
+
"score": 0.2112676056338028,
|
384 |
+
"score_ci_high": 0.30985915492957744,
|
385 |
+
"score_ci_low": 0.1267605633802817,
|
386 |
+
"num_of_instances": 71
|
387 |
+
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.19718309859154928,
|
390 |
+
"accuracy_ci_low": 0.11267605633802817,
|
391 |
+
"accuracy_ci_high": 0.30985915492957744,
|
392 |
+
"score_name": "accuracy",
|
393 |
+
"score": 0.19718309859154928,
|
394 |
+
"score_ci_high": 0.30985915492957744,
|
395 |
+
"score_ci_low": 0.11267605633802817,
|
396 |
+
"num_of_instances": 71
|
397 |
+
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.2676056338028169,
|
400 |
+
"accuracy_ci_low": 0.15492957746478872,
|
401 |
+
"accuracy_ci_high": 0.38028169014084506,
|
402 |
+
"score_name": "accuracy",
|
403 |
+
"score": 0.2676056338028169,
|
404 |
+
"score_ci_high": 0.38028169014084506,
|
405 |
+
"score_ci_low": 0.15492957746478872,
|
406 |
+
"num_of_instances": 71
|
407 |
+
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.1267605633802817,
|
410 |
+
"accuracy_ci_low": 0.056338028169014086,
|
411 |
+
"accuracy_ci_high": 0.22535211267605634,
|
412 |
+
"score_name": "accuracy",
|
413 |
+
"score": 0.1267605633802817,
|
414 |
+
"score_ci_high": 0.22535211267605634,
|
415 |
+
"score_ci_low": 0.056338028169014086,
|
416 |
+
"num_of_instances": 71
|
417 |
+
},
|
418 |
+
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.09859154929577464,
|
420 |
+
"accuracy_ci_low": 0.04225352112676056,
|
421 |
+
"accuracy_ci_high": 0.18309859154929578,
|
422 |
+
"score_name": "accuracy",
|
423 |
+
"score": 0.09859154929577464,
|
424 |
+
"score_ci_high": 0.18309859154929578,
|
425 |
+
"score_ci_low": 0.04225352112676056,
|
426 |
+
"num_of_instances": 71
|
427 |
+
},
|
428 |
+
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.19718309859154928,
|
430 |
+
"accuracy_ci_low": 0.11267605633802817,
|
431 |
+
"accuracy_ci_high": 0.29577464788732394,
|
432 |
+
"score_name": "accuracy",
|
433 |
+
"score": 0.19718309859154928,
|
434 |
+
"score_ci_high": 0.29577464788732394,
|
435 |
+
"score_ci_low": 0.11267605633802817,
|
436 |
+
"num_of_instances": 71
|
437 |
+
},
|
438 |
+
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.28169014084507044,
|
440 |
+
"accuracy_ci_low": 0.18309859154929578,
|
441 |
+
"accuracy_ci_high": 0.39436619718309857,
|
442 |
+
"score_name": "accuracy",
|
443 |
+
"score": 0.28169014084507044,
|
444 |
+
"score_ci_high": 0.39436619718309857,
|
445 |
+
"score_ci_low": 0.18309859154929578,
|
446 |
+
"num_of_instances": 71
|
447 |
+
},
|
448 |
+
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.19718309859154928,
|
450 |
+
"accuracy_ci_low": 0.11267605633802817,
|
451 |
+
"accuracy_ci_high": 0.29577464788732394,
|
452 |
+
"score_name": "accuracy",
|
453 |
+
"score": 0.19718309859154928,
|
454 |
+
"score_ci_high": 0.29577464788732394,
|
455 |
+
"score_ci_low": 0.11267605633802817,
|
456 |
+
"num_of_instances": 71
|
457 |
+
},
|
458 |
+
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.38028169014084506,
|
460 |
+
"accuracy_ci_low": 0.28169014084507044,
|
461 |
+
"accuracy_ci_high": 0.5070422535211268,
|
462 |
+
"score_name": "accuracy",
|
463 |
+
"score": 0.38028169014084506,
|
464 |
+
"score_ci_high": 0.5070422535211268,
|
465 |
+
"score_ci_low": 0.28169014084507044,
|
466 |
+
"num_of_instances": 71
|
467 |
+
},
|
468 |
+
"score": 0.2545271629778672,
|
469 |
+
"score_name": "subsets_mean",
|
470 |
+
"num_of_instances": 994
|
471 |
+
},
|
472 |
+
"legal": {
|
473 |
+
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.40599479862637755,
|
475 |
+
"f1_suggestive": 0.16666666666666666,
|
476 |
+
"f1_generic": 0.2727272727272727,
|
477 |
+
"f1_arbitrary": 0.5263157894736842,
|
478 |
+
"f1_fanciful": 0.5777777777777777,
|
479 |
+
"f1_descriptive": 0.4864864864864865,
|
480 |
+
"f1_macro_ci_low": 0.31051398566318733,
|
481 |
+
"f1_macro_ci_high": 0.5136277650253285,
|
482 |
+
"score_name": "f1_micro",
|
483 |
+
"score": 0.4457831325301205,
|
484 |
+
"score_ci_high": 0.550817717180019,
|
485 |
+
"score_ci_low": 0.3373493975903614,
|
486 |
+
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.43529411764705883,
|
488 |
+
"accuracy_ci_low": 0.32941176470588235,
|
489 |
+
"accuracy_ci_high": 0.5411764705882353,
|
490 |
+
"f1_micro": 0.4457831325301205,
|
491 |
+
"f1_micro_ci_low": 0.3373493975903614,
|
492 |
+
"f1_micro_ci_high": 0.550817717180019
|
493 |
+
},
|
494 |
+
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.4964746474647465,
|
496 |
+
"f1_no": 0.7656765676567657,
|
497 |
+
"f1_yes": 0.22727272727272727,
|
498 |
+
"f1_macro_ci_low": 0.43223753510688345,
|
499 |
+
"f1_macro_ci_high": 0.5682608970547502,
|
500 |
+
"score_name": "f1_micro",
|
501 |
+
"score": 0.6445012787723785,
|
502 |
+
"score_ci_high": 0.7025641025641025,
|
503 |
+
"score_ci_low": 0.5728900255754475,
|
504 |
+
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.63,
|
506 |
+
"accuracy_ci_low": 0.56,
|
507 |
+
"accuracy_ci_high": 0.69,
|
508 |
+
"f1_micro": 0.6445012787723785,
|
509 |
+
"f1_micro_ci_low": 0.5728900255754475,
|
510 |
+
"f1_micro_ci_high": 0.7025641025641025
|
511 |
+
},
|
512 |
+
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.20456925120109243,
|
514 |
+
"f1_conclusion": 0.047619047619047616,
|
515 |
+
"f1_decree": 0.26666666666666666,
|
516 |
+
"f1_issue": 0.18947368421052632,
|
517 |
+
"f1_analysis": 0.3125,
|
518 |
+
"f1_facts": 0.2857142857142857,
|
519 |
+
"f1_procedural history": 0.19047619047619047,
|
520 |
+
"f1_rule": 0.13953488372093023,
|
521 |
+
"f1_macro_ci_low": 0.15195580870715297,
|
522 |
+
"f1_macro_ci_high": 0.2695847948134964,
|
523 |
+
"score_name": "f1_micro",
|
524 |
+
"score": 0.20911528150134048,
|
525 |
+
"score_ci_high": 0.2716626596010836,
|
526 |
+
"score_ci_low": 0.15343915343915343,
|
527 |
+
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.195,
|
529 |
+
"accuracy_ci_low": 0.14164584898806754,
|
530 |
+
"accuracy_ci_high": 0.255,
|
531 |
+
"f1_micro": 0.20911528150134048,
|
532 |
+
"f1_micro_ci_low": 0.15343915343915343,
|
533 |
+
"f1_micro_ci_high": 0.2716626596010836
|
534 |
+
},
|
535 |
+
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.45312172637754033,
|
537 |
+
"f1_yes": 0.5225225225225225,
|
538 |
+
"f1_no": 0.38372093023255816,
|
539 |
+
"f1_macro_ci_low": 0.3838533922470516,
|
540 |
+
"f1_macro_ci_high": 0.5196559838649608,
|
541 |
+
"score_name": "f1_micro",
|
542 |
+
"score": 0.4619289340101523,
|
543 |
+
"score_ci_high": 0.5291073254863808,
|
544 |
+
"score_ci_low": 0.39285714285714285,
|
545 |
+
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.455,
|
547 |
+
"accuracy_ci_low": 0.385,
|
548 |
+
"accuracy_ci_high": 0.525,
|
549 |
+
"f1_micro": 0.4619289340101523,
|
550 |
+
"f1_micro_ci_low": 0.39285714285714285,
|
551 |
+
"f1_micro_ci_high": 0.5291073254863808
|
552 |
+
},
|
553 |
+
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.7812889165628891,
|
555 |
+
"f1_yes": 0.7671232876712328,
|
556 |
+
"f1_no": 0.7954545454545454,
|
557 |
+
"f1_macro_ci_low": 0.6841107145759989,
|
558 |
+
"f1_macro_ci_high": 0.8570115576895313,
|
559 |
+
"score_name": "f1_micro",
|
560 |
+
"score": 0.782608695652174,
|
561 |
+
"score_ci_high": 0.8554216867469879,
|
562 |
+
"score_ci_low": 0.6867321408585169,
|
563 |
+
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.7411764705882353,
|
565 |
+
"accuracy_ci_low": 0.6470588235294118,
|
566 |
+
"accuracy_ci_high": 0.8235294117647058,
|
567 |
+
"f1_micro": 0.782608695652174,
|
568 |
+
"f1_micro_ci_low": 0.6867321408585169,
|
569 |
+
"f1_micro_ci_high": 0.8554216867469879
|
570 |
+
},
|
571 |
+
"score": 0.5087874644932331,
|
572 |
+
"score_name": "subsets_mean",
|
573 |
+
"num_of_instances": 770
|
574 |
+
},
|
575 |
+
"news_classification": {
|
576 |
+
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.34833143635091446,
|
578 |
+
"f1_cars": 0.6590909090909091,
|
579 |
+
"f1_windows x": 0.030303030303030304,
|
580 |
+
"f1_atheism": 0.19047619047619047,
|
581 |
+
"f1_christianity": 0.2,
|
582 |
+
"f1_religion": 0.19047619047619047,
|
583 |
+
"f1_medicine": 0.6567164179104478,
|
584 |
+
"f1_computer graphics": 0.34782608695652173,
|
585 |
+
"f1_microsoft windows": 0.29850746268656714,
|
586 |
+
"f1_middle east": 0.11764705882352941,
|
587 |
+
"f1_politics": 0.20754716981132076,
|
588 |
+
"f1_motorcycles": 0.43373493975903615,
|
589 |
+
"f1_pc hardware": 0.3973509933774834,
|
590 |
+
"f1_mac hardware": 0.3950617283950617,
|
591 |
+
"f1_electronics": 0.4186046511627907,
|
592 |
+
"f1_for sale": 0.08695652173913043,
|
593 |
+
"f1_guns": 0.14814814814814814,
|
594 |
+
"f1_space": 0.4935064935064935,
|
595 |
+
"f1_cryptography": 0.47368421052631576,
|
596 |
+
"f1_baseball": 0.6890756302521008,
|
597 |
+
"f1_hockey": 0.5319148936170213,
|
598 |
+
"f1_macro_ci_low": 0.32107229927440883,
|
599 |
+
"f1_macro_ci_high": 0.37798520058634305,
|
600 |
+
"score_name": "f1_micro",
|
601 |
+
"score": 0.3750771128932758,
|
602 |
+
"score_ci_high": 0.40812055333180686,
|
603 |
+
"score_ci_low": 0.3412059307716769,
|
604 |
+
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.304,
|
606 |
+
"accuracy_ci_low": 0.275,
|
607 |
+
"accuracy_ci_high": 0.333,
|
608 |
+
"f1_micro": 0.3750771128932758,
|
609 |
+
"f1_micro_ci_low": 0.3412059307716769,
|
610 |
+
"f1_micro_ci_high": 0.40812055333180686
|
611 |
+
},
|
612 |
+
"score": 0.3750771128932758,
|
613 |
+
"score_name": "subsets_mean",
|
614 |
+
"num_of_instances": 1000
|
615 |
+
},
|
616 |
+
"product_help": {
|
617 |
+
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.49544359373400404,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.8558139534883721,
|
620 |
+
"f1_credit card or prepaid card": 0.4778761061946903,
|
621 |
+
"f1_debt collection": 0.45517241379310347,
|
622 |
+
"f1_checking or savings account": 0.4791666666666667,
|
623 |
+
"f1_payday loan or title loan or personal loan": 0.1875,
|
624 |
+
"f1_vehicle loan or lease": 0.30303030303030304,
|
625 |
+
"f1_mortgage": 0.6909090909090909,
|
626 |
+
"f1_money transfer or virtual currency or money service": 0.34285714285714286,
|
627 |
+
"f1_student loan": 0.6666666666666666,
|
628 |
+
"f1_macro_ci_low": 0.437780332452402,
|
629 |
+
"f1_macro_ci_high": 0.5517479827666423,
|
630 |
+
"score_name": "f1_micro",
|
631 |
+
"score": 0.7417582417582418,
|
632 |
+
"score_ci_high": 0.7670380361466397,
|
633 |
+
"score_ci_low": 0.7139576080586072,
|
634 |
+
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.675,
|
636 |
+
"accuracy_ci_low": 0.644,
|
637 |
+
"accuracy_ci_high": 0.705,
|
638 |
+
"f1_micro": 0.7417582417582418,
|
639 |
+
"f1_micro_ci_low": 0.7139576080586072,
|
640 |
+
"f1_micro_ci_high": 0.7670380361466397
|
641 |
+
},
|
642 |
+
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.5417314583908202,
|
644 |
+
"f1_mortgages and loans": 0.6705202312138728,
|
645 |
+
"f1_credit card": 0.5815602836879432,
|
646 |
+
"f1_debt collection": 0.5073170731707317,
|
647 |
+
"f1_credit reporting": 0.6431372549019608,
|
648 |
+
"f1_retail banking": 0.30612244897959184,
|
649 |
+
"f1_macro_ci_low": 0.4990357022335705,
|
650 |
+
"f1_macro_ci_high": 0.5863760959900322,
|
651 |
+
"score_name": "f1_micro",
|
652 |
+
"score": 0.5688073394495413,
|
653 |
+
"score_ci_high": 0.6118783685965219,
|
654 |
+
"score_ci_low": 0.5227795175898966,
|
655 |
+
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.496,
|
657 |
+
"accuracy_ci_low": 0.452,
|
658 |
+
"accuracy_ci_high": 0.538,
|
659 |
+
"f1_micro": 0.5688073394495413,
|
660 |
+
"f1_micro_ci_low": 0.5227795175898966,
|
661 |
+
"f1_micro_ci_high": 0.6118783685965219
|
662 |
+
},
|
663 |
+
"score": 0.6552827906038916,
|
664 |
+
"score_name": "subsets_mean",
|
665 |
+
"num_of_instances": 1500
|
666 |
+
},
|
667 |
+
"qa_finance": {
|
668 |
+
"fin_qa": {
|
669 |
+
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.107,
|
671 |
+
"score": 0.107,
|
672 |
+
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.091,
|
674 |
+
"program_accuracy_ci_low": 0.08721723629561164,
|
675 |
+
"program_accuracy_ci_high": 0.126,
|
676 |
+
"score_ci_low": 0.08721723629561164,
|
677 |
+
"score_ci_high": 0.126,
|
678 |
+
"execution_accuracy_ci_low": 0.074,
|
679 |
+
"execution_accuracy_ci_high": 0.109
|
680 |
+
},
|
681 |
+
"score": 0.107,
|
682 |
+
"score_name": "subsets_mean",
|
683 |
+
"num_of_instances": 1000
|
684 |
+
},
|
685 |
+
"rag_general": {
|
686 |
+
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.27411911391097254,
|
688 |
+
"recall": 0.5230720245824972,
|
689 |
+
"f1": 0.30495867779564406,
|
690 |
+
"precision_ci_low": 0.2547573973545031,
|
691 |
+
"precision_ci_high": 0.2921448180759812,
|
692 |
+
"recall_ci_low": 0.5070386905327111,
|
693 |
+
"recall_ci_high": 0.5400839972442947,
|
694 |
+
"f1_ci_low": 0.28775327483111873,
|
695 |
+
"f1_ci_high": 0.32083305804218265,
|
696 |
+
"score_name": "f1",
|
697 |
+
"score": 0.30495867779564406,
|
698 |
+
"score_ci_high": 0.32083305804218265,
|
699 |
+
"score_ci_low": 0.28775327483111873,
|
700 |
+
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.5782615457475185,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6598645970225334,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5351804447919131,
|
704 |
+
"faithfullness_f1_token_overlap": 0.34366402575185945,
|
705 |
+
"faithfullness_recall_token_overlap": 0.27898727759494896,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5756506161288797,
|
707 |
+
"correctness_f1_token_overlap": 0.30495867779564406,
|
708 |
+
"correctness_recall_token_overlap": 0.5230720245824972,
|
709 |
+
"correctness_precision_token_overlap": 0.27411911391097254
|
710 |
+
},
|
711 |
+
"score": 0.30495867779564406,
|
712 |
+
"score_name": "subsets_mean",
|
713 |
+
"num_of_instances": 600
|
714 |
+
},
|
715 |
+
"reasoning": {
|
716 |
+
"hellaswag": {
|
717 |
+
"accuracy": 0.388,
|
718 |
+
"accuracy_ci_low": 0.358,
|
719 |
+
"accuracy_ci_high": 0.417,
|
720 |
+
"score_name": "accuracy",
|
721 |
+
"score": 0.388,
|
722 |
+
"score_ci_high": 0.417,
|
723 |
+
"score_ci_low": 0.358,
|
724 |
+
"num_of_instances": 1000
|
725 |
+
},
|
726 |
+
"openbook_qa": {
|
727 |
+
"accuracy": 0.728,
|
728 |
+
"accuracy_ci_low": 0.692,
|
729 |
+
"accuracy_ci_high": 0.764,
|
730 |
+
"score_name": "accuracy",
|
731 |
+
"score": 0.728,
|
732 |
+
"score_ci_high": 0.764,
|
733 |
+
"score_ci_low": 0.692,
|
734 |
+
"num_of_instances": 500
|
735 |
+
},
|
736 |
+
"score": 0.558,
|
737 |
+
"score_name": "subsets_mean",
|
738 |
+
"num_of_instances": 1500
|
739 |
+
},
|
740 |
+
"safety": {
|
741 |
+
"attaq_500": {
|
742 |
+
"severity_critical": 0.0,
|
743 |
+
"severity_high": 0.0,
|
744 |
+
"severity_medium": 1.0,
|
745 |
+
"severity_low": 99.0,
|
746 |
+
"category_harmful_info": 0.8488387024207483,
|
747 |
+
"category_deception": 0.866081887573907,
|
748 |
+
"category_explicit_content": 0.8272491132821699,
|
749 |
+
"category_violence": 0.812888801746839,
|
750 |
+
"category_discrimination": 0.8425866682378074,
|
751 |
+
"category_substance_abuse": 0.9123246139950222,
|
752 |
+
"category_pii": 0.7867479754818811,
|
753 |
+
"safety": 0.8366520641242258,
|
754 |
+
"safety_ci_low": 0.8141729035472625,
|
755 |
+
"safety_ci_high": 0.8588271358338173,
|
756 |
+
"score_name": "safety",
|
757 |
+
"score": 0.8366520641242258,
|
758 |
+
"score_ci_high": 0.8588271358338173,
|
759 |
+
"score_ci_low": 0.8141729035472625,
|
760 |
+
"num_of_instances": 100
|
761 |
+
},
|
762 |
+
"score": 0.8366520641242258,
|
763 |
+
"score_name": "subsets_mean",
|
764 |
+
"num_of_instances": 100
|
765 |
+
},
|
766 |
+
"summarization": {
|
767 |
+
"billsum_document_filtered_to_6000_chars": {
|
768 |
+
"num_of_instances": 528,
|
769 |
+
"rouge2": 0.1946123321830468,
|
770 |
+
"rouge1": 0.4165480993021762,
|
771 |
+
"rougeLsum": 0.3481713172994178,
|
772 |
+
"rougeL": 0.2807625515077543,
|
773 |
+
"score": 0.2807625515077543,
|
774 |
+
"score_name": "rougeL",
|
775 |
+
"rouge2_ci_low": 0.18807199323963344,
|
776 |
+
"rouge2_ci_high": 0.2007897635489223,
|
777 |
+
"rouge1_ci_low": 0.40802841361294645,
|
778 |
+
"rouge1_ci_high": 0.4240881706518218,
|
779 |
+
"rougeLsum_ci_low": 0.34046630242406717,
|
780 |
+
"rougeLsum_ci_high": 0.35565311375927156,
|
781 |
+
"rougeL_ci_low": 0.2745389110131783,
|
782 |
+
"rougeL_ci_high": 0.2870232677361269,
|
783 |
+
"score_ci_low": 0.2745389110131783,
|
784 |
+
"score_ci_high": 0.2870232677361269
|
785 |
+
},
|
786 |
+
"tldr_document_filtered_to_6000_chars": {
|
787 |
+
"num_of_instances": 1000,
|
788 |
+
"rouge2": 0.013499529737597162,
|
789 |
+
"rouge1": 0.1111081768530587,
|
790 |
+
"rougeLsum": 0.0922891945088228,
|
791 |
+
"rougeL": 0.07992633322696455,
|
792 |
+
"score": 0.07992633322696455,
|
793 |
+
"score_name": "rougeL",
|
794 |
+
"rouge2_ci_low": 0.012006599571612698,
|
795 |
+
"rouge2_ci_high": 0.015167305255576668,
|
796 |
+
"rouge1_ci_low": 0.10596612811589602,
|
797 |
+
"rouge1_ci_high": 0.11561580840527891,
|
798 |
+
"rougeLsum_ci_low": 0.08846385121818591,
|
799 |
+
"rougeLsum_ci_high": 0.09604727885686246,
|
800 |
+
"rougeL_ci_low": 0.0765698806517895,
|
801 |
+
"rougeL_ci_high": 0.0830415577853562,
|
802 |
+
"score_ci_low": 0.0765698806517895,
|
803 |
+
"score_ci_high": 0.0830415577853562
|
804 |
+
},
|
805 |
+
"score": 0.18034444236735941,
|
806 |
+
"score_name": "subsets_mean",
|
807 |
+
"num_of_instances": 1528
|
808 |
+
},
|
809 |
+
"translation": {
|
810 |
+
"mt_flores_101_ara_eng": {
|
811 |
+
"num_of_instances": 66,
|
812 |
+
"counts": [
|
813 |
+
1002,
|
814 |
+
497,
|
815 |
+
282,
|
816 |
+
169
|
817 |
+
],
|
818 |
+
"totals": [
|
819 |
+
1844,
|
820 |
+
1778,
|
821 |
+
1712,
|
822 |
+
1646
|
823 |
+
],
|
824 |
+
"precisions": [
|
825 |
+
0.5433839479392625,
|
826 |
+
0.2795275590551181,
|
827 |
+
0.1647196261682243,
|
828 |
+
0.10267314702308626
|
829 |
+
],
|
830 |
+
"bp": 1.0,
|
831 |
+
"sys_len": 1844,
|
832 |
+
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.22513002244943295,
|
834 |
+
"score": 0.22513002244943295,
|
835 |
+
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.18442974136293813,
|
837 |
+
"score_ci_high": 0.26207208386144243,
|
838 |
+
"sacrebleu_ci_low": 0.18442974136293813,
|
839 |
+
"sacrebleu_ci_high": 0.26207208386144243
|
840 |
+
},
|
841 |
+
"mt_flores_101_deu_eng": {
|
842 |
+
"num_of_instances": 66,
|
843 |
+
"counts": [
|
844 |
+
1117,
|
845 |
+
634,
|
846 |
+
393,
|
847 |
+
252
|
848 |
+
],
|
849 |
+
"totals": [
|
850 |
+
1756,
|
851 |
+
1690,
|
852 |
+
1624,
|
853 |
+
1558
|
854 |
+
],
|
855 |
+
"precisions": [
|
856 |
+
0.6361047835990888,
|
857 |
+
0.37514792899408284,
|
858 |
+
0.2419950738916256,
|
859 |
+
0.16174582798459564
|
860 |
+
],
|
861 |
+
"bp": 1.0,
|
862 |
+
"sys_len": 1756,
|
863 |
+
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.3108799453655372,
|
865 |
+
"score": 0.3108799453655372,
|
866 |
+
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.26939824952148095,
|
868 |
+
"score_ci_high": 0.35631092704009654,
|
869 |
+
"sacrebleu_ci_low": 0.26939824952148095,
|
870 |
+
"sacrebleu_ci_high": 0.35631092704009654
|
871 |
+
},
|
872 |
+
"mt_flores_101_eng_ara": {
|
873 |
+
"num_of_instances": 66,
|
874 |
+
"counts": [
|
875 |
+
518,
|
876 |
+
145,
|
877 |
+
48,
|
878 |
+
15
|
879 |
+
],
|
880 |
+
"totals": [
|
881 |
+
1654,
|
882 |
+
1588,
|
883 |
+
1522,
|
884 |
+
1456
|
885 |
+
],
|
886 |
+
"precisions": [
|
887 |
+
0.313180169286578,
|
888 |
+
0.09130982367758186,
|
889 |
+
0.03153745072273324,
|
890 |
+
0.0103021978021978
|
891 |
+
],
|
892 |
+
"bp": 1.0,
|
893 |
+
"sys_len": 1654,
|
894 |
+
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.055209912255726495,
|
896 |
+
"score": 0.055209912255726495,
|
897 |
+
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.04194038184823177,
|
899 |
+
"score_ci_high": 0.08190458345551464,
|
900 |
+
"sacrebleu_ci_low": 0.04194038184823177,
|
901 |
+
"sacrebleu_ci_high": 0.08190458345551464
|
902 |
+
},
|
903 |
+
"mt_flores_101_eng_deu": {
|
904 |
+
"num_of_instances": 66,
|
905 |
+
"counts": [
|
906 |
+
932,
|
907 |
+
408,
|
908 |
+
209,
|
909 |
+
113
|
910 |
+
],
|
911 |
+
"totals": [
|
912 |
+
1864,
|
913 |
+
1798,
|
914 |
+
1732,
|
915 |
+
1666
|
916 |
+
],
|
917 |
+
"precisions": [
|
918 |
+
0.5,
|
919 |
+
0.22691879866518352,
|
920 |
+
0.12066974595842955,
|
921 |
+
0.06782713085234093
|
922 |
+
],
|
923 |
+
"bp": 1.0,
|
924 |
+
"sys_len": 1864,
|
925 |
+
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.17456637003886807,
|
927 |
+
"score": 0.17456637003886807,
|
928 |
+
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.1446545609941566,
|
930 |
+
"score_ci_high": 0.2117995060278497,
|
931 |
+
"sacrebleu_ci_low": 0.1446545609941566,
|
932 |
+
"sacrebleu_ci_high": 0.2117995060278497
|
933 |
+
},
|
934 |
+
"mt_flores_101_eng_fra": {
|
935 |
+
"num_of_instances": 66,
|
936 |
+
"counts": [
|
937 |
+
1281,
|
938 |
+
791,
|
939 |
+
540,
|
940 |
+
380
|
941 |
+
],
|
942 |
+
"totals": [
|
943 |
+
2064,
|
944 |
+
1998,
|
945 |
+
1932,
|
946 |
+
1866
|
947 |
+
],
|
948 |
+
"precisions": [
|
949 |
+
0.6206395348837209,
|
950 |
+
0.39589589589589586,
|
951 |
+
0.27950310559006214,
|
952 |
+
0.20364415862808144
|
953 |
+
],
|
954 |
+
"bp": 0.9980638921833086,
|
955 |
+
"sys_len": 2064,
|
956 |
+
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.3432243584069162,
|
958 |
+
"score": 0.3432243584069162,
|
959 |
+
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.31863429882294914,
|
961 |
+
"score_ci_high": 0.38949054032191865,
|
962 |
+
"sacrebleu_ci_low": 0.31863429882294914,
|
963 |
+
"sacrebleu_ci_high": 0.38949054032191865
|
964 |
+
},
|
965 |
+
"mt_flores_101_eng_kor": {
|
966 |
+
"num_of_instances": 66,
|
967 |
+
"counts": [
|
968 |
+
943,
|
969 |
+
328,
|
970 |
+
141,
|
971 |
+
66
|
972 |
+
],
|
973 |
+
"totals": [
|
974 |
+
3253,
|
975 |
+
3187,
|
976 |
+
3121,
|
977 |
+
3055
|
978 |
+
],
|
979 |
+
"precisions": [
|
980 |
+
0.28988625883799574,
|
981 |
+
0.10291810480075306,
|
982 |
+
0.04517782761935277,
|
983 |
+
0.02160392798690671
|
984 |
+
],
|
985 |
+
"bp": 1.0,
|
986 |
+
"sys_len": 3253,
|
987 |
+
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.07345889118508468,
|
989 |
+
"score": 0.07345889118508468,
|
990 |
+
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.06129370081536535,
|
992 |
+
"score_ci_high": 0.09632357969438955,
|
993 |
+
"sacrebleu_ci_low": 0.06129370081536535,
|
994 |
+
"sacrebleu_ci_high": 0.09632357969438955
|
995 |
+
},
|
996 |
+
"mt_flores_101_eng_por": {
|
997 |
+
"num_of_instances": 66,
|
998 |
+
"counts": [
|
999 |
+
1229,
|
1000 |
+
727,
|
1001 |
+
477,
|
1002 |
+
316
|
1003 |
+
],
|
1004 |
+
"totals": [
|
1005 |
+
1915,
|
1006 |
+
1849,
|
1007 |
+
1783,
|
1008 |
+
1717
|
1009 |
+
],
|
1010 |
+
"precisions": [
|
1011 |
+
0.64177545691906,
|
1012 |
+
0.3931855056787452,
|
1013 |
+
0.2675266404935502,
|
1014 |
+
0.18404193360512522
|
1015 |
+
],
|
1016 |
+
"bp": 0.9994779431076575,
|
1017 |
+
"sys_len": 1915,
|
1018 |
+
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.3336870259850046,
|
1020 |
+
"score": 0.3336870259850046,
|
1021 |
+
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.30427044055098784,
|
1023 |
+
"score_ci_high": 0.3722955797843792,
|
1024 |
+
"sacrebleu_ci_low": 0.30427044055098784,
|
1025 |
+
"sacrebleu_ci_high": 0.3722955797843792
|
1026 |
+
},
|
1027 |
+
"mt_flores_101_eng_ron": {
|
1028 |
+
"num_of_instances": 66,
|
1029 |
+
"counts": [
|
1030 |
+
776,
|
1031 |
+
292,
|
1032 |
+
137,
|
1033 |
+
66
|
1034 |
+
],
|
1035 |
+
"totals": [
|
1036 |
+
2002,
|
1037 |
+
1936,
|
1038 |
+
1870,
|
1039 |
+
1804
|
1040 |
+
],
|
1041 |
+
"precisions": [
|
1042 |
+
0.3876123876123876,
|
1043 |
+
0.15082644628099173,
|
1044 |
+
0.0732620320855615,
|
1045 |
+
0.03658536585365854
|
1046 |
+
],
|
1047 |
+
"bp": 1.0,
|
1048 |
+
"sys_len": 2002,
|
1049 |
+
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.11188332833173187,
|
1051 |
+
"score": 0.11188332833173187,
|
1052 |
+
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.09410502256897492,
|
1054 |
+
"score_ci_high": 0.14085608331376978,
|
1055 |
+
"sacrebleu_ci_low": 0.09410502256897492,
|
1056 |
+
"sacrebleu_ci_high": 0.14085608331376978
|
1057 |
+
},
|
1058 |
+
"mt_flores_101_eng_spa": {
|
1059 |
+
"num_of_instances": 66,
|
1060 |
+
"counts": [
|
1061 |
+
1155,
|
1062 |
+
545,
|
1063 |
+
284,
|
1064 |
+
157
|
1065 |
+
],
|
1066 |
+
"totals": [
|
1067 |
+
2065,
|
1068 |
+
1999,
|
1069 |
+
1933,
|
1070 |
+
1867
|
1071 |
+
],
|
1072 |
+
"precisions": [
|
1073 |
+
0.559322033898305,
|
1074 |
+
0.2726363181590795,
|
1075 |
+
0.14692188308329024,
|
1076 |
+
0.08409212640599893
|
1077 |
+
],
|
1078 |
+
"bp": 0.9841463832388515,
|
1079 |
+
"sys_len": 2065,
|
1080 |
+
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.20503668186599197,
|
1082 |
+
"score": 0.20503668186599197,
|
1083 |
+
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.17287865798739985,
|
1085 |
+
"score_ci_high": 0.22839534867300298,
|
1086 |
+
"sacrebleu_ci_low": 0.17287865798739985,
|
1087 |
+
"sacrebleu_ci_high": 0.22839534867300298
|
1088 |
+
},
|
1089 |
+
"mt_flores_101_fra_eng": {
|
1090 |
+
"num_of_instances": 66,
|
1091 |
+
"counts": [
|
1092 |
+
1154,
|
1093 |
+
673,
|
1094 |
+
416,
|
1095 |
+
270
|
1096 |
+
],
|
1097 |
+
"totals": [
|
1098 |
+
1805,
|
1099 |
+
1739,
|
1100 |
+
1673,
|
1101 |
+
1607
|
1102 |
+
],
|
1103 |
+
"precisions": [
|
1104 |
+
0.6393351800554017,
|
1105 |
+
0.38700402530189765,
|
1106 |
+
0.24865511057979678,
|
1107 |
+
0.16801493466085873
|
1108 |
+
],
|
1109 |
+
"bp": 1.0,
|
1110 |
+
"sys_len": 1805,
|
1111 |
+
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.31885801670800706,
|
1113 |
+
"score": 0.31885801670800706,
|
1114 |
+
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.2891892296952914,
|
1116 |
+
"score_ci_high": 0.36690842488060277,
|
1117 |
+
"sacrebleu_ci_low": 0.2891892296952914,
|
1118 |
+
"sacrebleu_ci_high": 0.36690842488060277
|
1119 |
+
},
|
1120 |
+
"mt_flores_101_jpn_eng": {
|
1121 |
+
"num_of_instances": 66,
|
1122 |
+
"counts": [
|
1123 |
+
932,
|
1124 |
+
394,
|
1125 |
+
198,
|
1126 |
+
108
|
1127 |
+
],
|
1128 |
+
"totals": [
|
1129 |
+
2022,
|
1130 |
+
1956,
|
1131 |
+
1890,
|
1132 |
+
1824
|
1133 |
+
],
|
1134 |
+
"precisions": [
|
1135 |
+
0.4609297725024728,
|
1136 |
+
0.20143149284253578,
|
1137 |
+
0.10476190476190476,
|
1138 |
+
0.05921052631578948
|
1139 |
+
],
|
1140 |
+
"bp": 1.0,
|
1141 |
+
"sys_len": 2022,
|
1142 |
+
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.15491415770056607,
|
1144 |
+
"score": 0.15491415770056607,
|
1145 |
+
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.12994962612263172,
|
1147 |
+
"score_ci_high": 0.1870372431393324,
|
1148 |
+
"sacrebleu_ci_low": 0.12994962612263172,
|
1149 |
+
"sacrebleu_ci_high": 0.1870372431393324
|
1150 |
+
},
|
1151 |
+
"mt_flores_101_kor_eng": {
|
1152 |
+
"num_of_instances": 66,
|
1153 |
+
"counts": [
|
1154 |
+
842,
|
1155 |
+
316,
|
1156 |
+
161,
|
1157 |
+
84
|
1158 |
+
],
|
1159 |
+
"totals": [
|
1160 |
+
1863,
|
1161 |
+
1797,
|
1162 |
+
1731,
|
1163 |
+
1665
|
1164 |
+
],
|
1165 |
+
"precisions": [
|
1166 |
+
0.451959205582394,
|
1167 |
+
0.17584863661658318,
|
1168 |
+
0.09300982091276719,
|
1169 |
+
0.05045045045045045
|
1170 |
+
],
|
1171 |
+
"bp": 1.0,
|
1172 |
+
"sys_len": 1863,
|
1173 |
+
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.1389658296637508,
|
1175 |
+
"score": 0.1389658296637508,
|
1176 |
+
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.11444856857935395,
|
1178 |
+
"score_ci_high": 0.17296950184028867,
|
1179 |
+
"sacrebleu_ci_low": 0.11444856857935395,
|
1180 |
+
"sacrebleu_ci_high": 0.17296950184028867
|
1181 |
+
},
|
1182 |
+
"mt_flores_101_por_eng": {
|
1183 |
+
"num_of_instances": 66,
|
1184 |
+
"counts": [
|
1185 |
+
1125,
|
1186 |
+
650,
|
1187 |
+
405,
|
1188 |
+
268
|
1189 |
+
],
|
1190 |
+
"totals": [
|
1191 |
+
1763,
|
1192 |
+
1697,
|
1193 |
+
1631,
|
1194 |
+
1565
|
1195 |
+
],
|
1196 |
+
"precisions": [
|
1197 |
+
0.6381168462847419,
|
1198 |
+
0.38302887448438416,
|
1199 |
+
0.24831391784181484,
|
1200 |
+
0.17124600638977636
|
1201 |
+
],
|
1202 |
+
"bp": 1.0,
|
1203 |
+
"sys_len": 1763,
|
1204 |
+
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.31929220031180316,
|
1206 |
+
"score": 0.31929220031180316,
|
1207 |
+
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.27700187051697595,
|
1209 |
+
"score_ci_high": 0.3625973900935978,
|
1210 |
+
"sacrebleu_ci_low": 0.27700187051697595,
|
1211 |
+
"sacrebleu_ci_high": 0.3625973900935978
|
1212 |
+
},
|
1213 |
+
"mt_flores_101_ron_eng": {
|
1214 |
+
"num_of_instances": 66,
|
1215 |
+
"counts": [
|
1216 |
+
1107,
|
1217 |
+
596,
|
1218 |
+
373,
|
1219 |
+
242
|
1220 |
+
],
|
1221 |
+
"totals": [
|
1222 |
+
1855,
|
1223 |
+
1789,
|
1224 |
+
1723,
|
1225 |
+
1657
|
1226 |
+
],
|
1227 |
+
"precisions": [
|
1228 |
+
0.5967654986522911,
|
1229 |
+
0.33314700950251536,
|
1230 |
+
0.21648287869994196,
|
1231 |
+
0.1460470730235365
|
1232 |
+
],
|
1233 |
+
"bp": 1.0,
|
1234 |
+
"sys_len": 1855,
|
1235 |
+
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.28157170427283745,
|
1237 |
+
"score": 0.28157170427283745,
|
1238 |
+
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.24793101467267725,
|
1240 |
+
"score_ci_high": 0.34906947598840327,
|
1241 |
+
"sacrebleu_ci_low": 0.24793101467267725,
|
1242 |
+
"sacrebleu_ci_high": 0.34906947598840327
|
1243 |
+
},
|
1244 |
+
"mt_flores_101_spa_eng": {
|
1245 |
+
"num_of_instances": 66,
|
1246 |
+
"counts": [
|
1247 |
+
1030,
|
1248 |
+
507,
|
1249 |
+
273,
|
1250 |
+
149
|
1251 |
+
],
|
1252 |
+
"totals": [
|
1253 |
+
1843,
|
1254 |
+
1777,
|
1255 |
+
1711,
|
1256 |
+
1645
|
1257 |
+
],
|
1258 |
+
"precisions": [
|
1259 |
+
0.5588714053174173,
|
1260 |
+
0.28531232414181207,
|
1261 |
+
0.15955581531268265,
|
1262 |
+
0.0905775075987842
|
1263 |
+
],
|
1264 |
+
"bp": 1.0,
|
1265 |
+
"sys_len": 1843,
|
1266 |
+
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.21909948470764218,
|
1268 |
+
"score": 0.21909948470764218,
|
1269 |
+
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.18945741585306675,
|
1271 |
+
"score_ci_high": 0.26472135303184596,
|
1272 |
+
"sacrebleu_ci_low": 0.18945741585306675,
|
1273 |
+
"sacrebleu_ci_high": 0.26472135303184596
|
1274 |
+
},
|
1275 |
+
"score": 0.21771852861659338,
|
1276 |
+
"score_name": "subsets_mean",
|
1277 |
+
"num_of_instances": 990
|
1278 |
+
},
|
1279 |
+
"score": 0.4171223590857319,
|
1280 |
+
"score_name": "subsets_mean",
|
1281 |
+
"num_of_instances": 12472
|
1282 |
+
}
|
1283 |
+
}
|
results/bluebench/2025-06-19T16-21-09_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
|
1 |
+
{
|
2 |
+
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-19T20:21:05.821665Z",
|
4 |
+
"command_line_invocation": [
|
5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
+
"--tasks",
|
7 |
+
"benchmarks.bluebench",
|
8 |
+
"--model",
|
9 |
+
"cross_provider",
|
10 |
+
"--model_args",
|
11 |
+
"model_name=watsonx/ibm/granite-3-3-8b-instruct,max_tokens=256",
|
12 |
+
"--output_path",
|
13 |
+
"./results/bluebench",
|
14 |
+
"--log_samples",
|
15 |
+
"--trust_remote_code",
|
16 |
+
"--batch_size",
|
17 |
+
"8",
|
18 |
+
"--verbosity",
|
19 |
+
"ERROR"
|
20 |
+
],
|
21 |
+
"parsed_arguments": {
|
22 |
+
"tasks": [
|
23 |
+
"benchmarks.bluebench"
|
24 |
+
],
|
25 |
+
"split": "test",
|
26 |
+
"num_fewshots": null,
|
27 |
+
"limit": null,
|
28 |
+
"batch_size": 8,
|
29 |
+
"model": "watsonx/ibm/granite-3-3-8b-instruct",
|
30 |
+
"model_args": {
|
31 |
+
"max_tokens": 256
|
32 |
+
},
|
33 |
+
"gen_kwargs": null,
|
34 |
+
"chat_template_kwargs": null,
|
35 |
+
"output_path": "./results/bluebench",
|
36 |
+
"output_file_prefix": "evaluation_results",
|
37 |
+
"log_samples": true,
|
38 |
+
"verbosity": "ERROR",
|
39 |
+
"apply_chat_template": false,
|
40 |
+
"trust_remote_code": true,
|
41 |
+
"disable_hf_cache": false,
|
42 |
+
"cache_dir": null
|
43 |
+
},
|
44 |
+
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
|
46 |
+
"python_version": "3.10.18",
|
47 |
+
"system": "Linux",
|
48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
49 |
+
"installed_packages": {
|
50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
51 |
+
"triton": "3.3.1",
|
52 |
+
"nltk": "3.9.1",
|
53 |
+
"anyio": "4.9.0",
|
54 |
+
"absl-py": "2.3.0",
|
55 |
+
"tiktoken": "0.9.0",
|
56 |
+
"charset-normalizer": "3.4.2",
|
57 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
58 |
+
"sympy": "1.14.0",
|
59 |
+
"mecab-ko": "1.0.1",
|
60 |
+
"litellm": "1.72.6.post1",
|
61 |
+
"httpcore": "1.0.9",
|
62 |
+
"Jinja2": "3.1.6",
|
63 |
+
"jsonschema-specifications": "2025.4.1",
|
64 |
+
"pydantic_core": "2.33.2",
|
65 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
66 |
+
"yarl": "1.20.1",
|
67 |
+
"openai": "1.88.0",
|
68 |
+
"portalocker": "3.2.0",
|
69 |
+
"pandas": "2.3.0",
|
70 |
+
"multiprocess": "0.70.16",
|
71 |
+
"jsonschema": "4.24.0",
|
72 |
+
"unitxt": "1.24.0",
|
73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
75 |
+
"pydantic": "2.11.7",
|
76 |
+
"async-timeout": "5.0.1",
|
77 |
+
"annotated-types": "0.7.0",
|
78 |
+
"rouge_score": "0.1.2",
|
79 |
+
"contourpy": "1.3.2",
|
80 |
+
"aiosignal": "1.3.2",
|
81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
82 |
+
"pillow": "11.2.1",
|
83 |
+
"six": "1.17.0",
|
84 |
+
"diskcache": "5.6.3",
|
85 |
+
"tqdm": "4.67.1",
|
86 |
+
"pyarrow": "20.0.0",
|
87 |
+
"h11": "0.16.0",
|
88 |
+
"zipp": "3.19.2",
|
89 |
+
"tzdata": "2025.2",
|
90 |
+
"bert-score": "0.3.13",
|
91 |
+
"setuptools": "80.9.0",
|
92 |
+
"referencing": "0.36.2",
|
93 |
+
"sacrebleu": "2.5.1",
|
94 |
+
"filelock": "3.18.0",
|
95 |
+
"urllib3": "2.5.0",
|
96 |
+
"scipy": "1.15.3",
|
97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
98 |
+
"kiwisolver": "1.4.8",
|
99 |
+
"networkx": "3.4.2",
|
100 |
+
"typing-inspection": "0.4.1",
|
101 |
+
"lxml": "5.4.0",
|
102 |
+
"sniffio": "1.3.1",
|
103 |
+
"scikit-learn": "1.7.0",
|
104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
105 |
+
"pip": "25.1.1",
|
106 |
+
"fonttools": "4.58.4",
|
107 |
+
"transformers": "4.52.4",
|
108 |
+
"datasets": "3.6.0",
|
109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
110 |
+
"cycler": "0.12.1",
|
111 |
+
"evaluate": "0.4.3",
|
112 |
+
"distro": "1.9.0",
|
113 |
+
"idna": "3.10",
|
114 |
+
"MarkupSafe": "3.0.2",
|
115 |
+
"frozenlist": "1.7.0",
|
116 |
+
"pyparsing": "3.2.3",
|
117 |
+
"jiter": "0.10.0",
|
118 |
+
"importlib_metadata": "8.0.0",
|
119 |
+
"packaging": "24.2",
|
120 |
+
"psutil": "7.0.0",
|
121 |
+
"mecab-ko-dic": "1.0.0",
|
122 |
+
"joblib": "1.5.1",
|
123 |
+
"fsspec": "2025.3.0",
|
124 |
+
"dill": "0.3.8",
|
125 |
+
"tokenizers": "0.21.1",
|
126 |
+
"wheel": "0.45.1",
|
127 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
128 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
129 |
+
"hf-xet": "1.1.4",
|
130 |
+
"propcache": "0.3.2",
|
131 |
+
"numpy": "2.2.6",
|
132 |
+
"mpmath": "1.3.0",
|
133 |
+
"multidict": "6.5.0",
|
134 |
+
"conllu": "6.0.0",
|
135 |
+
"safetensors": "0.5.3",
|
136 |
+
"requests": "2.32.4",
|
137 |
+
"regex": "2024.11.6",
|
138 |
+
"aiohttp": "3.12.13",
|
139 |
+
"tabulate": "0.9.0",
|
140 |
+
"certifi": "2025.6.15",
|
141 |
+
"accelerate": "1.8.0",
|
142 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
143 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
144 |
+
"click": "8.2.1",
|
145 |
+
"typing_extensions": "4.12.2",
|
146 |
+
"attrs": "25.3.0",
|
147 |
+
"exceptiongroup": "1.3.0",
|
148 |
+
"tenacity": "9.1.2",
|
149 |
+
"pytz": "2025.2",
|
150 |
+
"aiohappyeyeballs": "2.6.1",
|
151 |
+
"python-dateutil": "2.9.0.post0",
|
152 |
+
"torch": "2.7.1",
|
153 |
+
"python-dotenv": "1.1.0",
|
154 |
+
"httpx": "0.28.1",
|
155 |
+
"matplotlib": "3.10.3",
|
156 |
+
"xxhash": "3.5.0",
|
157 |
+
"PyYAML": "6.0.2",
|
158 |
+
"huggingface-hub": "0.33.0",
|
159 |
+
"colorama": "0.4.6",
|
160 |
+
"rpds-py": "0.25.1",
|
161 |
+
"threadpoolctl": "3.6.0",
|
162 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
163 |
+
"jaraco.collections": "5.1.0",
|
164 |
+
"tomli": "2.0.1",
|
165 |
+
"backports.tarfile": "1.2.0",
|
166 |
+
"jaraco.context": "5.3.0",
|
167 |
+
"typeguard": "4.3.0",
|
168 |
+
"autocommand": "2.2.2",
|
169 |
+
"jaraco.text": "3.12.1",
|
170 |
+
"more-itertools": "10.3.0",
|
171 |
+
"platformdirs": "4.2.2",
|
172 |
+
"inflect": "7.3.1",
|
173 |
+
"jaraco.functools": "4.0.1"
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"results": {
|
177 |
+
"bias": {
|
178 |
+
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.5555555555555556,
|
180 |
+
"accuracy_ci_low": 0.45555555555555555,
|
181 |
+
"accuracy_ci_high": 0.6555555555555556,
|
182 |
+
"score_name": "accuracy",
|
183 |
+
"score": 0.5555555555555556,
|
184 |
+
"score_ci_high": 0.6555555555555556,
|
185 |
+
"score_ci_low": 0.45555555555555555,
|
186 |
+
"num_of_instances": 90
|
187 |
+
},
|
188 |
+
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.6222222222222222,
|
190 |
+
"accuracy_ci_low": 0.5222222222222223,
|
191 |
+
"accuracy_ci_high": 0.7222222222222222,
|
192 |
+
"score_name": "accuracy",
|
193 |
+
"score": 0.6222222222222222,
|
194 |
+
"score_ci_high": 0.7222222222222222,
|
195 |
+
"score_ci_low": 0.5222222222222223,
|
196 |
+
"num_of_instances": 90
|
197 |
+
},
|
198 |
+
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.8777777777777778,
|
200 |
+
"accuracy_ci_low": 0.8,
|
201 |
+
"accuracy_ci_high": 0.9333333333333333,
|
202 |
+
"score_name": "accuracy",
|
203 |
+
"score": 0.8777777777777778,
|
204 |
+
"score_ci_high": 0.9333333333333333,
|
205 |
+
"score_ci_low": 0.8,
|
206 |
+
"num_of_instances": 90
|
207 |
+
},
|
208 |
+
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.6333333333333333,
|
210 |
+
"accuracy_ci_low": 0.5333333333333333,
|
211 |
+
"accuracy_ci_high": 0.7333333333333333,
|
212 |
+
"score_name": "accuracy",
|
213 |
+
"score": 0.6333333333333333,
|
214 |
+
"score_ci_high": 0.7333333333333333,
|
215 |
+
"score_ci_low": 0.5333333333333333,
|
216 |
+
"num_of_instances": 90
|
217 |
+
},
|
218 |
+
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.6555555555555556,
|
220 |
+
"accuracy_ci_low": 0.5555555555555556,
|
221 |
+
"accuracy_ci_high": 0.7539633744548231,
|
222 |
+
"score_name": "accuracy",
|
223 |
+
"score": 0.6555555555555556,
|
224 |
+
"score_ci_high": 0.7539633744548231,
|
225 |
+
"score_ci_low": 0.5555555555555556,
|
226 |
+
"num_of_instances": 90
|
227 |
+
},
|
228 |
+
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.9333333333333333,
|
230 |
+
"accuracy_ci_low": 0.8666666666666667,
|
231 |
+
"accuracy_ci_high": 0.9777777777777777,
|
232 |
+
"score_name": "accuracy",
|
233 |
+
"score": 0.9333333333333333,
|
234 |
+
"score_ci_high": 0.9777777777777777,
|
235 |
+
"score_ci_low": 0.8666666666666667,
|
236 |
+
"num_of_instances": 90
|
237 |
+
},
|
238 |
+
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.8888888888888888,
|
240 |
+
"accuracy_ci_low": 0.8222222222222222,
|
241 |
+
"accuracy_ci_high": 0.9444444444444444,
|
242 |
+
"score_name": "accuracy",
|
243 |
+
"score": 0.8888888888888888,
|
244 |
+
"score_ci_high": 0.9444444444444444,
|
245 |
+
"score_ci_low": 0.8222222222222222,
|
246 |
+
"num_of_instances": 90
|
247 |
+
},
|
248 |
+
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.9333333333333333,
|
250 |
+
"accuracy_ci_low": 0.8666666666666667,
|
251 |
+
"accuracy_ci_high": 0.9777777777777777,
|
252 |
+
"score_name": "accuracy",
|
253 |
+
"score": 0.9333333333333333,
|
254 |
+
"score_ci_high": 0.9777777777777777,
|
255 |
+
"score_ci_low": 0.8666666666666667,
|
256 |
+
"num_of_instances": 90
|
257 |
+
},
|
258 |
+
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.7666666666666667,
|
260 |
+
"accuracy_ci_low": 0.6720698151047421,
|
261 |
+
"accuracy_ci_high": 0.8444444444444444,
|
262 |
+
"score_name": "accuracy",
|
263 |
+
"score": 0.7666666666666667,
|
264 |
+
"score_ci_high": 0.8444444444444444,
|
265 |
+
"score_ci_low": 0.6720698151047421,
|
266 |
+
"num_of_instances": 90
|
267 |
+
},
|
268 |
+
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.6333333333333333,
|
270 |
+
"accuracy_ci_low": 0.5333333333333333,
|
271 |
+
"accuracy_ci_high": 0.7283280971833935,
|
272 |
+
"score_name": "accuracy",
|
273 |
+
"score": 0.6333333333333333,
|
274 |
+
"score_ci_high": 0.7283280971833935,
|
275 |
+
"score_ci_low": 0.5333333333333333,
|
276 |
+
"num_of_instances": 90
|
277 |
+
},
|
278 |
+
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.7666666666666667,
|
280 |
+
"accuracy_ci_low": 0.6666666666666666,
|
281 |
+
"accuracy_ci_high": 0.8444444444444444,
|
282 |
+
"score_name": "accuracy",
|
283 |
+
"score": 0.7666666666666667,
|
284 |
+
"score_ci_high": 0.8444444444444444,
|
285 |
+
"score_ci_low": 0.6666666666666666,
|
286 |
+
"num_of_instances": 90
|
287 |
+
},
|
288 |
+
"score": 0.7515151515151515,
|
289 |
+
"score_name": "subsets_mean",
|
290 |
+
"num_of_instances": 990
|
291 |
+
},
|
292 |
+
"chatbot_abilities": {
|
293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
+
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.5,
|
296 |
+
"score": 0.5,
|
297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
+
},
|
299 |
+
"score": 0.5,
|
300 |
+
"score_name": "subsets_mean",
|
301 |
+
"num_of_instances": 500
|
302 |
+
},
|
303 |
+
"entity_extraction": {
|
304 |
+
"universal_ner_en_ewt": {
|
305 |
+
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.5102639296187683,
|
307 |
+
"f1_Organization": 0.3381294964028777,
|
308 |
+
"f1_Location": 0.35652173913043483,
|
309 |
+
"f1_macro": 0.40163838838402693,
|
310 |
+
"recall_macro": 0.3240210323686792,
|
311 |
+
"precision_macro": 0.530656067251462,
|
312 |
+
"in_classes_support": 0.5625,
|
313 |
+
"f1_micro": 0.31789282470481384,
|
314 |
+
"recall_micro": 0.3333333333333333,
|
315 |
+
"precision_micro": 0.3038194444444444,
|
316 |
+
"score": 0.31789282470481384,
|
317 |
+
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.26482961534023236,
|
319 |
+
"score_ci_high": 0.37029988780714157,
|
320 |
+
"f1_micro_ci_low": 0.26482961534023236,
|
321 |
+
"f1_micro_ci_high": 0.37029988780714157
|
322 |
+
},
|
323 |
+
"score": 0.31789282470481384,
|
324 |
+
"score_name": "subsets_mean",
|
325 |
+
"num_of_instances": 1000
|
326 |
+
},
|
327 |
+
"knowledge": {
|
328 |
+
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.5211267605633803,
|
330 |
+
"accuracy_ci_low": 0.4084507042253521,
|
331 |
+
"accuracy_ci_high": 0.6338028169014085,
|
332 |
+
"score_name": "accuracy",
|
333 |
+
"score": 0.5211267605633803,
|
334 |
+
"score_ci_high": 0.6338028169014085,
|
335 |
+
"score_ci_low": 0.4084507042253521,
|
336 |
+
"num_of_instances": 71
|
337 |
+
},
|
338 |
+
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.19718309859154928,
|
340 |
+
"accuracy_ci_low": 0.11267605633802817,
|
341 |
+
"accuracy_ci_high": 0.29577464788732394,
|
342 |
+
"score_name": "accuracy",
|
343 |
+
"score": 0.19718309859154928,
|
344 |
+
"score_ci_high": 0.29577464788732394,
|
345 |
+
"score_ci_low": 0.11267605633802817,
|
346 |
+
"num_of_instances": 71
|
347 |
+
},
|
348 |
+
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.23943661971830985,
|
350 |
+
"accuracy_ci_low": 0.15492957746478872,
|
351 |
+
"accuracy_ci_high": 0.3380281690140845,
|
352 |
+
"score_name": "accuracy",
|
353 |
+
"score": 0.23943661971830985,
|
354 |
+
"score_ci_high": 0.3380281690140845,
|
355 |
+
"score_ci_low": 0.15492957746478872,
|
356 |
+
"num_of_instances": 71
|
357 |
+
},
|
358 |
+
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.43661971830985913,
|
360 |
+
"accuracy_ci_low": 0.323943661971831,
|
361 |
+
"accuracy_ci_high": 0.5492957746478874,
|
362 |
+
"score_name": "accuracy",
|
363 |
+
"score": 0.43661971830985913,
|
364 |
+
"score_ci_high": 0.5492957746478874,
|
365 |
+
"score_ci_low": 0.323943661971831,
|
366 |
+
"num_of_instances": 71
|
367 |
+
},
|
368 |
+
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.38028169014084506,
|
370 |
+
"accuracy_ci_low": 0.2676056338028169,
|
371 |
+
"accuracy_ci_high": 0.49295774647887325,
|
372 |
+
"score_name": "accuracy",
|
373 |
+
"score": 0.38028169014084506,
|
374 |
+
"score_ci_high": 0.49295774647887325,
|
375 |
+
"score_ci_low": 0.2676056338028169,
|
376 |
+
"num_of_instances": 71
|
377 |
+
},
|
378 |
+
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.2535211267605634,
|
380 |
+
"accuracy_ci_low": 0.16901408450704225,
|
381 |
+
"accuracy_ci_high": 0.36048330202820134,
|
382 |
+
"score_name": "accuracy",
|
383 |
+
"score": 0.2535211267605634,
|
384 |
+
"score_ci_high": 0.36048330202820134,
|
385 |
+
"score_ci_low": 0.16901408450704225,
|
386 |
+
"num_of_instances": 71
|
387 |
+
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.36619718309859156,
|
390 |
+
"accuracy_ci_low": 0.2535211267605634,
|
391 |
+
"accuracy_ci_high": 0.4788732394366197,
|
392 |
+
"score_name": "accuracy",
|
393 |
+
"score": 0.36619718309859156,
|
394 |
+
"score_ci_high": 0.4788732394366197,
|
395 |
+
"score_ci_low": 0.2535211267605634,
|
396 |
+
"num_of_instances": 71
|
397 |
+
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.36619718309859156,
|
400 |
+
"accuracy_ci_low": 0.2535211267605634,
|
401 |
+
"accuracy_ci_high": 0.4788732394366197,
|
402 |
+
"score_name": "accuracy",
|
403 |
+
"score": 0.36619718309859156,
|
404 |
+
"score_ci_high": 0.4788732394366197,
|
405 |
+
"score_ci_low": 0.2535211267605634,
|
406 |
+
"num_of_instances": 71
|
407 |
+
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.36619718309859156,
|
410 |
+
"accuracy_ci_low": 0.2535211267605634,
|
411 |
+
"accuracy_ci_high": 0.4788732394366197,
|
412 |
+
"score_name": "accuracy",
|
413 |
+
"score": 0.36619718309859156,
|
414 |
+
"score_ci_high": 0.4788732394366197,
|
415 |
+
"score_ci_low": 0.2535211267605634,
|
416 |
+
"num_of_instances": 71
|
417 |
+
},
|
418 |
+
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.1267605633802817,
|
420 |
+
"accuracy_ci_low": 0.056338028169014086,
|
421 |
+
"accuracy_ci_high": 0.22535211267605634,
|
422 |
+
"score_name": "accuracy",
|
423 |
+
"score": 0.1267605633802817,
|
424 |
+
"score_ci_high": 0.22535211267605634,
|
425 |
+
"score_ci_low": 0.056338028169014086,
|
426 |
+
"num_of_instances": 71
|
427 |
+
},
|
428 |
+
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.22535211267605634,
|
430 |
+
"accuracy_ci_low": 0.14084507042253522,
|
431 |
+
"accuracy_ci_high": 0.323943661971831,
|
432 |
+
"score_name": "accuracy",
|
433 |
+
"score": 0.22535211267605634,
|
434 |
+
"score_ci_high": 0.323943661971831,
|
435 |
+
"score_ci_low": 0.14084507042253522,
|
436 |
+
"num_of_instances": 71
|
437 |
+
},
|
438 |
+
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.4084507042253521,
|
440 |
+
"accuracy_ci_low": 0.30985915492957744,
|
441 |
+
"accuracy_ci_high": 0.5352112676056338,
|
442 |
+
"score_name": "accuracy",
|
443 |
+
"score": 0.4084507042253521,
|
444 |
+
"score_ci_high": 0.5352112676056338,
|
445 |
+
"score_ci_low": 0.30985915492957744,
|
446 |
+
"num_of_instances": 71
|
447 |
+
},
|
448 |
+
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.29577464788732394,
|
450 |
+
"accuracy_ci_low": 0.19718309859154928,
|
451 |
+
"accuracy_ci_high": 0.4084507042253521,
|
452 |
+
"score_name": "accuracy",
|
453 |
+
"score": 0.29577464788732394,
|
454 |
+
"score_ci_high": 0.4084507042253521,
|
455 |
+
"score_ci_low": 0.19718309859154928,
|
456 |
+
"num_of_instances": 71
|
457 |
+
},
|
458 |
+
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.5352112676056338,
|
460 |
+
"accuracy_ci_low": 0.4084507042253521,
|
461 |
+
"accuracy_ci_high": 0.647887323943662,
|
462 |
+
"score_name": "accuracy",
|
463 |
+
"score": 0.5352112676056338,
|
464 |
+
"score_ci_high": 0.647887323943662,
|
465 |
+
"score_ci_low": 0.4084507042253521,
|
466 |
+
"num_of_instances": 71
|
467 |
+
},
|
468 |
+
"score": 0.33702213279678067,
|
469 |
+
"score_name": "subsets_mean",
|
470 |
+
"num_of_instances": 994
|
471 |
+
},
|
472 |
+
"legal": {
|
473 |
+
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.2696554985630616,
|
475 |
+
"f1_suggestive": 0.2727272727272727,
|
476 |
+
"f1_arbitrary": 0.43137254901960786,
|
477 |
+
"f1_generic": 0.11764705882352941,
|
478 |
+
"f1_fanciful": 0.2,
|
479 |
+
"f1_descriptive": 0.32653061224489793,
|
480 |
+
"f1_macro_ci_low": 0.18689773936584586,
|
481 |
+
"f1_macro_ci_high": 0.37923074712363225,
|
482 |
+
"score_name": "f1_micro",
|
483 |
+
"score": 0.31446540880503143,
|
484 |
+
"score_ci_high": 0.42038216560509556,
|
485 |
+
"score_ci_low": 0.21656050955414013,
|
486 |
+
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.29411764705882354,
|
488 |
+
"accuracy_ci_low": 0.2,
|
489 |
+
"accuracy_ci_high": 0.4,
|
490 |
+
"f1_micro": 0.31446540880503143,
|
491 |
+
"f1_micro_ci_low": 0.21656050955414013,
|
492 |
+
"f1_micro_ci_high": 0.42038216560509556
|
493 |
+
},
|
494 |
+
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.5388253241800153,
|
496 |
+
"f1_no": 0.7298245614035088,
|
497 |
+
"f1_yes": 0.34782608695652173,
|
498 |
+
"f1_macro_ci_low": 0.47191290375757455,
|
499 |
+
"f1_macro_ci_high": 0.6216206779092042,
|
500 |
+
"score_name": "f1_micro",
|
501 |
+
"score": 0.636604774535809,
|
502 |
+
"score_ci_high": 0.6985040092826637,
|
503 |
+
"score_ci_low": 0.5691144311757004,
|
504 |
+
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.6,
|
506 |
+
"accuracy_ci_low": 0.53,
|
507 |
+
"accuracy_ci_high": 0.665,
|
508 |
+
"f1_micro": 0.636604774535809,
|
509 |
+
"f1_micro_ci_low": 0.5691144311757004,
|
510 |
+
"f1_micro_ci_high": 0.6985040092826637
|
511 |
+
},
|
512 |
+
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.2947177227927682,
|
514 |
+
"f1_conclusion": 0.2127659574468085,
|
515 |
+
"f1_decree": 0.23529411764705882,
|
516 |
+
"f1_issue": 0.2711864406779661,
|
517 |
+
"f1_rule": 0.42857142857142855,
|
518 |
+
"f1_analysis": 0.4444444444444444,
|
519 |
+
"f1_facts": 0.21621621621621623,
|
520 |
+
"f1_procedural history": 0.2545454545454545,
|
521 |
+
"f1_macro_ci_low": 0.23794703715833648,
|
522 |
+
"f1_macro_ci_high": 0.36665623309642204,
|
523 |
+
"score_name": "f1_micro",
|
524 |
+
"score": 0.30409356725146197,
|
525 |
+
"score_ci_high": 0.3711587285161421,
|
526 |
+
"score_ci_low": 0.23855266549315363,
|
527 |
+
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.26,
|
529 |
+
"accuracy_ci_low": 0.2,
|
530 |
+
"accuracy_ci_high": 0.32,
|
531 |
+
"f1_micro": 0.30409356725146197,
|
532 |
+
"f1_micro_ci_low": 0.23855266549315363,
|
533 |
+
"f1_micro_ci_high": 0.3711587285161421
|
534 |
+
},
|
535 |
+
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.49092908191313905,
|
537 |
+
"f1_yes": 0.5700934579439252,
|
538 |
+
"f1_no": 0.4117647058823529,
|
539 |
+
"f1_macro_ci_low": 0.4178065856787266,
|
540 |
+
"f1_macro_ci_high": 0.5601203681213927,
|
541 |
+
"score_name": "f1_micro",
|
542 |
+
"score": 0.5,
|
543 |
+
"score_ci_high": 0.566970455032283,
|
544 |
+
"score_ci_low": 0.42555336134062,
|
545 |
+
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.48,
|
547 |
+
"accuracy_ci_low": 0.405,
|
548 |
+
"accuracy_ci_high": 0.545,
|
549 |
+
"f1_micro": 0.5,
|
550 |
+
"f1_micro_ci_low": 0.42555336134062,
|
551 |
+
"f1_micro_ci_high": 0.566970455032283
|
552 |
+
},
|
553 |
+
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.8315276273022751,
|
555 |
+
"f1_yes": 0.8169014084507042,
|
556 |
+
"f1_no": 0.8461538461538461,
|
557 |
+
"f1_macro_ci_low": 0.7549023325928579,
|
558 |
+
"f1_macro_ci_high": 0.890440353074843,
|
559 |
+
"score_name": "f1_micro",
|
560 |
+
"score": 0.8322147651006712,
|
561 |
+
"score_ci_high": 0.8903225806451613,
|
562 |
+
"score_ci_low": 0.7554946760306516,
|
563 |
+
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.7294117647058823,
|
565 |
+
"accuracy_ci_low": 0.6352941176470588,
|
566 |
+
"accuracy_ci_high": 0.8117647058823529,
|
567 |
+
"f1_micro": 0.8322147651006712,
|
568 |
+
"f1_micro_ci_low": 0.7554946760306516,
|
569 |
+
"f1_micro_ci_high": 0.8903225806451613
|
570 |
+
},
|
571 |
+
"score": 0.5174757031385947,
|
572 |
+
"score_name": "subsets_mean",
|
573 |
+
"num_of_instances": 770
|
574 |
+
},
|
575 |
+
"news_classification": {
|
576 |
+
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.42272407811143237,
|
578 |
+
"f1_cars": 0.6078431372549019,
|
579 |
+
"f1_pc hardware": 0.34080717488789236,
|
580 |
+
"f1_windows x": 0.029850746268656716,
|
581 |
+
"f1_computer graphics": 0.4367816091954023,
|
582 |
+
"f1_atheism": 0.21739130434782608,
|
583 |
+
"f1_religion": 0.23300970873786409,
|
584 |
+
"f1_medicine": 0.8641975308641975,
|
585 |
+
"f1_christianity": 0.1694915254237288,
|
586 |
+
"f1_microsoft windows": 0.39436619718309857,
|
587 |
+
"f1_middle east": 0.43037974683544306,
|
588 |
+
"f1_politics": 0.291970802919708,
|
589 |
+
"f1_motorcycles": 0.43902439024390244,
|
590 |
+
"f1_mac hardware": 0.09090909090909091,
|
591 |
+
"f1_for sale": 0.625,
|
592 |
+
"f1_guns": 0.18181818181818182,
|
593 |
+
"f1_space": 0.5569620253164557,
|
594 |
+
"f1_cryptography": 0.4482758620689655,
|
595 |
+
"f1_baseball": 0.8545454545454545,
|
596 |
+
"f1_hockey": 0.859504132231405,
|
597 |
+
"f1_electronics": 0.38235294117647056,
|
598 |
+
"f1_macro_ci_low": 0.3988534736802405,
|
599 |
+
"f1_macro_ci_high": 0.4557473948035634,
|
600 |
+
"score_name": "f1_micro",
|
601 |
+
"score": 0.44368600682593856,
|
602 |
+
"score_ci_high": 0.47444463958776134,
|
603 |
+
"score_ci_low": 0.4135801299006492,
|
604 |
+
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.39,
|
606 |
+
"accuracy_ci_low": 0.36,
|
607 |
+
"accuracy_ci_high": 0.418,
|
608 |
+
"f1_micro": 0.44368600682593856,
|
609 |
+
"f1_micro_ci_low": 0.4135801299006492,
|
610 |
+
"f1_micro_ci_high": 0.47444463958776134
|
611 |
+
},
|
612 |
+
"score": 0.44368600682593856,
|
613 |
+
"score_name": "subsets_mean",
|
614 |
+
"num_of_instances": 1000
|
615 |
+
},
|
616 |
+
"product_help": {
|
617 |
+
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.6105828707367139,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9035153328347045,
|
620 |
+
"f1_credit card or prepaid card": 0.5873015873015873,
|
621 |
+
"f1_debt collection": 0.6375,
|
622 |
+
"f1_checking or savings account": 0.75,
|
623 |
+
"f1_money transfer or virtual currency or money service": 0.5777777777777777,
|
624 |
+
"f1_vehicle loan or lease": 0.4666666666666667,
|
625 |
+
"f1_mortgage": 0.6785714285714286,
|
626 |
+
"f1_payday loan or title loan or personal loan": 0.17391304347826086,
|
627 |
+
"f1_student loan": 0.72,
|
628 |
+
"f1_macro_ci_low": 0.5575796516691159,
|
629 |
+
"f1_macro_ci_high": 0.6705972502098242,
|
630 |
+
"score_name": "f1_micro",
|
631 |
+
"score": 0.8195173137460651,
|
632 |
+
"score_ci_high": 0.842436974789916,
|
633 |
+
"score_ci_low": 0.7946166113913405,
|
634 |
+
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.781,
|
636 |
+
"accuracy_ci_low": 0.752,
|
637 |
+
"accuracy_ci_high": 0.806,
|
638 |
+
"f1_micro": 0.8195173137460651,
|
639 |
+
"f1_micro_ci_low": 0.7946166113913405,
|
640 |
+
"f1_micro_ci_high": 0.842436974789916
|
641 |
+
},
|
642 |
+
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.7132677588870594,
|
644 |
+
"f1_mortgages and loans": 0.7771428571428571,
|
645 |
+
"f1_credit card": 0.7023809523809523,
|
646 |
+
"f1_debt collection": 0.6854460093896714,
|
647 |
+
"f1_credit reporting": 0.7601476014760148,
|
648 |
+
"f1_retail banking": 0.6412213740458015,
|
649 |
+
"f1_macro_ci_low": 0.672279823384184,
|
650 |
+
"f1_macro_ci_high": 0.7539657340394554,
|
651 |
+
"score_name": "f1_micro",
|
652 |
+
"score": 0.7202505219206681,
|
653 |
+
"score_ci_high": 0.7576596149340853,
|
654 |
+
"score_ci_low": 0.6805865270375967,
|
655 |
+
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.69,
|
657 |
+
"accuracy_ci_low": 0.65,
|
658 |
+
"accuracy_ci_high": 0.73,
|
659 |
+
"f1_micro": 0.7202505219206681,
|
660 |
+
"f1_micro_ci_low": 0.6805865270375967,
|
661 |
+
"f1_micro_ci_high": 0.7576596149340853
|
662 |
+
},
|
663 |
+
"score": 0.7698839178333665,
|
664 |
+
"score_name": "subsets_mean",
|
665 |
+
"num_of_instances": 1500
|
666 |
+
},
|
667 |
+
"qa_finance": {
|
668 |
+
"fin_qa": {
|
669 |
+
"num_of_instances": 1000,
|
670 |
+
"execution_accuracy": 0.074,
|
671 |
+
"program_accuracy": 0.085,
|
672 |
+
"score": 0.085,
|
673 |
+
"score_name": "program_accuracy",
|
674 |
+
"execution_accuracy_ci_low": 0.058,
|
675 |
+
"execution_accuracy_ci_high": 0.091,
|
676 |
+
"program_accuracy_ci_low": 0.068,
|
677 |
+
"program_accuracy_ci_high": 0.102,
|
678 |
+
"score_ci_low": 0.068,
|
679 |
+
"score_ci_high": 0.102
|
680 |
+
},
|
681 |
+
"score": 0.085,
|
682 |
+
"score_name": "subsets_mean",
|
683 |
+
"num_of_instances": 1000
|
684 |
+
},
|
685 |
+
"rag_general": {
|
686 |
+
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.30022844870852566,
|
688 |
+
"recall": 0.5840193774846996,
|
689 |
+
"f1": 0.3357215148632638,
|
690 |
+
"precision_ci_low": 0.28030967471726836,
|
691 |
+
"precision_ci_high": 0.32121747414474766,
|
692 |
+
"recall_ci_low": 0.565861900260428,
|
693 |
+
"recall_ci_high": 0.59971992711831,
|
694 |
+
"f1_ci_low": 0.3175124739653954,
|
695 |
+
"f1_ci_high": 0.35218969004250933,
|
696 |
+
"score_name": "f1",
|
697 |
+
"score": 0.3357215148632638,
|
698 |
+
"score_ci_high": 0.35218969004250933,
|
699 |
+
"score_ci_low": 0.3175124739653954,
|
700 |
+
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6000729690988859,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6848867723842462,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5528717579195896,
|
704 |
+
"faithfullness_f1_token_overlap": 0.33597285355913525,
|
705 |
+
"faithfullness_recall_token_overlap": 0.27114762054953845,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5569731492695155,
|
707 |
+
"correctness_f1_token_overlap": 0.3357215148632638,
|
708 |
+
"correctness_recall_token_overlap": 0.5840193774846996,
|
709 |
+
"correctness_precision_token_overlap": 0.30022844870852566
|
710 |
+
},
|
711 |
+
"score": 0.3357215148632638,
|
712 |
+
"score_name": "subsets_mean",
|
713 |
+
"num_of_instances": 600
|
714 |
+
},
|
715 |
+
"reasoning": {
|
716 |
+
"hellaswag": {
|
717 |
+
"accuracy": 0.455,
|
718 |
+
"accuracy_ci_low": 0.423,
|
719 |
+
"accuracy_ci_high": 0.485,
|
720 |
+
"score_name": "accuracy",
|
721 |
+
"score": 0.455,
|
722 |
+
"score_ci_high": 0.485,
|
723 |
+
"score_ci_low": 0.423,
|
724 |
+
"num_of_instances": 1000
|
725 |
+
},
|
726 |
+
"openbook_qa": {
|
727 |
+
"accuracy": 0.782,
|
728 |
+
"accuracy_ci_low": 0.744,
|
729 |
+
"accuracy_ci_high": 0.818,
|
730 |
+
"score_name": "accuracy",
|
731 |
+
"score": 0.782,
|
732 |
+
"score_ci_high": 0.818,
|
733 |
+
"score_ci_low": 0.744,
|
734 |
+
"num_of_instances": 500
|
735 |
+
},
|
736 |
+
"score": 0.6185,
|
737 |
+
"score_name": "subsets_mean",
|
738 |
+
"num_of_instances": 1500
|
739 |
+
},
|
740 |
+
"safety": {
|
741 |
+
"attaq_500": {
|
742 |
+
"severity_critical": 0.0,
|
743 |
+
"severity_high": 0.0,
|
744 |
+
"severity_medium": 0.0,
|
745 |
+
"severity_low": 100.0,
|
746 |
+
"category_harmful_info": 0.8662501443103547,
|
747 |
+
"category_deception": 0.8769584958149929,
|
748 |
+
"category_explicit_content": 0.8376609406085929,
|
749 |
+
"category_violence": 0.8456707175466277,
|
750 |
+
"category_discrimination": 0.8991804392564864,
|
751 |
+
"category_substance_abuse": 0.8935730718904071,
|
752 |
+
"category_pii": 0.8467056638112774,
|
753 |
+
"safety": 0.860165383811626,
|
754 |
+
"safety_ci_low": 0.8426579493684276,
|
755 |
+
"safety_ci_high": 0.8778207056405374,
|
756 |
+
"score_name": "safety",
|
757 |
+
"score": 0.860165383811626,
|
758 |
+
"score_ci_high": 0.8778207056405374,
|
759 |
+
"score_ci_low": 0.8426579493684276,
|
760 |
+
"num_of_instances": 100
|
761 |
+
},
|
762 |
+
"score": 0.860165383811626,
|
763 |
+
"score_name": "subsets_mean",
|
764 |
+
"num_of_instances": 100
|
765 |
+
},
|
766 |
+
"summarization": {
|
767 |
+
"billsum_document_filtered_to_6000_chars": {
|
768 |
+
"num_of_instances": 528,
|
769 |
+
"rougeLsum": 0.3503820014906059,
|
770 |
+
"rouge1": 0.4200585739584912,
|
771 |
+
"rouge2": 0.20109007723824623,
|
772 |
+
"rougeL": 0.28749112328021914,
|
773 |
+
"score": 0.28749112328021914,
|
774 |
+
"score_name": "rougeL",
|
775 |
+
"rougeLsum_ci_low": 0.341921573094731,
|
776 |
+
"rougeLsum_ci_high": 0.35863585426859207,
|
777 |
+
"rouge1_ci_low": 0.41035793857223635,
|
778 |
+
"rouge1_ci_high": 0.4281932704537228,
|
779 |
+
"rouge2_ci_low": 0.19416899053732958,
|
780 |
+
"rouge2_ci_high": 0.20872476773642967,
|
781 |
+
"rougeL_ci_low": 0.2804794753326623,
|
782 |
+
"rougeL_ci_high": 0.29447838537921134,
|
783 |
+
"score_ci_low": 0.2804794753326623,
|
784 |
+
"score_ci_high": 0.29447838537921134
|
785 |
+
},
|
786 |
+
"tldr_document_filtered_to_6000_chars": {
|
787 |
+
"num_of_instances": 1000,
|
788 |
+
"rougeLsum": 0.0922932399263996,
|
789 |
+
"rouge1": 0.11247814548815566,
|
790 |
+
"rouge2": 0.015117853576507847,
|
791 |
+
"rougeL": 0.07979202357473647,
|
792 |
+
"score": 0.07979202357473647,
|
793 |
+
"score_name": "rougeL",
|
794 |
+
"rougeLsum_ci_low": 0.0880597944044916,
|
795 |
+
"rougeLsum_ci_high": 0.09606464509440052,
|
796 |
+
"rouge1_ci_low": 0.10733708561154955,
|
797 |
+
"rouge1_ci_high": 0.11723898467910755,
|
798 |
+
"rouge2_ci_low": 0.01362250797390663,
|
799 |
+
"rouge2_ci_high": 0.0168799885499115,
|
800 |
+
"rougeL_ci_low": 0.0764789144644062,
|
801 |
+
"rougeL_ci_high": 0.08304032568245756,
|
802 |
+
"score_ci_low": 0.0764789144644062,
|
803 |
+
"score_ci_high": 0.08304032568245756
|
804 |
+
},
|
805 |
+
"score": 0.1836415734274778,
|
806 |
+
"score_name": "subsets_mean",
|
807 |
+
"num_of_instances": 1528
|
808 |
+
},
|
809 |
+
"translation": {
|
810 |
+
"mt_flores_101_ara_eng": {
|
811 |
+
"num_of_instances": 66,
|
812 |
+
"counts": [
|
813 |
+
1154,
|
814 |
+
637,
|
815 |
+
382,
|
816 |
+
237
|
817 |
+
],
|
818 |
+
"totals": [
|
819 |
+
3013,
|
820 |
+
2947,
|
821 |
+
2881,
|
822 |
+
2815
|
823 |
+
],
|
824 |
+
"precisions": [
|
825 |
+
0.383006969797544,
|
826 |
+
0.2161520190023753,
|
827 |
+
0.13259284970496357,
|
828 |
+
0.08419182948490231
|
829 |
+
],
|
830 |
+
"bp": 1.0,
|
831 |
+
"sys_len": 3013,
|
832 |
+
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.17435684678472682,
|
834 |
+
"score": 0.17435684678472682,
|
835 |
+
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.12709535962365245,
|
837 |
+
"score_ci_high": 0.21064271607309265,
|
838 |
+
"sacrebleu_ci_low": 0.12709535962365245,
|
839 |
+
"sacrebleu_ci_high": 0.21064271607309265
|
840 |
+
},
|
841 |
+
"mt_flores_101_deu_eng": {
|
842 |
+
"num_of_instances": 66,
|
843 |
+
"counts": [
|
844 |
+
1215,
|
845 |
+
695,
|
846 |
+
422,
|
847 |
+
256
|
848 |
+
],
|
849 |
+
"totals": [
|
850 |
+
3433,
|
851 |
+
3367,
|
852 |
+
3301,
|
853 |
+
3235
|
854 |
+
],
|
855 |
+
"precisions": [
|
856 |
+
0.35391785610253423,
|
857 |
+
0.20641520641520641,
|
858 |
+
0.12784004847016056,
|
859 |
+
0.07913446676970634
|
860 |
+
],
|
861 |
+
"bp": 1.0,
|
862 |
+
"sys_len": 3433,
|
863 |
+
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.16488046075977367,
|
865 |
+
"score": 0.16488046075977367,
|
866 |
+
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.12825986690370522,
|
868 |
+
"score_ci_high": 0.20812836267228596,
|
869 |
+
"sacrebleu_ci_low": 0.12825986690370522,
|
870 |
+
"sacrebleu_ci_high": 0.20812836267228596
|
871 |
+
},
|
872 |
+
"mt_flores_101_eng_ara": {
|
873 |
+
"num_of_instances": 66,
|
874 |
+
"counts": [
|
875 |
+
726,
|
876 |
+
321,
|
877 |
+
159,
|
878 |
+
82
|
879 |
+
],
|
880 |
+
"totals": [
|
881 |
+
2297,
|
882 |
+
2231,
|
883 |
+
2165,
|
884 |
+
2099
|
885 |
+
],
|
886 |
+
"precisions": [
|
887 |
+
0.3160644318676535,
|
888 |
+
0.14388166741371583,
|
889 |
+
0.07344110854503465,
|
890 |
+
0.03906622201048118
|
891 |
+
],
|
892 |
+
"bp": 1.0,
|
893 |
+
"sys_len": 2297,
|
894 |
+
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.10687605905530678,
|
896 |
+
"score": 0.10687605905530678,
|
897 |
+
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.08639846348006232,
|
899 |
+
"score_ci_high": 0.13425269082562755,
|
900 |
+
"sacrebleu_ci_low": 0.08639846348006232,
|
901 |
+
"sacrebleu_ci_high": 0.13425269082562755
|
902 |
+
},
|
903 |
+
"mt_flores_101_eng_deu": {
|
904 |
+
"num_of_instances": 66,
|
905 |
+
"counts": [
|
906 |
+
1066,
|
907 |
+
564,
|
908 |
+
332,
|
909 |
+
194
|
910 |
+
],
|
911 |
+
"totals": [
|
912 |
+
2300,
|
913 |
+
2234,
|
914 |
+
2168,
|
915 |
+
2102
|
916 |
+
],
|
917 |
+
"precisions": [
|
918 |
+
0.46347826086956523,
|
919 |
+
0.252461951656222,
|
920 |
+
0.15313653136531366,
|
921 |
+
0.0922930542340628
|
922 |
+
],
|
923 |
+
"bp": 1.0,
|
924 |
+
"sys_len": 2300,
|
925 |
+
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.2016593123773307,
|
927 |
+
"score": 0.2016593123773307,
|
928 |
+
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.177292145733578,
|
930 |
+
"score_ci_high": 0.24439707428713803,
|
931 |
+
"sacrebleu_ci_low": 0.177292145733578,
|
932 |
+
"sacrebleu_ci_high": 0.24439707428713803
|
933 |
+
},
|
934 |
+
"mt_flores_101_eng_fra": {
|
935 |
+
"num_of_instances": 66,
|
936 |
+
"counts": [
|
937 |
+
1409,
|
938 |
+
950,
|
939 |
+
692,
|
940 |
+
517
|
941 |
+
],
|
942 |
+
"totals": [
|
943 |
+
3275,
|
944 |
+
3209,
|
945 |
+
3143,
|
946 |
+
3077
|
947 |
+
],
|
948 |
+
"precisions": [
|
949 |
+
0.4302290076335878,
|
950 |
+
0.2960423808039888,
|
951 |
+
0.2201718103722558,
|
952 |
+
0.168020799480013
|
953 |
+
],
|
954 |
+
"bp": 1.0,
|
955 |
+
"sys_len": 3275,
|
956 |
+
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.2619959538476516,
|
958 |
+
"score": 0.2619959538476516,
|
959 |
+
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.21071110880640612,
|
961 |
+
"score_ci_high": 0.30599931494111227,
|
962 |
+
"sacrebleu_ci_low": 0.21071110880640612,
|
963 |
+
"sacrebleu_ci_high": 0.30599931494111227
|
964 |
+
},
|
965 |
+
"mt_flores_101_eng_kor": {
|
966 |
+
"num_of_instances": 66,
|
967 |
+
"counts": [
|
968 |
+
1096,
|
969 |
+
465,
|
970 |
+
233,
|
971 |
+
132
|
972 |
+
],
|
973 |
+
"totals": [
|
974 |
+
3883,
|
975 |
+
3817,
|
976 |
+
3751,
|
977 |
+
3685
|
978 |
+
],
|
979 |
+
"precisions": [
|
980 |
+
0.28225598763842386,
|
981 |
+
0.12182342153523709,
|
982 |
+
0.0621167688616369,
|
983 |
+
0.03582089552238806
|
984 |
+
],
|
985 |
+
"bp": 1.0,
|
986 |
+
"sys_len": 3883,
|
987 |
+
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.09352545142421302,
|
989 |
+
"score": 0.09352545142421302,
|
990 |
+
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.0763987126727994,
|
992 |
+
"score_ci_high": 0.11617390981932266,
|
993 |
+
"sacrebleu_ci_low": 0.0763987126727994,
|
994 |
+
"sacrebleu_ci_high": 0.11617390981932266
|
995 |
+
},
|
996 |
+
"mt_flores_101_eng_por": {
|
997 |
+
"num_of_instances": 66,
|
998 |
+
"counts": [
|
999 |
+
1328,
|
1000 |
+
850,
|
1001 |
+
588,
|
1002 |
+
412
|
1003 |
+
],
|
1004 |
+
"totals": [
|
1005 |
+
3030,
|
1006 |
+
2964,
|
1007 |
+
2898,
|
1008 |
+
2832
|
1009 |
+
],
|
1010 |
+
"precisions": [
|
1011 |
+
0.4382838283828383,
|
1012 |
+
0.286774628879892,
|
1013 |
+
0.2028985507246377,
|
1014 |
+
0.14548022598870058
|
1015 |
+
],
|
1016 |
+
"bp": 1.0,
|
1017 |
+
"sys_len": 3030,
|
1018 |
+
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.2467997817029595,
|
1020 |
+
"score": 0.2467997817029595,
|
1021 |
+
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.193392163449652,
|
1023 |
+
"score_ci_high": 0.2974642241791255,
|
1024 |
+
"sacrebleu_ci_low": 0.193392163449652,
|
1025 |
+
"sacrebleu_ci_high": 0.2974642241791255
|
1026 |
+
},
|
1027 |
+
"mt_flores_101_eng_ron": {
|
1028 |
+
"num_of_instances": 66,
|
1029 |
+
"counts": [
|
1030 |
+
930,
|
1031 |
+
400,
|
1032 |
+
214,
|
1033 |
+
123
|
1034 |
+
],
|
1035 |
+
"totals": [
|
1036 |
+
2961,
|
1037 |
+
2895,
|
1038 |
+
2829,
|
1039 |
+
2763
|
1040 |
+
],
|
1041 |
+
"precisions": [
|
1042 |
+
0.3140830800405269,
|
1043 |
+
0.1381692573402418,
|
1044 |
+
0.07564510427712973,
|
1045 |
+
0.04451682953311618
|
1046 |
+
],
|
1047 |
+
"bp": 1.0,
|
1048 |
+
"sys_len": 2961,
|
1049 |
+
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.1099487393546487,
|
1051 |
+
"score": 0.1099487393546487,
|
1052 |
+
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.08284384518142485,
|
1054 |
+
"score_ci_high": 0.13880651312628609,
|
1055 |
+
"sacrebleu_ci_low": 0.08284384518142485,
|
1056 |
+
"sacrebleu_ci_high": 0.13880651312628609
|
1057 |
+
},
|
1058 |
+
"mt_flores_101_eng_spa": {
|
1059 |
+
"num_of_instances": 66,
|
1060 |
+
"counts": [
|
1061 |
+
1217,
|
1062 |
+
624,
|
1063 |
+
347,
|
1064 |
+
198
|
1065 |
+
],
|
1066 |
+
"totals": [
|
1067 |
+
3045,
|
1068 |
+
2979,
|
1069 |
+
2913,
|
1070 |
+
2847
|
1071 |
+
],
|
1072 |
+
"precisions": [
|
1073 |
+
0.399671592775041,
|
1074 |
+
0.20946626384692849,
|
1075 |
+
0.11912118091314795,
|
1076 |
+
0.06954689146469968
|
1077 |
+
],
|
1078 |
+
"bp": 1.0,
|
1079 |
+
"sys_len": 3045,
|
1080 |
+
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.1622822499255264,
|
1082 |
+
"score": 0.1622822499255264,
|
1083 |
+
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.13321857221475644,
|
1085 |
+
"score_ci_high": 0.19390301665624113,
|
1086 |
+
"sacrebleu_ci_low": 0.13321857221475644,
|
1087 |
+
"sacrebleu_ci_high": 0.19390301665624113
|
1088 |
+
},
|
1089 |
+
"mt_flores_101_fra_eng": {
|
1090 |
+
"num_of_instances": 66,
|
1091 |
+
"counts": [
|
1092 |
+
1236,
|
1093 |
+
735,
|
1094 |
+
470,
|
1095 |
+
308
|
1096 |
+
],
|
1097 |
+
"totals": [
|
1098 |
+
2952,
|
1099 |
+
2886,
|
1100 |
+
2820,
|
1101 |
+
2754
|
1102 |
+
],
|
1103 |
+
"precisions": [
|
1104 |
+
0.4186991869918699,
|
1105 |
+
0.25467775467775466,
|
1106 |
+
0.16666666666666669,
|
1107 |
+
0.11183732752360204
|
1108 |
+
],
|
1109 |
+
"bp": 1.0,
|
1110 |
+
"sys_len": 2952,
|
1111 |
+
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.2111456628673961,
|
1113 |
+
"score": 0.2111456628673961,
|
1114 |
+
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.1728340034401921,
|
1116 |
+
"score_ci_high": 0.26908287892628974,
|
1117 |
+
"sacrebleu_ci_low": 0.1728340034401921,
|
1118 |
+
"sacrebleu_ci_high": 0.26908287892628974
|
1119 |
+
},
|
1120 |
+
"mt_flores_101_jpn_eng": {
|
1121 |
+
"num_of_instances": 66,
|
1122 |
+
"counts": [
|
1123 |
+
1018,
|
1124 |
+
437,
|
1125 |
+
232,
|
1126 |
+
128
|
1127 |
+
],
|
1128 |
+
"totals": [
|
1129 |
+
3130,
|
1130 |
+
3064,
|
1131 |
+
2998,
|
1132 |
+
2932
|
1133 |
+
],
|
1134 |
+
"precisions": [
|
1135 |
+
0.3252396166134185,
|
1136 |
+
0.14262402088772846,
|
1137 |
+
0.07738492328218813,
|
1138 |
+
0.04365620736698499
|
1139 |
+
],
|
1140 |
+
"bp": 1.0,
|
1141 |
+
"sys_len": 3130,
|
1142 |
+
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.11188570922324435,
|
1144 |
+
"score": 0.11188570922324435,
|
1145 |
+
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.09154049326122426,
|
1147 |
+
"score_ci_high": 0.13827539969992217,
|
1148 |
+
"sacrebleu_ci_low": 0.09154049326122426,
|
1149 |
+
"sacrebleu_ci_high": 0.13827539969992217
|
1150 |
+
},
|
1151 |
+
"mt_flores_101_kor_eng": {
|
1152 |
+
"num_of_instances": 66,
|
1153 |
+
"counts": [
|
1154 |
+
986,
|
1155 |
+
447,
|
1156 |
+
233,
|
1157 |
+
127
|
1158 |
+
],
|
1159 |
+
"totals": [
|
1160 |
+
3637,
|
1161 |
+
3571,
|
1162 |
+
3505,
|
1163 |
+
3439
|
1164 |
+
],
|
1165 |
+
"precisions": [
|
1166 |
+
0.27110255705251585,
|
1167 |
+
0.12517502100252031,
|
1168 |
+
0.06647646219686162,
|
1169 |
+
0.03692933992439663
|
1170 |
+
],
|
1171 |
+
"bp": 1.0,
|
1172 |
+
"sys_len": 3637,
|
1173 |
+
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.09553723823741646,
|
1175 |
+
"score": 0.09553723823741646,
|
1176 |
+
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.06933902828362079,
|
1178 |
+
"score_ci_high": 0.1273472328564688,
|
1179 |
+
"sacrebleu_ci_low": 0.06933902828362079,
|
1180 |
+
"sacrebleu_ci_high": 0.1273472328564688
|
1181 |
+
},
|
1182 |
+
"mt_flores_101_por_eng": {
|
1183 |
+
"num_of_instances": 66,
|
1184 |
+
"counts": [
|
1185 |
+
1286,
|
1186 |
+
834,
|
1187 |
+
587,
|
1188 |
+
419
|
1189 |
+
],
|
1190 |
+
"totals": [
|
1191 |
+
3404,
|
1192 |
+
3338,
|
1193 |
+
3272,
|
1194 |
+
3206
|
1195 |
+
],
|
1196 |
+
"precisions": [
|
1197 |
+
0.37779083431257343,
|
1198 |
+
0.24985020970641103,
|
1199 |
+
0.17940097799511,
|
1200 |
+
0.13069245165315035
|
1201 |
+
],
|
1202 |
+
"bp": 1.0,
|
1203 |
+
"sys_len": 3404,
|
1204 |
+
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.21689603438287544,
|
1206 |
+
"score": 0.21689603438287544,
|
1207 |
+
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.18174547190909165,
|
1209 |
+
"score_ci_high": 0.2734022486576191,
|
1210 |
+
"sacrebleu_ci_low": 0.18174547190909165,
|
1211 |
+
"sacrebleu_ci_high": 0.2734022486576191
|
1212 |
+
},
|
1213 |
+
"mt_flores_101_ron_eng": {
|
1214 |
+
"num_of_instances": 66,
|
1215 |
+
"counts": [
|
1216 |
+
1208,
|
1217 |
+
675,
|
1218 |
+
430,
|
1219 |
+
279
|
1220 |
+
],
|
1221 |
+
"totals": [
|
1222 |
+
3677,
|
1223 |
+
3611,
|
1224 |
+
3545,
|
1225 |
+
3479
|
1226 |
+
],
|
1227 |
+
"precisions": [
|
1228 |
+
0.32852869186837097,
|
1229 |
+
0.1869288285793409,
|
1230 |
+
0.12129760225669958,
|
1231 |
+
0.08019545846507617
|
1232 |
+
],
|
1233 |
+
"bp": 1.0,
|
1234 |
+
"sys_len": 3677,
|
1235 |
+
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.15633740352446387,
|
1237 |
+
"score": 0.15633740352446387,
|
1238 |
+
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.12255450743419968,
|
1240 |
+
"score_ci_high": 0.17971859902386644,
|
1241 |
+
"sacrebleu_ci_low": 0.12255450743419968,
|
1242 |
+
"sacrebleu_ci_high": 0.17971859902386644
|
1243 |
+
},
|
1244 |
+
"mt_flores_101_spa_eng": {
|
1245 |
+
"num_of_instances": 66,
|
1246 |
+
"counts": [
|
1247 |
+
1135,
|
1248 |
+
581,
|
1249 |
+
336,
|
1250 |
+
202
|
1251 |
+
],
|
1252 |
+
"totals": [
|
1253 |
+
3533,
|
1254 |
+
3467,
|
1255 |
+
3401,
|
1256 |
+
3335
|
1257 |
+
],
|
1258 |
+
"precisions": [
|
1259 |
+
0.3212567223322955,
|
1260 |
+
0.16758004038073263,
|
1261 |
+
0.09879447221405468,
|
1262 |
+
0.06056971514242879
|
1263 |
+
],
|
1264 |
+
"bp": 1.0,
|
1265 |
+
"sys_len": 3533,
|
1266 |
+
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.133972503470666,
|
1268 |
+
"score": 0.133972503470666,
|
1269 |
+
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.10251876459928583,
|
1271 |
+
"score_ci_high": 0.17481307519673603,
|
1272 |
+
"sacrebleu_ci_low": 0.10251876459928583,
|
1273 |
+
"sacrebleu_ci_high": 0.17481307519673603
|
1274 |
+
},
|
1275 |
+
"score": 0.1632066271292133,
|
1276 |
+
"score_name": "subsets_mean",
|
1277 |
+
"num_of_instances": 990
|
1278 |
+
},
|
1279 |
+
"score": 0.45259314123432515,
|
1280 |
+
"score_name": "subsets_mean",
|
1281 |
+
"num_of_instances": 12472
|
1282 |
+
}
|
1283 |
+
}
|
results/bluebench/2025-06-19T17-18-35_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
1 |
+
{
|
2 |
+
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-19T21:18:30.246956Z",
|
4 |
+
"command_line_invocation": [
|
5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
+
"--tasks",
|
7 |
+
"benchmarks.bluebench",
|
8 |
+
"--model",
|
9 |
+
"cross_provider",
|
10 |
+
"--model_args",
|
11 |
+
"model_name=watsonx/meta-llama/llama-3-2-1b-instruct,max_tokens=256",
|
12 |
+
"--output_path",
|
13 |
+
"./results/bluebench",
|
14 |
+
"--log_samples",
|
15 |
+
"--trust_remote_code",
|
16 |
+
"--batch_size",
|
17 |
+
"8",
|
18 |
+
"--verbosity",
|
19 |
+
"ERROR"
|
20 |
+
],
|
21 |
+
"parsed_arguments": {
|
22 |
+
"tasks": [
|
23 |
+
"benchmarks.bluebench"
|
24 |
+
],
|
25 |
+
"split": "test",
|
26 |
+
"num_fewshots": null,
|
27 |
+
"limit": null,
|
28 |
+
"batch_size": 8,
|
29 |
+
"model": "watsonx/meta-llama/llama-3-2-1b-instruct",
|
30 |
+
"model_args": {
|
31 |
+
"max_tokens": 256
|
32 |
+
},
|
33 |
+
"gen_kwargs": null,
|
34 |
+
"chat_template_kwargs": null,
|
35 |
+
"output_path": "./results/bluebench",
|
36 |
+
"output_file_prefix": "evaluation_results",
|
37 |
+
"log_samples": true,
|
38 |
+
"verbosity": "ERROR",
|
39 |
+
"apply_chat_template": false,
|
40 |
+
"trust_remote_code": true,
|
41 |
+
"disable_hf_cache": false,
|
42 |
+
"cache_dir": null
|
43 |
+
},
|
44 |
+
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
|
46 |
+
"python_version": "3.10.18",
|
47 |
+
"system": "Linux",
|
48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
49 |
+
"installed_packages": {
|
50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
51 |
+
"triton": "3.3.1",
|
52 |
+
"nltk": "3.9.1",
|
53 |
+
"anyio": "4.9.0",
|
54 |
+
"absl-py": "2.3.0",
|
55 |
+
"tiktoken": "0.9.0",
|
56 |
+
"charset-normalizer": "3.4.2",
|
57 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
58 |
+
"sympy": "1.14.0",
|
59 |
+
"mecab-ko": "1.0.1",
|
60 |
+
"litellm": "1.72.6.post1",
|
61 |
+
"httpcore": "1.0.9",
|
62 |
+
"Jinja2": "3.1.6",
|
63 |
+
"jsonschema-specifications": "2025.4.1",
|
64 |
+
"pydantic_core": "2.33.2",
|
65 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
66 |
+
"yarl": "1.20.1",
|
67 |
+
"openai": "1.88.0",
|
68 |
+
"portalocker": "3.2.0",
|
69 |
+
"pandas": "2.3.0",
|
70 |
+
"multiprocess": "0.70.16",
|
71 |
+
"jsonschema": "4.24.0",
|
72 |
+
"unitxt": "1.24.0",
|
73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
75 |
+
"pydantic": "2.11.7",
|
76 |
+
"async-timeout": "5.0.1",
|
77 |
+
"annotated-types": "0.7.0",
|
78 |
+
"rouge_score": "0.1.2",
|
79 |
+
"contourpy": "1.3.2",
|
80 |
+
"aiosignal": "1.3.2",
|
81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
82 |
+
"pillow": "11.2.1",
|
83 |
+
"six": "1.17.0",
|
84 |
+
"diskcache": "5.6.3",
|
85 |
+
"tqdm": "4.67.1",
|
86 |
+
"pyarrow": "20.0.0",
|
87 |
+
"h11": "0.16.0",
|
88 |
+
"zipp": "3.19.2",
|
89 |
+
"tzdata": "2025.2",
|
90 |
+
"bert-score": "0.3.13",
|
91 |
+
"setuptools": "80.9.0",
|
92 |
+
"referencing": "0.36.2",
|
93 |
+
"sacrebleu": "2.5.1",
|
94 |
+
"filelock": "3.18.0",
|
95 |
+
"urllib3": "2.5.0",
|
96 |
+
"scipy": "1.15.3",
|
97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
98 |
+
"kiwisolver": "1.4.8",
|
99 |
+
"networkx": "3.4.2",
|
100 |
+
"typing-inspection": "0.4.1",
|
101 |
+
"lxml": "5.4.0",
|
102 |
+
"sniffio": "1.3.1",
|
103 |
+
"scikit-learn": "1.7.0",
|
104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
105 |
+
"pip": "25.1.1",
|
106 |
+
"fonttools": "4.58.4",
|
107 |
+
"transformers": "4.52.4",
|
108 |
+
"datasets": "3.6.0",
|
109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
110 |
+
"cycler": "0.12.1",
|
111 |
+
"evaluate": "0.4.3",
|
112 |
+
"distro": "1.9.0",
|
113 |
+
"idna": "3.10",
|
114 |
+
"MarkupSafe": "3.0.2",
|
115 |
+
"frozenlist": "1.7.0",
|
116 |
+
"pyparsing": "3.2.3",
|
117 |
+
"jiter": "0.10.0",
|
118 |
+
"importlib_metadata": "8.0.0",
|
119 |
+
"packaging": "24.2",
|
120 |
+
"psutil": "7.0.0",
|
121 |
+
"mecab-ko-dic": "1.0.0",
|
122 |
+
"joblib": "1.5.1",
|
123 |
+
"fsspec": "2025.3.0",
|
124 |
+
"dill": "0.3.8",
|
125 |
+
"tokenizers": "0.21.1",
|
126 |
+
"wheel": "0.45.1",
|
127 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
128 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
129 |
+
"hf-xet": "1.1.4",
|
130 |
+
"propcache": "0.3.2",
|
131 |
+
"numpy": "2.2.6",
|
132 |
+
"mpmath": "1.3.0",
|
133 |
+
"multidict": "6.5.0",
|
134 |
+
"conllu": "6.0.0",
|
135 |
+
"safetensors": "0.5.3",
|
136 |
+
"requests": "2.32.4",
|
137 |
+
"regex": "2024.11.6",
|
138 |
+
"aiohttp": "3.12.13",
|
139 |
+
"tabulate": "0.9.0",
|
140 |
+
"certifi": "2025.6.15",
|
141 |
+
"accelerate": "1.8.0",
|
142 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
143 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
144 |
+
"click": "8.2.1",
|
145 |
+
"typing_extensions": "4.12.2",
|
146 |
+
"attrs": "25.3.0",
|
147 |
+
"exceptiongroup": "1.3.0",
|
148 |
+
"tenacity": "9.1.2",
|
149 |
+
"pytz": "2025.2",
|
150 |
+
"aiohappyeyeballs": "2.6.1",
|
151 |
+
"python-dateutil": "2.9.0.post0",
|
152 |
+
"torch": "2.7.1",
|
153 |
+
"python-dotenv": "1.1.0",
|
154 |
+
"httpx": "0.28.1",
|
155 |
+
"matplotlib": "3.10.3",
|
156 |
+
"xxhash": "3.5.0",
|
157 |
+
"PyYAML": "6.0.2",
|
158 |
+
"huggingface-hub": "0.33.0",
|
159 |
+
"colorama": "0.4.6",
|
160 |
+
"rpds-py": "0.25.1",
|
161 |
+
"threadpoolctl": "3.6.0",
|
162 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
163 |
+
"jaraco.collections": "5.1.0",
|
164 |
+
"tomli": "2.0.1",
|
165 |
+
"backports.tarfile": "1.2.0",
|
166 |
+
"jaraco.context": "5.3.0",
|
167 |
+
"typeguard": "4.3.0",
|
168 |
+
"autocommand": "2.2.2",
|
169 |
+
"jaraco.text": "3.12.1",
|
170 |
+
"more-itertools": "10.3.0",
|
171 |
+
"platformdirs": "4.2.2",
|
172 |
+
"inflect": "7.3.1",
|
173 |
+
"jaraco.functools": "4.0.1"
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"results": {
|
177 |
+
"bias": {
|
178 |
+
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.34444444444444444,
|
180 |
+
"accuracy_ci_low": 0.24444444444444444,
|
181 |
+
"accuracy_ci_high": 0.4444444444444444,
|
182 |
+
"score_name": "accuracy",
|
183 |
+
"score": 0.34444444444444444,
|
184 |
+
"score_ci_high": 0.4444444444444444,
|
185 |
+
"score_ci_low": 0.24444444444444444,
|
186 |
+
"num_of_instances": 90
|
187 |
+
},
|
188 |
+
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.37777777777777777,
|
190 |
+
"accuracy_ci_low": 0.2777777777777778,
|
191 |
+
"accuracy_ci_high": 0.4817573779444034,
|
192 |
+
"score_name": "accuracy",
|
193 |
+
"score": 0.37777777777777777,
|
194 |
+
"score_ci_high": 0.4817573779444034,
|
195 |
+
"score_ci_low": 0.2777777777777778,
|
196 |
+
"num_of_instances": 90
|
197 |
+
},
|
198 |
+
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.4222222222222222,
|
200 |
+
"accuracy_ci_low": 0.32222222222222224,
|
201 |
+
"accuracy_ci_high": 0.5222222222222223,
|
202 |
+
"score_name": "accuracy",
|
203 |
+
"score": 0.4222222222222222,
|
204 |
+
"score_ci_high": 0.5222222222222223,
|
205 |
+
"score_ci_low": 0.32222222222222224,
|
206 |
+
"num_of_instances": 90
|
207 |
+
},
|
208 |
+
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.3888888888888889,
|
210 |
+
"accuracy_ci_low": 0.28888888888888886,
|
211 |
+
"accuracy_ci_high": 0.4888888888888889,
|
212 |
+
"score_name": "accuracy",
|
213 |
+
"score": 0.3888888888888889,
|
214 |
+
"score_ci_high": 0.4888888888888889,
|
215 |
+
"score_ci_low": 0.28888888888888886,
|
216 |
+
"num_of_instances": 90
|
217 |
+
},
|
218 |
+
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.3333333333333333,
|
220 |
+
"accuracy_ci_low": 0.24444444444444444,
|
221 |
+
"accuracy_ci_high": 0.43333333333333335,
|
222 |
+
"score_name": "accuracy",
|
223 |
+
"score": 0.3333333333333333,
|
224 |
+
"score_ci_high": 0.43333333333333335,
|
225 |
+
"score_ci_low": 0.24444444444444444,
|
226 |
+
"num_of_instances": 90
|
227 |
+
},
|
228 |
+
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.45555555555555555,
|
230 |
+
"accuracy_ci_low": 0.35555555555555557,
|
231 |
+
"accuracy_ci_high": 0.5555555555555556,
|
232 |
+
"score_name": "accuracy",
|
233 |
+
"score": 0.45555555555555555,
|
234 |
+
"score_ci_high": 0.5555555555555556,
|
235 |
+
"score_ci_low": 0.35555555555555557,
|
236 |
+
"num_of_instances": 90
|
237 |
+
},
|
238 |
+
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.4777777777777778,
|
240 |
+
"accuracy_ci_low": 0.37777777777777777,
|
241 |
+
"accuracy_ci_high": 0.5888888888888889,
|
242 |
+
"score_name": "accuracy",
|
243 |
+
"score": 0.4777777777777778,
|
244 |
+
"score_ci_high": 0.5888888888888889,
|
245 |
+
"score_ci_low": 0.37777777777777777,
|
246 |
+
"num_of_instances": 90
|
247 |
+
},
|
248 |
+
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.4222222222222222,
|
250 |
+
"accuracy_ci_low": 0.3333333333333333,
|
251 |
+
"accuracy_ci_high": 0.5333333333333333,
|
252 |
+
"score_name": "accuracy",
|
253 |
+
"score": 0.4222222222222222,
|
254 |
+
"score_ci_high": 0.5333333333333333,
|
255 |
+
"score_ci_low": 0.3333333333333333,
|
256 |
+
"num_of_instances": 90
|
257 |
+
},
|
258 |
+
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.4888888888888889,
|
260 |
+
"accuracy_ci_low": 0.37777777777777777,
|
261 |
+
"accuracy_ci_high": 0.5888888888888889,
|
262 |
+
"score_name": "accuracy",
|
263 |
+
"score": 0.4888888888888889,
|
264 |
+
"score_ci_high": 0.5888888888888889,
|
265 |
+
"score_ci_low": 0.37777777777777777,
|
266 |
+
"num_of_instances": 90
|
267 |
+
},
|
268 |
+
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.5111111111111111,
|
270 |
+
"accuracy_ci_low": 0.4111111111111111,
|
271 |
+
"accuracy_ci_high": 0.6111111111111112,
|
272 |
+
"score_name": "accuracy",
|
273 |
+
"score": 0.5111111111111111,
|
274 |
+
"score_ci_high": 0.6111111111111112,
|
275 |
+
"score_ci_low": 0.4111111111111111,
|
276 |
+
"num_of_instances": 90
|
277 |
+
},
|
278 |
+
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.4,
|
280 |
+
"accuracy_ci_low": 0.3,
|
281 |
+
"accuracy_ci_high": 0.5111111111111111,
|
282 |
+
"score_name": "accuracy",
|
283 |
+
"score": 0.4,
|
284 |
+
"score_ci_high": 0.5111111111111111,
|
285 |
+
"score_ci_low": 0.3,
|
286 |
+
"num_of_instances": 90
|
287 |
+
},
|
288 |
+
"score": 0.4202020202020202,
|
289 |
+
"score_name": "subsets_mean",
|
290 |
+
"num_of_instances": 990
|
291 |
+
},
|
292 |
+
"chatbot_abilities": {
|
293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
+
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.5,
|
296 |
+
"score": 0.5,
|
297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
+
},
|
299 |
+
"score": 0.5,
|
300 |
+
"score_name": "subsets_mean",
|
301 |
+
"num_of_instances": 500
|
302 |
+
},
|
303 |
+
"entity_extraction": {
|
304 |
+
"universal_ner_en_ewt": {
|
305 |
+
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.4545454545454546,
|
307 |
+
"f1_Organization": 0.2292490118577075,
|
308 |
+
"f1_Location": 0.227027027027027,
|
309 |
+
"f1_macro": 0.3036071644767297,
|
310 |
+
"recall_macro": 0.22361127874697093,
|
311 |
+
"precision_macro": 0.5114786350741407,
|
312 |
+
"in_classes_support": 0.7476923076923077,
|
313 |
+
"f1_micro": 0.2941176470588235,
|
314 |
+
"recall_micro": 0.23809523809523808,
|
315 |
+
"precision_micro": 0.38461538461538464,
|
316 |
+
"score": 0.2941176470588235,
|
317 |
+
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.258452733561647,
|
319 |
+
"score_ci_high": 0.33538361058823213,
|
320 |
+
"f1_micro_ci_low": 0.258452733561647,
|
321 |
+
"f1_micro_ci_high": 0.33538361058823213
|
322 |
+
},
|
323 |
+
"score": 0.2941176470588235,
|
324 |
+
"score_name": "subsets_mean",
|
325 |
+
"num_of_instances": 1000
|
326 |
+
},
|
327 |
+
"knowledge": {
|
328 |
+
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.30985915492957744,
|
330 |
+
"accuracy_ci_low": 0.19718309859154928,
|
331 |
+
"accuracy_ci_high": 0.428782341390215,
|
332 |
+
"score_name": "accuracy",
|
333 |
+
"score": 0.30985915492957744,
|
334 |
+
"score_ci_high": 0.428782341390215,
|
335 |
+
"score_ci_low": 0.19718309859154928,
|
336 |
+
"num_of_instances": 71
|
337 |
+
},
|
338 |
+
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.15492957746478872,
|
340 |
+
"accuracy_ci_low": 0.08450704225352113,
|
341 |
+
"accuracy_ci_high": 0.2535211267605634,
|
342 |
+
"score_name": "accuracy",
|
343 |
+
"score": 0.15492957746478872,
|
344 |
+
"score_ci_high": 0.2535211267605634,
|
345 |
+
"score_ci_low": 0.08450704225352113,
|
346 |
+
"num_of_instances": 71
|
347 |
+
},
|
348 |
+
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.18309859154929578,
|
350 |
+
"accuracy_ci_low": 0.11267605633802817,
|
351 |
+
"accuracy_ci_high": 0.28169014084507044,
|
352 |
+
"score_name": "accuracy",
|
353 |
+
"score": 0.18309859154929578,
|
354 |
+
"score_ci_high": 0.28169014084507044,
|
355 |
+
"score_ci_low": 0.11267605633802817,
|
356 |
+
"num_of_instances": 71
|
357 |
+
},
|
358 |
+
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.19718309859154928,
|
360 |
+
"accuracy_ci_low": 0.1267605633802817,
|
361 |
+
"accuracy_ci_high": 0.30985915492957744,
|
362 |
+
"score_name": "accuracy",
|
363 |
+
"score": 0.19718309859154928,
|
364 |
+
"score_ci_high": 0.30985915492957744,
|
365 |
+
"score_ci_low": 0.1267605633802817,
|
366 |
+
"num_of_instances": 71
|
367 |
+
},
|
368 |
+
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.4084507042253521,
|
370 |
+
"accuracy_ci_low": 0.29577464788732394,
|
371 |
+
"accuracy_ci_high": 0.5211267605633803,
|
372 |
+
"score_name": "accuracy",
|
373 |
+
"score": 0.4084507042253521,
|
374 |
+
"score_ci_high": 0.5211267605633803,
|
375 |
+
"score_ci_low": 0.29577464788732394,
|
376 |
+
"num_of_instances": 71
|
377 |
+
},
|
378 |
+
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.15492957746478872,
|
380 |
+
"accuracy_ci_low": 0.08450704225352113,
|
381 |
+
"accuracy_ci_high": 0.2535211267605634,
|
382 |
+
"score_name": "accuracy",
|
383 |
+
"score": 0.15492957746478872,
|
384 |
+
"score_ci_high": 0.2535211267605634,
|
385 |
+
"score_ci_low": 0.08450704225352113,
|
386 |
+
"num_of_instances": 71
|
387 |
+
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.16901408450704225,
|
390 |
+
"accuracy_ci_low": 0.09859154929577464,
|
391 |
+
"accuracy_ci_high": 0.2535211267605634,
|
392 |
+
"score_name": "accuracy",
|
393 |
+
"score": 0.16901408450704225,
|
394 |
+
"score_ci_high": 0.2535211267605634,
|
395 |
+
"score_ci_low": 0.09859154929577464,
|
396 |
+
"num_of_instances": 71
|
397 |
+
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.18309859154929578,
|
400 |
+
"accuracy_ci_low": 0.09859154929577464,
|
401 |
+
"accuracy_ci_high": 0.28169014084507044,
|
402 |
+
"score_name": "accuracy",
|
403 |
+
"score": 0.18309859154929578,
|
404 |
+
"score_ci_high": 0.28169014084507044,
|
405 |
+
"score_ci_low": 0.09859154929577464,
|
406 |
+
"num_of_instances": 71
|
407 |
+
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.11267605633802817,
|
410 |
+
"accuracy_ci_low": 0.056338028169014086,
|
411 |
+
"accuracy_ci_high": 0.2112676056338028,
|
412 |
+
"score_name": "accuracy",
|
413 |
+
"score": 0.11267605633802817,
|
414 |
+
"score_ci_high": 0.2112676056338028,
|
415 |
+
"score_ci_low": 0.056338028169014086,
|
416 |
+
"num_of_instances": 71
|
417 |
+
},
|
418 |
+
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.09859154929577464,
|
420 |
+
"accuracy_ci_low": 0.04225352112676056,
|
421 |
+
"accuracy_ci_high": 0.18309859154929578,
|
422 |
+
"score_name": "accuracy",
|
423 |
+
"score": 0.09859154929577464,
|
424 |
+
"score_ci_high": 0.18309859154929578,
|
425 |
+
"score_ci_low": 0.04225352112676056,
|
426 |
+
"num_of_instances": 71
|
427 |
+
},
|
428 |
+
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.15492957746478872,
|
430 |
+
"accuracy_ci_low": 0.08450704225352113,
|
431 |
+
"accuracy_ci_high": 0.2645029324911099,
|
432 |
+
"score_name": "accuracy",
|
433 |
+
"score": 0.15492957746478872,
|
434 |
+
"score_ci_high": 0.2645029324911099,
|
435 |
+
"score_ci_low": 0.08450704225352113,
|
436 |
+
"num_of_instances": 71
|
437 |
+
},
|
438 |
+
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.16901408450704225,
|
440 |
+
"accuracy_ci_low": 0.08450704225352113,
|
441 |
+
"accuracy_ci_high": 0.2676056338028169,
|
442 |
+
"score_name": "accuracy",
|
443 |
+
"score": 0.16901408450704225,
|
444 |
+
"score_ci_high": 0.2676056338028169,
|
445 |
+
"score_ci_low": 0.08450704225352113,
|
446 |
+
"num_of_instances": 71
|
447 |
+
},
|
448 |
+
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.04225352112676056,
|
450 |
+
"accuracy_ci_low": 0.014084507042253521,
|
451 |
+
"accuracy_ci_high": 0.11267605633802817,
|
452 |
+
"score_name": "accuracy",
|
453 |
+
"score": 0.04225352112676056,
|
454 |
+
"score_ci_high": 0.11267605633802817,
|
455 |
+
"score_ci_low": 0.014084507042253521,
|
456 |
+
"num_of_instances": 71
|
457 |
+
},
|
458 |
+
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.38028169014084506,
|
460 |
+
"accuracy_ci_low": 0.2676056338028169,
|
461 |
+
"accuracy_ci_high": 0.49295774647887325,
|
462 |
+
"score_name": "accuracy",
|
463 |
+
"score": 0.38028169014084506,
|
464 |
+
"score_ci_high": 0.49295774647887325,
|
465 |
+
"score_ci_low": 0.2676056338028169,
|
466 |
+
"num_of_instances": 71
|
467 |
+
},
|
468 |
+
"score": 0.19416498993963782,
|
469 |
+
"score_name": "subsets_mean",
|
470 |
+
"num_of_instances": 994
|
471 |
+
},
|
472 |
+
"legal": {
|
473 |
+
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.23961477119371857,
|
475 |
+
"f1_suggestive": 0.2564102564102564,
|
476 |
+
"f1_generic": 0.22727272727272727,
|
477 |
+
"f1_fanciful": 0.3076923076923077,
|
478 |
+
"f1_descriptive": 0.09090909090909091,
|
479 |
+
"f1_arbitrary": 0.3157894736842105,
|
480 |
+
"f1_macro_ci_low": 0.16470343495436598,
|
481 |
+
"f1_macro_ci_high": 0.3402970569238248,
|
482 |
+
"score_name": "f1_micro",
|
483 |
+
"score": 0.2485207100591716,
|
484 |
+
"score_ci_high": 0.3565344458058143,
|
485 |
+
"score_ci_low": 0.16674772165037405,
|
486 |
+
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.24705882352941178,
|
488 |
+
"accuracy_ci_low": 0.16470588235294117,
|
489 |
+
"accuracy_ci_high": 0.35294117647058826,
|
490 |
+
"f1_micro": 0.2485207100591716,
|
491 |
+
"f1_micro_ci_low": 0.16674772165037405,
|
492 |
+
"f1_micro_ci_high": 0.3565344458058143
|
493 |
+
},
|
494 |
+
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.45749941134918765,
|
496 |
+
"f1_no": 0.656934306569343,
|
497 |
+
"f1_yes": 0.25806451612903225,
|
498 |
+
"f1_macro_ci_low": 0.4007210254458121,
|
499 |
+
"f1_macro_ci_high": 0.523830186580906,
|
500 |
+
"score_name": "f1_micro",
|
501 |
+
"score": 0.5326633165829145,
|
502 |
+
"score_ci_high": 0.5979899497487438,
|
503 |
+
"score_ci_low": 0.4676003540226054,
|
504 |
+
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.53,
|
506 |
+
"accuracy_ci_low": 0.465,
|
507 |
+
"accuracy_ci_high": 0.595,
|
508 |
+
"f1_micro": 0.5326633165829145,
|
509 |
+
"f1_micro_ci_low": 0.4676003540226054,
|
510 |
+
"f1_micro_ci_high": 0.5979899497487438
|
511 |
+
},
|
512 |
+
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.14157824173747313,
|
514 |
+
"f1_conclusion": 0.08333333333333333,
|
515 |
+
"f1_analysis": 0.2900763358778626,
|
516 |
+
"f1_decree": 0.06666666666666667,
|
517 |
+
"f1_issue": 0.047619047619047616,
|
518 |
+
"f1_facts": 0.13333333333333333,
|
519 |
+
"f1_rule": 0.1935483870967742,
|
520 |
+
"f1_procedural history": 0.17647058823529413,
|
521 |
+
"f1_macro_ci_low": 0.09927560143449254,
|
522 |
+
"f1_macro_ci_high": 0.19963080582055887,
|
523 |
+
"score_name": "f1_micro",
|
524 |
+
"score": 0.17857142857142858,
|
525 |
+
"score_ci_high": 0.23469387755102042,
|
526 |
+
"score_ci_low": 0.1235825927993309,
|
527 |
+
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.175,
|
529 |
+
"accuracy_ci_low": 0.12,
|
530 |
+
"accuracy_ci_high": 0.23,
|
531 |
+
"f1_micro": 0.17857142857142858,
|
532 |
+
"f1_micro_ci_low": 0.1235825927993309,
|
533 |
+
"f1_micro_ci_high": 0.23469387755102042
|
534 |
+
},
|
535 |
+
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.41428571428571426,
|
537 |
+
"f1_yes": 0.4857142857142857,
|
538 |
+
"f1_no": 0.34285714285714286,
|
539 |
+
"f1_macro_ci_low": 0.35160188806998965,
|
540 |
+
"f1_macro_ci_high": 0.4808674529166947,
|
541 |
+
"score_name": "f1_micro",
|
542 |
+
"score": 0.42077922077922075,
|
543 |
+
"score_ci_high": 0.4846763437420372,
|
544 |
+
"score_ci_low": 0.35535075567851304,
|
545 |
+
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.405,
|
547 |
+
"accuracy_ci_low": 0.34,
|
548 |
+
"accuracy_ci_high": 0.465,
|
549 |
+
"f1_micro": 0.42077922077922075,
|
550 |
+
"f1_micro_ci_low": 0.35535075567851304,
|
551 |
+
"f1_micro_ci_high": 0.4846763437420372
|
552 |
+
},
|
553 |
+
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.6461038961038961,
|
555 |
+
"f1_yes": 0.6428571428571429,
|
556 |
+
"f1_no": 0.6493506493506493,
|
557 |
+
"f1_macro_ci_low": 0.5344060631732589,
|
558 |
+
"f1_macro_ci_high": 0.745107042681059,
|
559 |
+
"score_name": "f1_micro",
|
560 |
+
"score": 0.6459627329192547,
|
561 |
+
"score_ci_high": 0.7393939393939394,
|
562 |
+
"score_ci_low": 0.5344831234199472,
|
563 |
+
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.611764705882353,
|
565 |
+
"accuracy_ci_low": 0.49411764705882355,
|
566 |
+
"accuracy_ci_high": 0.7058823529411765,
|
567 |
+
"f1_micro": 0.6459627329192547,
|
568 |
+
"f1_micro_ci_low": 0.5344831234199472,
|
569 |
+
"f1_micro_ci_high": 0.7393939393939394
|
570 |
+
},
|
571 |
+
"score": 0.40529948178239805,
|
572 |
+
"score_name": "subsets_mean",
|
573 |
+
"num_of_instances": 770
|
574 |
+
},
|
575 |
+
"news_classification": {
|
576 |
+
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.2276162418340563,
|
578 |
+
"f1_cars": 0.5581395348837209,
|
579 |
+
"f1_windows x": 0.0,
|
580 |
+
"f1_atheism": 0.14634146341463414,
|
581 |
+
"f1_religion": 0.18181818181818182,
|
582 |
+
"f1_medicine": 0.2962962962962963,
|
583 |
+
"f1_hockey": 0.48936170212765956,
|
584 |
+
"f1_christianity": 0.3287671232876712,
|
585 |
+
"f1_computer graphics": 0.13513513513513514,
|
586 |
+
"f1_microsoft windows": 0.03571428571428571,
|
587 |
+
"f1_middle east": 0.125,
|
588 |
+
"f1_motorcycles": 0.1917808219178082,
|
589 |
+
"f1_mac hardware": 0.0,
|
590 |
+
"f1_for sale": 0.0,
|
591 |
+
"f1_guns": 0.10714285714285714,
|
592 |
+
"f1_politics": 0.2361111111111111,
|
593 |
+
"f1_space": 0.39436619718309857,
|
594 |
+
"f1_pc hardware": 0.0,
|
595 |
+
"f1_cryptography": 0.32432432432432434,
|
596 |
+
"f1_baseball": 0.7610619469026548,
|
597 |
+
"f1_electronics": 0.24096385542168675,
|
598 |
+
"f1_macro_ci_low": 0.20272698040510803,
|
599 |
+
"f1_macro_ci_high": 0.2532565570480989,
|
600 |
+
"score_name": "f1_micro",
|
601 |
+
"score": 0.2679830747531735,
|
602 |
+
"score_ci_high": 0.2978873823161142,
|
603 |
+
"score_ci_low": 0.2355693496528132,
|
604 |
+
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.19,
|
606 |
+
"accuracy_ci_low": 0.166,
|
607 |
+
"accuracy_ci_high": 0.213,
|
608 |
+
"f1_micro": 0.2679830747531735,
|
609 |
+
"f1_micro_ci_low": 0.2355693496528132,
|
610 |
+
"f1_micro_ci_high": 0.2978873823161142
|
611 |
+
},
|
612 |
+
"score": 0.2679830747531735,
|
613 |
+
"score_name": "subsets_mean",
|
614 |
+
"num_of_instances": 1000
|
615 |
+
},
|
616 |
+
"product_help": {
|
617 |
+
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.22216862358987682,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.696078431372549,
|
620 |
+
"f1_credit card or prepaid card": 0.1518987341772152,
|
621 |
+
"f1_debt collection": 0.22535211267605634,
|
622 |
+
"f1_checking or savings account": 0.2222222222222222,
|
623 |
+
"f1_vehicle loan or lease": 0.08333333333333333,
|
624 |
+
"f1_payday loan or title loan or personal loan": 0.0,
|
625 |
+
"f1_mortgage": 0.3157894736842105,
|
626 |
+
"f1_money transfer or virtual currency or money service": 0.07407407407407407,
|
627 |
+
"f1_student loan": 0.23076923076923078,
|
628 |
+
"f1_macro_ci_low": 0.1842187730862839,
|
629 |
+
"f1_macro_ci_high": 0.27331239167462773,
|
630 |
+
"score_name": "f1_micro",
|
631 |
+
"score": 0.5611940298507463,
|
632 |
+
"score_ci_high": 0.5951679434295816,
|
633 |
+
"score_ci_low": 0.5287106773010755,
|
634 |
+
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.47,
|
636 |
+
"accuracy_ci_low": 0.43992255182914,
|
637 |
+
"accuracy_ci_high": 0.504,
|
638 |
+
"f1_micro": 0.5611940298507463,
|
639 |
+
"f1_micro_ci_low": 0.5287106773010755,
|
640 |
+
"f1_micro_ci_high": 0.5951679434295816
|
641 |
+
},
|
642 |
+
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.328995002480145,
|
644 |
+
"f1_mortgages and loans": 0.3076923076923077,
|
645 |
+
"f1_debt collection": 0.45038167938931295,
|
646 |
+
"f1_credit card": 0.288135593220339,
|
647 |
+
"f1_credit reporting": 0.5,
|
648 |
+
"f1_retail banking": 0.09876543209876543,
|
649 |
+
"f1_macro_ci_low": 0.2869304161724212,
|
650 |
+
"f1_macro_ci_high": 0.37644896631739505,
|
651 |
+
"score_name": "f1_micro",
|
652 |
+
"score": 0.39184597961494905,
|
653 |
+
"score_ci_high": 0.43742334452481374,
|
654 |
+
"score_ci_low": 0.3475735981074829,
|
655 |
+
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.346,
|
657 |
+
"accuracy_ci_low": 0.306,
|
658 |
+
"accuracy_ci_high": 0.39,
|
659 |
+
"f1_micro": 0.39184597961494905,
|
660 |
+
"f1_micro_ci_low": 0.3475735981074829,
|
661 |
+
"f1_micro_ci_high": 0.43742334452481374
|
662 |
+
},
|
663 |
+
"score": 0.47652000473284767,
|
664 |
+
"score_name": "subsets_mean",
|
665 |
+
"num_of_instances": 1500
|
666 |
+
},
|
667 |
+
"qa_finance": {
|
668 |
+
"fin_qa": {
|
669 |
+
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.027,
|
671 |
+
"score": 0.027,
|
672 |
+
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.022,
|
674 |
+
"program_accuracy_ci_low": 0.019,
|
675 |
+
"program_accuracy_ci_high": 0.039,
|
676 |
+
"score_ci_low": 0.019,
|
677 |
+
"score_ci_high": 0.039,
|
678 |
+
"execution_accuracy_ci_low": 0.014,
|
679 |
+
"execution_accuracy_ci_high": 0.033
|
680 |
+
},
|
681 |
+
"score": 0.027,
|
682 |
+
"score_name": "subsets_mean",
|
683 |
+
"num_of_instances": 1000
|
684 |
+
},
|
685 |
+
"rag_general": {
|
686 |
+
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.3192591413018271,
|
688 |
+
"recall": 0.38991119820901343,
|
689 |
+
"f1": 0.2793088311088913,
|
690 |
+
"precision_ci_low": 0.2975520115964286,
|
691 |
+
"precision_ci_high": 0.3411640955753825,
|
692 |
+
"recall_ci_low": 0.3735203455510533,
|
693 |
+
"recall_ci_high": 0.40905456252856565,
|
694 |
+
"f1_ci_low": 0.26376571917741815,
|
695 |
+
"f1_ci_high": 0.2967523754621306,
|
696 |
+
"score_name": "f1",
|
697 |
+
"score": 0.2793088311088913,
|
698 |
+
"score_ci_high": 0.2967523754621306,
|
699 |
+
"score_ci_low": 0.26376571917741815,
|
700 |
+
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.5586816079914569,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6062885612249375,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5444585413982471,
|
704 |
+
"faithfullness_f1_token_overlap": 0.2284069948073265,
|
705 |
+
"faithfullness_recall_token_overlap": 0.1691231317769043,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5822539469357956,
|
707 |
+
"correctness_f1_token_overlap": 0.2793088311088913,
|
708 |
+
"correctness_recall_token_overlap": 0.38991119820901343,
|
709 |
+
"correctness_precision_token_overlap": 0.3192591413018271
|
710 |
+
},
|
711 |
+
"score": 0.2793088311088913,
|
712 |
+
"score_name": "subsets_mean",
|
713 |
+
"num_of_instances": 600
|
714 |
+
},
|
715 |
+
"reasoning": {
|
716 |
+
"hellaswag": {
|
717 |
+
"accuracy": 0.268,
|
718 |
+
"accuracy_ci_low": 0.241,
|
719 |
+
"accuracy_ci_high": 0.294,
|
720 |
+
"score_name": "accuracy",
|
721 |
+
"score": 0.268,
|
722 |
+
"score_ci_high": 0.294,
|
723 |
+
"score_ci_low": 0.241,
|
724 |
+
"num_of_instances": 1000
|
725 |
+
},
|
726 |
+
"openbook_qa": {
|
727 |
+
"accuracy": 0.536,
|
728 |
+
"accuracy_ci_low": 0.496,
|
729 |
+
"accuracy_ci_high": 0.582,
|
730 |
+
"score_name": "accuracy",
|
731 |
+
"score": 0.536,
|
732 |
+
"score_ci_high": 0.582,
|
733 |
+
"score_ci_low": 0.496,
|
734 |
+
"num_of_instances": 500
|
735 |
+
},
|
736 |
+
"score": 0.402,
|
737 |
+
"score_name": "subsets_mean",
|
738 |
+
"num_of_instances": 1500
|
739 |
+
},
|
740 |
+
"safety": {
|
741 |
+
"attaq_500": {
|
742 |
+
"severity_critical": 0.0,
|
743 |
+
"severity_high": 2.0,
|
744 |
+
"severity_medium": 4.0,
|
745 |
+
"severity_low": 94.0,
|
746 |
+
"category_harmful_info": 0.8100087784142013,
|
747 |
+
"category_deception": 0.7944973061482112,
|
748 |
+
"category_explicit_content": 0.8078695727192393,
|
749 |
+
"category_violence": 0.8026550727940741,
|
750 |
+
"category_discrimination": 0.7023729502208649,
|
751 |
+
"category_substance_abuse": 0.857964905185832,
|
752 |
+
"category_pii": 0.8635181895008794,
|
753 |
+
"safety": 0.8019619875691001,
|
754 |
+
"safety_ci_low": 0.7757910180183378,
|
755 |
+
"safety_ci_high": 0.8246888836931806,
|
756 |
+
"score_name": "safety",
|
757 |
+
"score": 0.8019619875691001,
|
758 |
+
"score_ci_high": 0.8246888836931806,
|
759 |
+
"score_ci_low": 0.7757910180183378,
|
760 |
+
"num_of_instances": 100
|
761 |
+
},
|
762 |
+
"score": 0.8019619875691001,
|
763 |
+
"score_name": "subsets_mean",
|
764 |
+
"num_of_instances": 100
|
765 |
+
},
|
766 |
+
"summarization": {
|
767 |
+
"billsum_document_filtered_to_6000_chars": {
|
768 |
+
"num_of_instances": 528,
|
769 |
+
"rouge2": 0.20059977802050133,
|
770 |
+
"rouge1": 0.40705433873110675,
|
771 |
+
"rougeL": 0.2833644024269708,
|
772 |
+
"score": 0.2833644024269708,
|
773 |
+
"score_name": "rougeL",
|
774 |
+
"rougeLsum": 0.3484006202872336,
|
775 |
+
"rouge2_ci_low": 0.19350867847403447,
|
776 |
+
"rouge2_ci_high": 0.208746306103288,
|
777 |
+
"rouge1_ci_low": 0.39709724247372435,
|
778 |
+
"rouge1_ci_high": 0.41741048664304065,
|
779 |
+
"rougeL_ci_low": 0.27611154671120425,
|
780 |
+
"rougeL_ci_high": 0.291401862882032,
|
781 |
+
"score_ci_low": 0.27611154671120425,
|
782 |
+
"score_ci_high": 0.291401862882032,
|
783 |
+
"rougeLsum_ci_low": 0.3393182862844001,
|
784 |
+
"rougeLsum_ci_high": 0.35859357766397365
|
785 |
+
},
|
786 |
+
"tldr_document_filtered_to_6000_chars": {
|
787 |
+
"num_of_instances": 1000,
|
788 |
+
"rouge2": 0.015549345441433063,
|
789 |
+
"rouge1": 0.11519799005534682,
|
790 |
+
"rougeL": 0.0840670089559512,
|
791 |
+
"score": 0.0840670089559512,
|
792 |
+
"score_name": "rougeL",
|
793 |
+
"rougeLsum": 0.09578949363666936,
|
794 |
+
"rouge2_ci_low": 0.013931235527451928,
|
795 |
+
"rouge2_ci_high": 0.017483224052864014,
|
796 |
+
"rouge1_ci_low": 0.10965978969392036,
|
797 |
+
"rouge1_ci_high": 0.12037813563278642,
|
798 |
+
"rougeL_ci_low": 0.08040950716646748,
|
799 |
+
"rougeL_ci_high": 0.08756332939065774,
|
800 |
+
"score_ci_low": 0.08040950716646748,
|
801 |
+
"score_ci_high": 0.08756332939065774,
|
802 |
+
"rougeLsum_ci_low": 0.0913667622653291,
|
803 |
+
"rougeLsum_ci_high": 0.09990468987829387
|
804 |
+
},
|
805 |
+
"score": 0.183715705691461,
|
806 |
+
"score_name": "subsets_mean",
|
807 |
+
"num_of_instances": 1528
|
808 |
+
},
|
809 |
+
"translation": {
|
810 |
+
"mt_flores_101_ara_eng": {
|
811 |
+
"num_of_instances": 66,
|
812 |
+
"counts": [
|
813 |
+
975,
|
814 |
+
450,
|
815 |
+
239,
|
816 |
+
131
|
817 |
+
],
|
818 |
+
"totals": [
|
819 |
+
1762,
|
820 |
+
1696,
|
821 |
+
1630,
|
822 |
+
1564
|
823 |
+
],
|
824 |
+
"precisions": [
|
825 |
+
0.5533484676503972,
|
826 |
+
0.2653301886792453,
|
827 |
+
0.14662576687116563,
|
828 |
+
0.08375959079283887
|
829 |
+
],
|
830 |
+
"bp": 1.0,
|
831 |
+
"sys_len": 1762,
|
832 |
+
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.20606657614931506,
|
834 |
+
"score": 0.20606657614931506,
|
835 |
+
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.18092500119305566,
|
837 |
+
"score_ci_high": 0.2337621289783969,
|
838 |
+
"sacrebleu_ci_low": 0.18092500119305566,
|
839 |
+
"sacrebleu_ci_high": 0.2337621289783969
|
840 |
+
},
|
841 |
+
"mt_flores_101_deu_eng": {
|
842 |
+
"num_of_instances": 66,
|
843 |
+
"counts": [
|
844 |
+
1120,
|
845 |
+
595,
|
846 |
+
353,
|
847 |
+
211
|
848 |
+
],
|
849 |
+
"totals": [
|
850 |
+
1761,
|
851 |
+
1695,
|
852 |
+
1629,
|
853 |
+
1563
|
854 |
+
],
|
855 |
+
"precisions": [
|
856 |
+
0.6360022714366838,
|
857 |
+
0.35103244837758113,
|
858 |
+
0.21669736034376919,
|
859 |
+
0.13499680102367242
|
860 |
+
],
|
861 |
+
"bp": 1.0,
|
862 |
+
"sys_len": 1761,
|
863 |
+
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.2842796401730753,
|
865 |
+
"score": 0.2842796401730753,
|
866 |
+
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.25109489325457895,
|
868 |
+
"score_ci_high": 0.31935419283000793,
|
869 |
+
"sacrebleu_ci_low": 0.25109489325457895,
|
870 |
+
"sacrebleu_ci_high": 0.31935419283000793
|
871 |
+
},
|
872 |
+
"mt_flores_101_eng_ara": {
|
873 |
+
"num_of_instances": 66,
|
874 |
+
"counts": [
|
875 |
+
485,
|
876 |
+
117,
|
877 |
+
36,
|
878 |
+
10
|
879 |
+
],
|
880 |
+
"totals": [
|
881 |
+
1763,
|
882 |
+
1697,
|
883 |
+
1631,
|
884 |
+
1565
|
885 |
+
],
|
886 |
+
"precisions": [
|
887 |
+
0.2750992626205332,
|
888 |
+
0.06894519740718916,
|
889 |
+
0.022072348252605765,
|
890 |
+
0.006389776357827476
|
891 |
+
],
|
892 |
+
"bp": 1.0,
|
893 |
+
"sys_len": 1763,
|
894 |
+
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.04044193351575661,
|
896 |
+
"score": 0.04044193351575661,
|
897 |
+
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.026004571558982913,
|
899 |
+
"score_ci_high": 0.05783535488306116,
|
900 |
+
"sacrebleu_ci_low": 0.026004571558982913,
|
901 |
+
"sacrebleu_ci_high": 0.05783535488306116
|
902 |
+
},
|
903 |
+
"mt_flores_101_eng_deu": {
|
904 |
+
"num_of_instances": 66,
|
905 |
+
"counts": [
|
906 |
+
946,
|
907 |
+
441,
|
908 |
+
236,
|
909 |
+
135
|
910 |
+
],
|
911 |
+
"totals": [
|
912 |
+
1822,
|
913 |
+
1756,
|
914 |
+
1690,
|
915 |
+
1624
|
916 |
+
],
|
917 |
+
"precisions": [
|
918 |
+
0.5192096597145993,
|
919 |
+
0.2511389521640091,
|
920 |
+
0.13964497041420118,
|
921 |
+
0.08312807881773399
|
922 |
+
],
|
923 |
+
"bp": 0.9928903773336073,
|
924 |
+
"sys_len": 1822,
|
925 |
+
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.19584332441613614,
|
927 |
+
"score": 0.19584332441613614,
|
928 |
+
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.17291558306420995,
|
930 |
+
"score_ci_high": 0.2421406469227526,
|
931 |
+
"sacrebleu_ci_low": 0.17291558306420995,
|
932 |
+
"sacrebleu_ci_high": 0.2421406469227526
|
933 |
+
},
|
934 |
+
"mt_flores_101_eng_fra": {
|
935 |
+
"num_of_instances": 66,
|
936 |
+
"counts": [
|
937 |
+
1235,
|
938 |
+
733,
|
939 |
+
491,
|
940 |
+
334
|
941 |
+
],
|
942 |
+
"totals": [
|
943 |
+
2003,
|
944 |
+
1937,
|
945 |
+
1871,
|
946 |
+
1805
|
947 |
+
],
|
948 |
+
"precisions": [
|
949 |
+
0.6165751372940589,
|
950 |
+
0.37842023748064013,
|
951 |
+
0.26242650988776056,
|
952 |
+
0.1850415512465374
|
953 |
+
],
|
954 |
+
"bp": 0.968069571391973,
|
955 |
+
"sys_len": 2003,
|
956 |
+
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.31583910573917306,
|
958 |
+
"score": 0.31583910573917306,
|
959 |
+
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.28278290783286325,
|
961 |
+
"score_ci_high": 0.35809550404773266,
|
962 |
+
"sacrebleu_ci_low": 0.28278290783286325,
|
963 |
+
"sacrebleu_ci_high": 0.35809550404773266
|
964 |
+
},
|
965 |
+
"mt_flores_101_eng_kor": {
|
966 |
+
"num_of_instances": 66,
|
967 |
+
"counts": [
|
968 |
+
895,
|
969 |
+
316,
|
970 |
+
136,
|
971 |
+
65
|
972 |
+
],
|
973 |
+
"totals": [
|
974 |
+
2706,
|
975 |
+
2640,
|
976 |
+
2574,
|
977 |
+
2508
|
978 |
+
],
|
979 |
+
"precisions": [
|
980 |
+
0.3307464892830747,
|
981 |
+
0.11969696969696969,
|
982 |
+
0.05283605283605284,
|
983 |
+
0.025917065390749602
|
984 |
+
],
|
985 |
+
"bp": 1.0,
|
986 |
+
"sys_len": 2706,
|
987 |
+
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.08580718353389435,
|
989 |
+
"score": 0.08580718353389435,
|
990 |
+
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.0714070016419881,
|
992 |
+
"score_ci_high": 0.11393007113284326,
|
993 |
+
"sacrebleu_ci_low": 0.0714070016419881,
|
994 |
+
"sacrebleu_ci_high": 0.11393007113284326
|
995 |
+
},
|
996 |
+
"mt_flores_101_eng_por": {
|
997 |
+
"num_of_instances": 66,
|
998 |
+
"counts": [
|
999 |
+
1230,
|
1000 |
+
752,
|
1001 |
+
492,
|
1002 |
+
331
|
1003 |
+
],
|
1004 |
+
"totals": [
|
1005 |
+
1871,
|
1006 |
+
1805,
|
1007 |
+
1739,
|
1008 |
+
1673
|
1009 |
+
],
|
1010 |
+
"precisions": [
|
1011 |
+
0.6574024585783004,
|
1012 |
+
0.4166204986149584,
|
1013 |
+
0.28292121909143186,
|
1014 |
+
0.19784817692767484
|
1015 |
+
],
|
1016 |
+
"bp": 0.976235618350251,
|
1017 |
+
"sys_len": 1871,
|
1018 |
+
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.3435160489732885,
|
1020 |
+
"score": 0.3435160489732885,
|
1021 |
+
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.2938775361515651,
|
1023 |
+
"score_ci_high": 0.38124144600400245,
|
1024 |
+
"sacrebleu_ci_low": 0.2938775361515651,
|
1025 |
+
"sacrebleu_ci_high": 0.38124144600400245
|
1026 |
+
},
|
1027 |
+
"mt_flores_101_eng_ron": {
|
1028 |
+
"num_of_instances": 66,
|
1029 |
+
"counts": [
|
1030 |
+
1021,
|
1031 |
+
502,
|
1032 |
+
284,
|
1033 |
+
169
|
1034 |
+
],
|
1035 |
+
"totals": [
|
1036 |
+
1949,
|
1037 |
+
1883,
|
1038 |
+
1817,
|
1039 |
+
1751
|
1040 |
+
],
|
1041 |
+
"precisions": [
|
1042 |
+
0.5238583889173936,
|
1043 |
+
0.26659585767392463,
|
1044 |
+
0.15630159603742433,
|
1045 |
+
0.09651627641347801
|
1046 |
+
],
|
1047 |
+
"bp": 1.0,
|
1048 |
+
"sys_len": 1949,
|
1049 |
+
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.21424358052936537,
|
1051 |
+
"score": 0.21424358052936537,
|
1052 |
+
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.19040761320563873,
|
1054 |
+
"score_ci_high": 0.2508897923390456,
|
1055 |
+
"sacrebleu_ci_low": 0.19040761320563873,
|
1056 |
+
"sacrebleu_ci_high": 0.2508897923390456
|
1057 |
+
},
|
1058 |
+
"mt_flores_101_eng_spa": {
|
1059 |
+
"num_of_instances": 66,
|
1060 |
+
"counts": [
|
1061 |
+
1122,
|
1062 |
+
547,
|
1063 |
+
287,
|
1064 |
+
157
|
1065 |
+
],
|
1066 |
+
"totals": [
|
1067 |
+
1974,
|
1068 |
+
1908,
|
1069 |
+
1842,
|
1070 |
+
1776
|
1071 |
+
],
|
1072 |
+
"precisions": [
|
1073 |
+
0.5683890577507599,
|
1074 |
+
0.2866876310272537,
|
1075 |
+
0.15580890336590664,
|
1076 |
+
0.0884009009009009
|
1077 |
+
],
|
1078 |
+
"bp": 0.9391156766806551,
|
1079 |
+
"sys_len": 1974,
|
1080 |
+
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.2044066388062864,
|
1082 |
+
"score": 0.2044066388062864,
|
1083 |
+
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.17477612690667202,
|
1085 |
+
"score_ci_high": 0.23063630240994007,
|
1086 |
+
"sacrebleu_ci_low": 0.17477612690667202,
|
1087 |
+
"sacrebleu_ci_high": 0.23063630240994007
|
1088 |
+
},
|
1089 |
+
"mt_flores_101_fra_eng": {
|
1090 |
+
"num_of_instances": 66,
|
1091 |
+
"counts": [
|
1092 |
+
1184,
|
1093 |
+
706,
|
1094 |
+
459,
|
1095 |
+
307
|
1096 |
+
],
|
1097 |
+
"totals": [
|
1098 |
+
1741,
|
1099 |
+
1675,
|
1100 |
+
1609,
|
1101 |
+
1543
|
1102 |
+
],
|
1103 |
+
"precisions": [
|
1104 |
+
0.6800689259046525,
|
1105 |
+
0.42149253731343284,
|
1106 |
+
0.2852703542573027,
|
1107 |
+
0.19896305897602073
|
1108 |
+
],
|
1109 |
+
"bp": 1.0,
|
1110 |
+
"sys_len": 1741,
|
1111 |
+
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.35714368713727423,
|
1113 |
+
"score": 0.35714368713727423,
|
1114 |
+
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.3224604238321309,
|
1116 |
+
"score_ci_high": 0.41026435550762275,
|
1117 |
+
"sacrebleu_ci_low": 0.3224604238321309,
|
1118 |
+
"sacrebleu_ci_high": 0.41026435550762275
|
1119 |
+
},
|
1120 |
+
"mt_flores_101_jpn_eng": {
|
1121 |
+
"num_of_instances": 66,
|
1122 |
+
"counts": [
|
1123 |
+
846,
|
1124 |
+
309,
|
1125 |
+
132,
|
1126 |
+
62
|
1127 |
+
],
|
1128 |
+
"totals": [
|
1129 |
+
1698,
|
1130 |
+
1632,
|
1131 |
+
1566,
|
1132 |
+
1500
|
1133 |
+
],
|
1134 |
+
"precisions": [
|
1135 |
+
0.49823321554770317,
|
1136 |
+
0.18933823529411764,
|
1137 |
+
0.0842911877394636,
|
1138 |
+
0.04133333333333334
|
1139 |
+
],
|
1140 |
+
"bp": 0.9790217565823072,
|
1141 |
+
"sys_len": 1698,
|
1142 |
+
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.13181982922133714,
|
1144 |
+
"score": 0.13181982922133714,
|
1145 |
+
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.10251074913175488,
|
1147 |
+
"score_ci_high": 0.18667757600957346,
|
1148 |
+
"sacrebleu_ci_low": 0.10251074913175488,
|
1149 |
+
"sacrebleu_ci_high": 0.18667757600957346
|
1150 |
+
},
|
1151 |
+
"mt_flores_101_kor_eng": {
|
1152 |
+
"num_of_instances": 66,
|
1153 |
+
"counts": [
|
1154 |
+
817,
|
1155 |
+
278,
|
1156 |
+
122,
|
1157 |
+
63
|
1158 |
+
],
|
1159 |
+
"totals": [
|
1160 |
+
1724,
|
1161 |
+
1658,
|
1162 |
+
1592,
|
1163 |
+
1526
|
1164 |
+
],
|
1165 |
+
"precisions": [
|
1166 |
+
0.4738979118329466,
|
1167 |
+
0.16767189384800965,
|
1168 |
+
0.07663316582914573,
|
1169 |
+
0.041284403669724766
|
1170 |
+
],
|
1171 |
+
"bp": 0.9942163261750401,
|
1172 |
+
"sys_len": 1724,
|
1173 |
+
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.12518948488181658,
|
1175 |
+
"score": 0.12518948488181658,
|
1176 |
+
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.10085361909517791,
|
1178 |
+
"score_ci_high": 0.1695266774061832,
|
1179 |
+
"sacrebleu_ci_low": 0.10085361909517791,
|
1180 |
+
"sacrebleu_ci_high": 0.1695266774061832
|
1181 |
+
},
|
1182 |
+
"mt_flores_101_por_eng": {
|
1183 |
+
"num_of_instances": 66,
|
1184 |
+
"counts": [
|
1185 |
+
1164,
|
1186 |
+
692,
|
1187 |
+
452,
|
1188 |
+
295
|
1189 |
+
],
|
1190 |
+
"totals": [
|
1191 |
+
1737,
|
1192 |
+
1671,
|
1193 |
+
1605,
|
1194 |
+
1539
|
1195 |
+
],
|
1196 |
+
"precisions": [
|
1197 |
+
0.6701208981001727,
|
1198 |
+
0.41412327947336924,
|
1199 |
+
0.28161993769470406,
|
1200 |
+
0.19168291098115658
|
1201 |
+
],
|
1202 |
+
"bp": 1.0,
|
1203 |
+
"sys_len": 1737,
|
1204 |
+
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.3498504204327118,
|
1206 |
+
"score": 0.3498504204327118,
|
1207 |
+
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.3154848870877421,
|
1209 |
+
"score_ci_high": 0.42165027559439294,
|
1210 |
+
"sacrebleu_ci_low": 0.3154848870877421,
|
1211 |
+
"sacrebleu_ci_high": 0.42165027559439294
|
1212 |
+
},
|
1213 |
+
"mt_flores_101_ron_eng": {
|
1214 |
+
"num_of_instances": 66,
|
1215 |
+
"counts": [
|
1216 |
+
1200,
|
1217 |
+
704,
|
1218 |
+
458,
|
1219 |
+
314
|
1220 |
+
],
|
1221 |
+
"totals": [
|
1222 |
+
1781,
|
1223 |
+
1715,
|
1224 |
+
1649,
|
1225 |
+
1583
|
1226 |
+
],
|
1227 |
+
"precisions": [
|
1228 |
+
0.673778775968557,
|
1229 |
+
0.41049562682215746,
|
1230 |
+
0.2777440873256519,
|
1231 |
+
0.1983575489576753
|
1232 |
+
],
|
1233 |
+
"bp": 1.0,
|
1234 |
+
"sys_len": 1781,
|
1235 |
+
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.3513418261131799,
|
1237 |
+
"score": 0.3513418261131799,
|
1238 |
+
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.3037866384367266,
|
1240 |
+
"score_ci_high": 0.39510754604511705,
|
1241 |
+
"sacrebleu_ci_low": 0.3037866384367266,
|
1242 |
+
"sacrebleu_ci_high": 0.39510754604511705
|
1243 |
+
},
|
1244 |
+
"mt_flores_101_spa_eng": {
|
1245 |
+
"num_of_instances": 66,
|
1246 |
+
"counts": [
|
1247 |
+
1072,
|
1248 |
+
530,
|
1249 |
+
303,
|
1250 |
+
175
|
1251 |
+
],
|
1252 |
+
"totals": [
|
1253 |
+
1809,
|
1254 |
+
1743,
|
1255 |
+
1677,
|
1256 |
+
1611
|
1257 |
+
],
|
1258 |
+
"precisions": [
|
1259 |
+
0.5925925925925926,
|
1260 |
+
0.30407343660355707,
|
1261 |
+
0.18067978533094814,
|
1262 |
+
0.10862818125387959
|
1263 |
+
],
|
1264 |
+
"bp": 1.0,
|
1265 |
+
"sys_len": 1809,
|
1266 |
+
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.24386343888275555,
|
1268 |
+
"score": 0.24386343888275555,
|
1269 |
+
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.2168231022955434,
|
1271 |
+
"score_ci_high": 0.2798922379137682,
|
1272 |
+
"sacrebleu_ci_low": 0.2168231022955434,
|
1273 |
+
"sacrebleu_ci_high": 0.2798922379137682
|
1274 |
+
},
|
1275 |
+
"score": 0.22997684790035774,
|
1276 |
+
"score_name": "subsets_mean",
|
1277 |
+
"num_of_instances": 990
|
1278 |
+
},
|
1279 |
+
"score": 0.3447885069799008,
|
1280 |
+
"score_name": "subsets_mean",
|
1281 |
+
"num_of_instances": 12472
|
1282 |
+
}
|
1283 |
+
}
|
results/bluebench/2025-06-19T18-10-05_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
1 |
+
{
|
2 |
+
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-19T22:09:59.730715Z",
|
4 |
+
"command_line_invocation": [
|
5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
+
"--tasks",
|
7 |
+
"benchmarks.bluebench",
|
8 |
+
"--model",
|
9 |
+
"cross_provider",
|
10 |
+
"--model_args",
|
11 |
+
"model_name=watsonx/meta-llama/llama-3-2-3b-instruct,max_tokens=256",
|
12 |
+
"--output_path",
|
13 |
+
"./results/bluebench",
|
14 |
+
"--log_samples",
|
15 |
+
"--trust_remote_code",
|
16 |
+
"--batch_size",
|
17 |
+
"8",
|
18 |
+
"--verbosity",
|
19 |
+
"ERROR"
|
20 |
+
],
|
21 |
+
"parsed_arguments": {
|
22 |
+
"tasks": [
|
23 |
+
"benchmarks.bluebench"
|
24 |
+
],
|
25 |
+
"split": "test",
|
26 |
+
"num_fewshots": null,
|
27 |
+
"limit": null,
|
28 |
+
"batch_size": 8,
|
29 |
+
"model": "watsonx/meta-llama/llama-3-2-3b-instruct",
|
30 |
+
"model_args": {
|
31 |
+
"max_tokens": 256
|
32 |
+
},
|
33 |
+
"gen_kwargs": null,
|
34 |
+
"chat_template_kwargs": null,
|
35 |
+
"output_path": "./results/bluebench",
|
36 |
+
"output_file_prefix": "evaluation_results",
|
37 |
+
"log_samples": true,
|
38 |
+
"verbosity": "ERROR",
|
39 |
+
"apply_chat_template": false,
|
40 |
+
"trust_remote_code": true,
|
41 |
+
"disable_hf_cache": false,
|
42 |
+
"cache_dir": null
|
43 |
+
},
|
44 |
+
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
|
46 |
+
"python_version": "3.10.18",
|
47 |
+
"system": "Linux",
|
48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
49 |
+
"installed_packages": {
|
50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
51 |
+
"triton": "3.3.1",
|
52 |
+
"nltk": "3.9.1",
|
53 |
+
"anyio": "4.9.0",
|
54 |
+
"absl-py": "2.3.0",
|
55 |
+
"tiktoken": "0.9.0",
|
56 |
+
"charset-normalizer": "3.4.2",
|
57 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
58 |
+
"sympy": "1.14.0",
|
59 |
+
"mecab-ko": "1.0.1",
|
60 |
+
"litellm": "1.72.6.post1",
|
61 |
+
"httpcore": "1.0.9",
|
62 |
+
"Jinja2": "3.1.6",
|
63 |
+
"jsonschema-specifications": "2025.4.1",
|
64 |
+
"pydantic_core": "2.33.2",
|
65 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
66 |
+
"yarl": "1.20.1",
|
67 |
+
"openai": "1.88.0",
|
68 |
+
"portalocker": "3.2.0",
|
69 |
+
"pandas": "2.3.0",
|
70 |
+
"multiprocess": "0.70.16",
|
71 |
+
"jsonschema": "4.24.0",
|
72 |
+
"unitxt": "1.24.0",
|
73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
75 |
+
"pydantic": "2.11.7",
|
76 |
+
"async-timeout": "5.0.1",
|
77 |
+
"annotated-types": "0.7.0",
|
78 |
+
"rouge_score": "0.1.2",
|
79 |
+
"contourpy": "1.3.2",
|
80 |
+
"aiosignal": "1.3.2",
|
81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
82 |
+
"pillow": "11.2.1",
|
83 |
+
"six": "1.17.0",
|
84 |
+
"diskcache": "5.6.3",
|
85 |
+
"tqdm": "4.67.1",
|
86 |
+
"pyarrow": "20.0.0",
|
87 |
+
"h11": "0.16.0",
|
88 |
+
"zipp": "3.19.2",
|
89 |
+
"tzdata": "2025.2",
|
90 |
+
"bert-score": "0.3.13",
|
91 |
+
"setuptools": "80.9.0",
|
92 |
+
"referencing": "0.36.2",
|
93 |
+
"sacrebleu": "2.5.1",
|
94 |
+
"filelock": "3.18.0",
|
95 |
+
"urllib3": "2.5.0",
|
96 |
+
"scipy": "1.15.3",
|
97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
98 |
+
"kiwisolver": "1.4.8",
|
99 |
+
"networkx": "3.4.2",
|
100 |
+
"typing-inspection": "0.4.1",
|
101 |
+
"lxml": "5.4.0",
|
102 |
+
"sniffio": "1.3.1",
|
103 |
+
"scikit-learn": "1.7.0",
|
104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
105 |
+
"pip": "25.1.1",
|
106 |
+
"fonttools": "4.58.4",
|
107 |
+
"transformers": "4.52.4",
|
108 |
+
"datasets": "3.6.0",
|
109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
110 |
+
"cycler": "0.12.1",
|
111 |
+
"evaluate": "0.4.3",
|
112 |
+
"distro": "1.9.0",
|
113 |
+
"idna": "3.10",
|
114 |
+
"MarkupSafe": "3.0.2",
|
115 |
+
"frozenlist": "1.7.0",
|
116 |
+
"pyparsing": "3.2.3",
|
117 |
+
"jiter": "0.10.0",
|
118 |
+
"importlib_metadata": "8.0.0",
|
119 |
+
"packaging": "24.2",
|
120 |
+
"psutil": "7.0.0",
|
121 |
+
"mecab-ko-dic": "1.0.0",
|
122 |
+
"joblib": "1.5.1",
|
123 |
+
"fsspec": "2025.3.0",
|
124 |
+
"dill": "0.3.8",
|
125 |
+
"tokenizers": "0.21.1",
|
126 |
+
"wheel": "0.45.1",
|
127 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
128 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
129 |
+
"hf-xet": "1.1.4",
|
130 |
+
"propcache": "0.3.2",
|
131 |
+
"numpy": "2.2.6",
|
132 |
+
"mpmath": "1.3.0",
|
133 |
+
"multidict": "6.5.0",
|
134 |
+
"conllu": "6.0.0",
|
135 |
+
"safetensors": "0.5.3",
|
136 |
+
"requests": "2.32.4",
|
137 |
+
"regex": "2024.11.6",
|
138 |
+
"aiohttp": "3.12.13",
|
139 |
+
"tabulate": "0.9.0",
|
140 |
+
"certifi": "2025.6.15",
|
141 |
+
"accelerate": "1.8.0",
|
142 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
143 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
144 |
+
"click": "8.2.1",
|
145 |
+
"typing_extensions": "4.12.2",
|
146 |
+
"attrs": "25.3.0",
|
147 |
+
"exceptiongroup": "1.3.0",
|
148 |
+
"tenacity": "9.1.2",
|
149 |
+
"pytz": "2025.2",
|
150 |
+
"aiohappyeyeballs": "2.6.1",
|
151 |
+
"python-dateutil": "2.9.0.post0",
|
152 |
+
"torch": "2.7.1",
|
153 |
+
"python-dotenv": "1.1.0",
|
154 |
+
"httpx": "0.28.1",
|
155 |
+
"matplotlib": "3.10.3",
|
156 |
+
"xxhash": "3.5.0",
|
157 |
+
"PyYAML": "6.0.2",
|
158 |
+
"huggingface-hub": "0.33.0",
|
159 |
+
"colorama": "0.4.6",
|
160 |
+
"rpds-py": "0.25.1",
|
161 |
+
"threadpoolctl": "3.6.0",
|
162 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
163 |
+
"jaraco.collections": "5.1.0",
|
164 |
+
"tomli": "2.0.1",
|
165 |
+
"backports.tarfile": "1.2.0",
|
166 |
+
"jaraco.context": "5.3.0",
|
167 |
+
"typeguard": "4.3.0",
|
168 |
+
"autocommand": "2.2.2",
|
169 |
+
"jaraco.text": "3.12.1",
|
170 |
+
"more-itertools": "10.3.0",
|
171 |
+
"platformdirs": "4.2.2",
|
172 |
+
"inflect": "7.3.1",
|
173 |
+
"jaraco.functools": "4.0.1"
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"results": {
|
177 |
+
"bias": {
|
178 |
+
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.5666666666666667,
|
180 |
+
"accuracy_ci_low": 0.4666666666666667,
|
181 |
+
"accuracy_ci_high": 0.6777777777777778,
|
182 |
+
"score_name": "accuracy",
|
183 |
+
"score": 0.5666666666666667,
|
184 |
+
"score_ci_high": 0.6777777777777778,
|
185 |
+
"score_ci_low": 0.4666666666666667,
|
186 |
+
"num_of_instances": 90
|
187 |
+
},
|
188 |
+
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.7333333333333333,
|
190 |
+
"accuracy_ci_low": 0.6333333333333333,
|
191 |
+
"accuracy_ci_high": 0.8222222222222222,
|
192 |
+
"score_name": "accuracy",
|
193 |
+
"score": 0.7333333333333333,
|
194 |
+
"score_ci_high": 0.8222222222222222,
|
195 |
+
"score_ci_low": 0.6333333333333333,
|
196 |
+
"num_of_instances": 90
|
197 |
+
},
|
198 |
+
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 0.8222222222222222,
|
200 |
+
"accuracy_ci_low": 0.7444444444444445,
|
201 |
+
"accuracy_ci_high": 0.9,
|
202 |
+
"score_name": "accuracy",
|
203 |
+
"score": 0.8222222222222222,
|
204 |
+
"score_ci_high": 0.9,
|
205 |
+
"score_ci_low": 0.7444444444444445,
|
206 |
+
"num_of_instances": 90
|
207 |
+
},
|
208 |
+
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 0.6444444444444445,
|
210 |
+
"accuracy_ci_low": 0.5333333333333333,
|
211 |
+
"accuracy_ci_high": 0.7333333333333333,
|
212 |
+
"score_name": "accuracy",
|
213 |
+
"score": 0.6444444444444445,
|
214 |
+
"score_ci_high": 0.7333333333333333,
|
215 |
+
"score_ci_low": 0.5333333333333333,
|
216 |
+
"num_of_instances": 90
|
217 |
+
},
|
218 |
+
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.6888888888888889,
|
220 |
+
"accuracy_ci_low": 0.5888888888888889,
|
221 |
+
"accuracy_ci_high": 0.7777777777777778,
|
222 |
+
"score_name": "accuracy",
|
223 |
+
"score": 0.6888888888888889,
|
224 |
+
"score_ci_high": 0.7777777777777778,
|
225 |
+
"score_ci_low": 0.5888888888888889,
|
226 |
+
"num_of_instances": 90
|
227 |
+
},
|
228 |
+
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.8111111111111111,
|
230 |
+
"accuracy_ci_low": 0.7222222222222222,
|
231 |
+
"accuracy_ci_high": 0.8777777777777778,
|
232 |
+
"score_name": "accuracy",
|
233 |
+
"score": 0.8111111111111111,
|
234 |
+
"score_ci_high": 0.8777777777777778,
|
235 |
+
"score_ci_low": 0.7222222222222222,
|
236 |
+
"num_of_instances": 90
|
237 |
+
},
|
238 |
+
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 0.7222222222222222,
|
240 |
+
"accuracy_ci_low": 0.6111111111111112,
|
241 |
+
"accuracy_ci_high": 0.8,
|
242 |
+
"score_name": "accuracy",
|
243 |
+
"score": 0.7222222222222222,
|
244 |
+
"score_ci_high": 0.8,
|
245 |
+
"score_ci_low": 0.6111111111111112,
|
246 |
+
"num_of_instances": 90
|
247 |
+
},
|
248 |
+
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 0.7111111111111111,
|
250 |
+
"accuracy_ci_low": 0.6111111111111112,
|
251 |
+
"accuracy_ci_high": 0.8,
|
252 |
+
"score_name": "accuracy",
|
253 |
+
"score": 0.7111111111111111,
|
254 |
+
"score_ci_high": 0.8,
|
255 |
+
"score_ci_low": 0.6111111111111112,
|
256 |
+
"num_of_instances": 90
|
257 |
+
},
|
258 |
+
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.7111111111111111,
|
260 |
+
"accuracy_ci_low": 0.6111111111111112,
|
261 |
+
"accuracy_ci_high": 0.8,
|
262 |
+
"score_name": "accuracy",
|
263 |
+
"score": 0.7111111111111111,
|
264 |
+
"score_ci_high": 0.8,
|
265 |
+
"score_ci_low": 0.6111111111111112,
|
266 |
+
"num_of_instances": 90
|
267 |
+
},
|
268 |
+
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.7666666666666667,
|
270 |
+
"accuracy_ci_low": 0.6666666666666666,
|
271 |
+
"accuracy_ci_high": 0.8444444444444444,
|
272 |
+
"score_name": "accuracy",
|
273 |
+
"score": 0.7666666666666667,
|
274 |
+
"score_ci_high": 0.8444444444444444,
|
275 |
+
"score_ci_low": 0.6666666666666666,
|
276 |
+
"num_of_instances": 90
|
277 |
+
},
|
278 |
+
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.8,
|
280 |
+
"accuracy_ci_low": 0.7111111111111111,
|
281 |
+
"accuracy_ci_high": 0.8777777777777778,
|
282 |
+
"score_name": "accuracy",
|
283 |
+
"score": 0.8,
|
284 |
+
"score_ci_high": 0.8777777777777778,
|
285 |
+
"score_ci_low": 0.7111111111111111,
|
286 |
+
"num_of_instances": 90
|
287 |
+
},
|
288 |
+
"score": 0.7252525252525253,
|
289 |
+
"score_name": "subsets_mean",
|
290 |
+
"num_of_instances": 990
|
291 |
+
},
|
292 |
+
"chatbot_abilities": {
|
293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
+
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.5,
|
296 |
+
"score": 0.5,
|
297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
+
},
|
299 |
+
"score": 0.5,
|
300 |
+
"score_name": "subsets_mean",
|
301 |
+
"num_of_instances": 500
|
302 |
+
},
|
303 |
+
"entity_extraction": {
|
304 |
+
"universal_ner_en_ewt": {
|
305 |
+
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.5026737967914439,
|
307 |
+
"f1_Organization": 0.2875,
|
308 |
+
"f1_Location": 0.28571428571428575,
|
309 |
+
"f1_macro": 0.35862936083524316,
|
310 |
+
"recall_macro": 0.3171773628661296,
|
311 |
+
"precision_macro": 0.4188335014421971,
|
312 |
+
"in_classes_support": 0.7664783427495292,
|
313 |
+
"f1_micro": 0.32954545454545453,
|
314 |
+
"recall_micro": 0.3314285714285714,
|
315 |
+
"precision_micro": 0.327683615819209,
|
316 |
+
"score": 0.32954545454545453,
|
317 |
+
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.28111538926666857,
|
319 |
+
"score_ci_high": 0.37977770945501865,
|
320 |
+
"f1_micro_ci_low": 0.28111538926666857,
|
321 |
+
"f1_micro_ci_high": 0.37977770945501865
|
322 |
+
},
|
323 |
+
"score": 0.32954545454545453,
|
324 |
+
"score_name": "subsets_mean",
|
325 |
+
"num_of_instances": 1000
|
326 |
+
},
|
327 |
+
"knowledge": {
|
328 |
+
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.4084507042253521,
|
330 |
+
"accuracy_ci_low": 0.30985915492957744,
|
331 |
+
"accuracy_ci_high": 0.5211267605633803,
|
332 |
+
"score_name": "accuracy",
|
333 |
+
"score": 0.4084507042253521,
|
334 |
+
"score_ci_high": 0.5211267605633803,
|
335 |
+
"score_ci_low": 0.30985915492957744,
|
336 |
+
"num_of_instances": 71
|
337 |
+
},
|
338 |
+
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.2535211267605634,
|
340 |
+
"accuracy_ci_low": 0.15492957746478872,
|
341 |
+
"accuracy_ci_high": 0.36619718309859156,
|
342 |
+
"score_name": "accuracy",
|
343 |
+
"score": 0.2535211267605634,
|
344 |
+
"score_ci_high": 0.36619718309859156,
|
345 |
+
"score_ci_low": 0.15492957746478872,
|
346 |
+
"num_of_instances": 71
|
347 |
+
},
|
348 |
+
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.19718309859154928,
|
350 |
+
"accuracy_ci_low": 0.11267605633802817,
|
351 |
+
"accuracy_ci_high": 0.29577464788732394,
|
352 |
+
"score_name": "accuracy",
|
353 |
+
"score": 0.19718309859154928,
|
354 |
+
"score_ci_high": 0.29577464788732394,
|
355 |
+
"score_ci_low": 0.11267605633802817,
|
356 |
+
"num_of_instances": 71
|
357 |
+
},
|
358 |
+
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.29577464788732394,
|
360 |
+
"accuracy_ci_low": 0.19718309859154928,
|
361 |
+
"accuracy_ci_high": 0.39436619718309857,
|
362 |
+
"score_name": "accuracy",
|
363 |
+
"score": 0.29577464788732394,
|
364 |
+
"score_ci_high": 0.39436619718309857,
|
365 |
+
"score_ci_low": 0.19718309859154928,
|
366 |
+
"num_of_instances": 71
|
367 |
+
},
|
368 |
+
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.4788732394366197,
|
370 |
+
"accuracy_ci_low": 0.36619718309859156,
|
371 |
+
"accuracy_ci_high": 0.5915492957746479,
|
372 |
+
"score_name": "accuracy",
|
373 |
+
"score": 0.4788732394366197,
|
374 |
+
"score_ci_high": 0.5915492957746479,
|
375 |
+
"score_ci_low": 0.36619718309859156,
|
376 |
+
"num_of_instances": 71
|
377 |
+
},
|
378 |
+
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.19718309859154928,
|
380 |
+
"accuracy_ci_low": 0.11267605633802817,
|
381 |
+
"accuracy_ci_high": 0.29577464788732394,
|
382 |
+
"score_name": "accuracy",
|
383 |
+
"score": 0.19718309859154928,
|
384 |
+
"score_ci_high": 0.29577464788732394,
|
385 |
+
"score_ci_low": 0.11267605633802817,
|
386 |
+
"num_of_instances": 71
|
387 |
+
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.29577464788732394,
|
390 |
+
"accuracy_ci_low": 0.19718309859154928,
|
391 |
+
"accuracy_ci_high": 0.4225352112676056,
|
392 |
+
"score_name": "accuracy",
|
393 |
+
"score": 0.29577464788732394,
|
394 |
+
"score_ci_high": 0.4225352112676056,
|
395 |
+
"score_ci_low": 0.19718309859154928,
|
396 |
+
"num_of_instances": 71
|
397 |
+
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.30985915492957744,
|
400 |
+
"accuracy_ci_low": 0.2112676056338028,
|
401 |
+
"accuracy_ci_high": 0.4225352112676056,
|
402 |
+
"score_name": "accuracy",
|
403 |
+
"score": 0.30985915492957744,
|
404 |
+
"score_ci_high": 0.4225352112676056,
|
405 |
+
"score_ci_low": 0.2112676056338028,
|
406 |
+
"num_of_instances": 71
|
407 |
+
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.22535211267605634,
|
410 |
+
"accuracy_ci_low": 0.14084507042253522,
|
411 |
+
"accuracy_ci_high": 0.323943661971831,
|
412 |
+
"score_name": "accuracy",
|
413 |
+
"score": 0.22535211267605634,
|
414 |
+
"score_ci_high": 0.323943661971831,
|
415 |
+
"score_ci_low": 0.14084507042253522,
|
416 |
+
"num_of_instances": 71
|
417 |
+
},
|
418 |
+
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.11267605633802817,
|
420 |
+
"accuracy_ci_low": 0.056338028169014086,
|
421 |
+
"accuracy_ci_high": 0.19718309859154928,
|
422 |
+
"score_name": "accuracy",
|
423 |
+
"score": 0.11267605633802817,
|
424 |
+
"score_ci_high": 0.19718309859154928,
|
425 |
+
"score_ci_low": 0.056338028169014086,
|
426 |
+
"num_of_instances": 71
|
427 |
+
},
|
428 |
+
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.23943661971830985,
|
430 |
+
"accuracy_ci_low": 0.15492957746478872,
|
431 |
+
"accuracy_ci_high": 0.352112676056338,
|
432 |
+
"score_name": "accuracy",
|
433 |
+
"score": 0.23943661971830985,
|
434 |
+
"score_ci_high": 0.352112676056338,
|
435 |
+
"score_ci_low": 0.15492957746478872,
|
436 |
+
"num_of_instances": 71
|
437 |
+
},
|
438 |
+
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.2676056338028169,
|
440 |
+
"accuracy_ci_low": 0.16901408450704225,
|
441 |
+
"accuracy_ci_high": 0.38028169014084506,
|
442 |
+
"score_name": "accuracy",
|
443 |
+
"score": 0.2676056338028169,
|
444 |
+
"score_ci_high": 0.38028169014084506,
|
445 |
+
"score_ci_low": 0.16901408450704225,
|
446 |
+
"num_of_instances": 71
|
447 |
+
},
|
448 |
+
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.15492957746478872,
|
450 |
+
"accuracy_ci_low": 0.08450704225352113,
|
451 |
+
"accuracy_ci_high": 0.2535211267605634,
|
452 |
+
"score_name": "accuracy",
|
453 |
+
"score": 0.15492957746478872,
|
454 |
+
"score_ci_high": 0.2535211267605634,
|
455 |
+
"score_ci_low": 0.08450704225352113,
|
456 |
+
"num_of_instances": 71
|
457 |
+
},
|
458 |
+
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.49295774647887325,
|
460 |
+
"accuracy_ci_low": 0.38028169014084506,
|
461 |
+
"accuracy_ci_high": 0.6197183098591549,
|
462 |
+
"score_name": "accuracy",
|
463 |
+
"score": 0.49295774647887325,
|
464 |
+
"score_ci_high": 0.6197183098591549,
|
465 |
+
"score_ci_low": 0.38028169014084506,
|
466 |
+
"num_of_instances": 71
|
467 |
+
},
|
468 |
+
"score": 0.2806841046277666,
|
469 |
+
"score_name": "subsets_mean",
|
470 |
+
"num_of_instances": 994
|
471 |
+
},
|
472 |
+
"legal": {
|
473 |
+
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.322066985645933,
|
475 |
+
"f1_suggestive": 0.2,
|
476 |
+
"f1_descriptive": 0.4,
|
477 |
+
"f1_generic": 0.3157894736842105,
|
478 |
+
"f1_arbitrary": 0.45454545454545453,
|
479 |
+
"f1_fanciful": 0.24,
|
480 |
+
"f1_macro_ci_low": 0.2326130794928424,
|
481 |
+
"f1_macro_ci_high": 0.4327896628836512,
|
482 |
+
"score_name": "f1_micro",
|
483 |
+
"score": 0.34523809523809523,
|
484 |
+
"score_ci_high": 0.44408416032543374,
|
485 |
+
"score_ci_low": 0.25,
|
486 |
+
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.3411764705882353,
|
488 |
+
"accuracy_ci_low": 0.24705882352941178,
|
489 |
+
"accuracy_ci_high": 0.43529411764705883,
|
490 |
+
"f1_micro": 0.34523809523809523,
|
491 |
+
"f1_micro_ci_low": 0.25,
|
492 |
+
"f1_micro_ci_high": 0.44408416032543374
|
493 |
+
},
|
494 |
+
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.4862053369516056,
|
496 |
+
"f1_no": 0.48484848484848486,
|
497 |
+
"f1_yes": 0.48756218905472637,
|
498 |
+
"f1_macro_ci_low": 0.4185718876526183,
|
499 |
+
"f1_macro_ci_high": 0.5583302726222448,
|
500 |
+
"score_name": "f1_micro",
|
501 |
+
"score": 0.48621553884711777,
|
502 |
+
"score_ci_high": 0.555,
|
503 |
+
"score_ci_low": 0.41708542713567837,
|
504 |
+
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.485,
|
506 |
+
"accuracy_ci_low": 0.415,
|
507 |
+
"accuracy_ci_high": 0.555,
|
508 |
+
"f1_micro": 0.48621553884711777,
|
509 |
+
"f1_micro_ci_low": 0.41708542713567837,
|
510 |
+
"f1_micro_ci_high": 0.555
|
511 |
+
},
|
512 |
+
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.21190211834221687,
|
514 |
+
"f1_conclusion": 0.14634146341463414,
|
515 |
+
"f1_analysis": 0.3673469387755102,
|
516 |
+
"f1_decree": 0.07692307692307693,
|
517 |
+
"f1_issue": 0.2631578947368421,
|
518 |
+
"f1_facts": 0.13333333333333333,
|
519 |
+
"f1_procedural history": 0.12121212121212122,
|
520 |
+
"f1_rule": 0.375,
|
521 |
+
"f1_macro_ci_low": 0.16201147151923767,
|
522 |
+
"f1_macro_ci_high": 0.27835110124455087,
|
523 |
+
"score_name": "f1_micro",
|
524 |
+
"score": 0.24427480916030533,
|
525 |
+
"score_ci_high": 0.3110332844595539,
|
526 |
+
"score_ci_low": 0.18933051276149385,
|
527 |
+
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.24,
|
529 |
+
"accuracy_ci_low": 0.185,
|
530 |
+
"accuracy_ci_high": 0.305,
|
531 |
+
"f1_micro": 0.24427480916030533,
|
532 |
+
"f1_micro_ci_low": 0.18933051276149385,
|
533 |
+
"f1_micro_ci_high": 0.3110332844595539
|
534 |
+
},
|
535 |
+
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.5140298832430668,
|
537 |
+
"f1_yes": 0.541871921182266,
|
538 |
+
"f1_no": 0.4861878453038674,
|
539 |
+
"f1_macro_ci_low": 0.44358226014090585,
|
540 |
+
"f1_macro_ci_high": 0.5836656602180865,
|
541 |
+
"score_name": "f1_micro",
|
542 |
+
"score": 0.515625,
|
543 |
+
"score_ci_high": 0.583858269920324,
|
544 |
+
"score_ci_low": 0.4443197729294639,
|
545 |
+
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.495,
|
547 |
+
"accuracy_ci_low": 0.43,
|
548 |
+
"accuracy_ci_high": 0.565,
|
549 |
+
"f1_micro": 0.515625,
|
550 |
+
"f1_micro_ci_low": 0.4443197729294639,
|
551 |
+
"f1_micro_ci_high": 0.583858269920324
|
552 |
+
},
|
553 |
+
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.726027397260274,
|
555 |
+
"f1_yes": 0.7123287671232876,
|
556 |
+
"f1_no": 0.7397260273972602,
|
557 |
+
"f1_macro_ci_low": 0.6205412546681461,
|
558 |
+
"f1_macro_ci_high": 0.8095667611328509,
|
559 |
+
"score_name": "f1_micro",
|
560 |
+
"score": 0.726027397260274,
|
561 |
+
"score_ci_high": 0.8079470198675497,
|
562 |
+
"score_ci_low": 0.6153846153846154,
|
563 |
+
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.6235294117647059,
|
565 |
+
"accuracy_ci_low": 0.5058823529411764,
|
566 |
+
"accuracy_ci_high": 0.7176470588235294,
|
567 |
+
"f1_micro": 0.726027397260274,
|
568 |
+
"f1_micro_ci_low": 0.6153846153846154,
|
569 |
+
"f1_micro_ci_high": 0.8079470198675497
|
570 |
+
},
|
571 |
+
"score": 0.46347616810115844,
|
572 |
+
"score_name": "subsets_mean",
|
573 |
+
"num_of_instances": 770
|
574 |
+
},
|
575 |
+
"news_classification": {
|
576 |
+
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.3839629098938318,
|
578 |
+
"f1_cars": 0.6804123711340206,
|
579 |
+
"f1_windows x": 0.03125,
|
580 |
+
"f1_atheism": 0.48,
|
581 |
+
"f1_christianity": 0.425,
|
582 |
+
"f1_religion": 0.18461538461538463,
|
583 |
+
"f1_medicine": 0.6376811594202898,
|
584 |
+
"f1_computer graphics": 0.27979274611398963,
|
585 |
+
"f1_microsoft windows": 0.35294117647058826,
|
586 |
+
"f1_middle east": 0.12244897959183673,
|
587 |
+
"f1_politics": 0.26666666666666666,
|
588 |
+
"f1_motorcycles": 0.47619047619047616,
|
589 |
+
"f1_mac hardware": 0.14492753623188406,
|
590 |
+
"f1_pc hardware": 0.358974358974359,
|
591 |
+
"f1_for sale": 0.3018867924528302,
|
592 |
+
"f1_guns": 0.2,
|
593 |
+
"f1_baseball": 0.8130081300813008,
|
594 |
+
"f1_space": 0.5194805194805194,
|
595 |
+
"f1_cryptography": 0.3466666666666667,
|
596 |
+
"f1_electronics": 0.41025641025641024,
|
597 |
+
"f1_hockey": 0.6470588235294118,
|
598 |
+
"f1_macro_ci_low": 0.35856171946837523,
|
599 |
+
"f1_macro_ci_high": 0.41714627316225344,
|
600 |
+
"score_name": "f1_micro",
|
601 |
+
"score": 0.4028352037802717,
|
602 |
+
"score_ci_high": 0.433682467300905,
|
603 |
+
"score_ci_low": 0.3727273654547539,
|
604 |
+
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.341,
|
606 |
+
"accuracy_ci_low": 0.312,
|
607 |
+
"accuracy_ci_high": 0.369,
|
608 |
+
"f1_micro": 0.4028352037802717,
|
609 |
+
"f1_micro_ci_low": 0.3727273654547539,
|
610 |
+
"f1_micro_ci_high": 0.433682467300905
|
611 |
+
},
|
612 |
+
"score": 0.4028352037802717,
|
613 |
+
"score_name": "subsets_mean",
|
614 |
+
"num_of_instances": 1000
|
615 |
+
},
|
616 |
+
"product_help": {
|
617 |
+
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.522130366066412,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9174311926605505,
|
620 |
+
"f1_credit card or prepaid card": 0.2619047619047619,
|
621 |
+
"f1_debt collection": 0.47904191616766467,
|
622 |
+
"f1_checking or savings account": 0.6534653465346535,
|
623 |
+
"f1_money transfer or virtual currency or money service": 0.56,
|
624 |
+
"f1_vehicle loan or lease": 0.23076923076923078,
|
625 |
+
"f1_mortgage": 0.7037037037037037,
|
626 |
+
"f1_payday loan or title loan or personal loan": 0.14285714285714285,
|
627 |
+
"f1_student loan": 0.75,
|
628 |
+
"f1_macro_ci_low": 0.47475792993209676,
|
629 |
+
"f1_macro_ci_high": 0.5842900552170582,
|
630 |
+
"score_name": "f1_micro",
|
631 |
+
"score": 0.8006150691952845,
|
632 |
+
"score_ci_high": 0.8235944353763129,
|
633 |
+
"score_ci_low": 0.7741691905584849,
|
634 |
+
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.781,
|
636 |
+
"accuracy_ci_low": 0.753,
|
637 |
+
"accuracy_ci_high": 0.805,
|
638 |
+
"f1_micro": 0.8006150691952845,
|
639 |
+
"f1_micro_ci_low": 0.7741691905584849,
|
640 |
+
"f1_micro_ci_high": 0.8235944353763129
|
641 |
+
},
|
642 |
+
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.6094652526866444,
|
644 |
+
"f1_mortgages and loans": 0.7821229050279329,
|
645 |
+
"f1_credit card": 0.6666666666666666,
|
646 |
+
"f1_debt collection": 0.5684210526315789,
|
647 |
+
"f1_retail banking": 0.2912621359223301,
|
648 |
+
"f1_credit reporting": 0.7388535031847133,
|
649 |
+
"f1_macro_ci_low": 0.5670634281009035,
|
650 |
+
"f1_macro_ci_high": 0.6527536805294223,
|
651 |
+
"score_name": "f1_micro",
|
652 |
+
"score": 0.6524390243902439,
|
653 |
+
"score_ci_high": 0.6904276985743381,
|
654 |
+
"score_ci_low": 0.6066261962892265,
|
655 |
+
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.642,
|
657 |
+
"accuracy_ci_low": 0.594,
|
658 |
+
"accuracy_ci_high": 0.68,
|
659 |
+
"f1_micro": 0.6524390243902439,
|
660 |
+
"f1_micro_ci_low": 0.6066261962892265,
|
661 |
+
"f1_micro_ci_high": 0.6904276985743381
|
662 |
+
},
|
663 |
+
"score": 0.7265270467927643,
|
664 |
+
"score_name": "subsets_mean",
|
665 |
+
"num_of_instances": 1500
|
666 |
+
},
|
667 |
+
"qa_finance": {
|
668 |
+
"fin_qa": {
|
669 |
+
"num_of_instances": 1000,
|
670 |
+
"program_accuracy": 0.038,
|
671 |
+
"score": 0.038,
|
672 |
+
"score_name": "program_accuracy",
|
673 |
+
"execution_accuracy": 0.03,
|
674 |
+
"program_accuracy_ci_low": 0.027702359114314717,
|
675 |
+
"program_accuracy_ci_high": 0.05038214389818779,
|
676 |
+
"score_ci_low": 0.027702359114314717,
|
677 |
+
"score_ci_high": 0.05038214389818779,
|
678 |
+
"execution_accuracy_ci_low": 0.021,
|
679 |
+
"execution_accuracy_ci_high": 0.042
|
680 |
+
},
|
681 |
+
"score": 0.038,
|
682 |
+
"score_name": "subsets_mean",
|
683 |
+
"num_of_instances": 1000
|
684 |
+
},
|
685 |
+
"rag_general": {
|
686 |
+
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.3238378212061128,
|
688 |
+
"recall": 0.5112460699696839,
|
689 |
+
"f1": 0.32731883927753985,
|
690 |
+
"precision_ci_low": 0.3032630188662264,
|
691 |
+
"precision_ci_high": 0.34255213352234126,
|
692 |
+
"recall_ci_low": 0.4946874887919962,
|
693 |
+
"recall_ci_high": 0.5283045722843877,
|
694 |
+
"f1_ci_low": 0.3123215475894989,
|
695 |
+
"f1_ci_high": 0.3436240931728296,
|
696 |
+
"score_name": "f1",
|
697 |
+
"score": 0.32731883927753985,
|
698 |
+
"score_ci_high": 0.3436240931728296,
|
699 |
+
"score_ci_low": 0.3123215475894989,
|
700 |
+
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.5872805261611939,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6561146193246047,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5541991981615623,
|
704 |
+
"faithfullness_f1_token_overlap": 0.2552259281757564,
|
705 |
+
"faithfullness_recall_token_overlap": 0.18820051905597926,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5563191178123087,
|
707 |
+
"correctness_f1_token_overlap": 0.32731883927753985,
|
708 |
+
"correctness_recall_token_overlap": 0.5112460699696839,
|
709 |
+
"correctness_precision_token_overlap": 0.3238378212061128
|
710 |
+
},
|
711 |
+
"score": 0.32731883927753985,
|
712 |
+
"score_name": "subsets_mean",
|
713 |
+
"num_of_instances": 600
|
714 |
+
},
|
715 |
+
"reasoning": {
|
716 |
+
"hellaswag": {
|
717 |
+
"accuracy": 0.435,
|
718 |
+
"accuracy_ci_low": 0.403,
|
719 |
+
"accuracy_ci_high": 0.4660514308680489,
|
720 |
+
"score_name": "accuracy",
|
721 |
+
"score": 0.435,
|
722 |
+
"score_ci_high": 0.4660514308680489,
|
723 |
+
"score_ci_low": 0.403,
|
724 |
+
"num_of_instances": 1000
|
725 |
+
},
|
726 |
+
"openbook_qa": {
|
727 |
+
"accuracy": 0.744,
|
728 |
+
"accuracy_ci_low": 0.704,
|
729 |
+
"accuracy_ci_high": 0.782,
|
730 |
+
"score_name": "accuracy",
|
731 |
+
"score": 0.744,
|
732 |
+
"score_ci_high": 0.782,
|
733 |
+
"score_ci_low": 0.704,
|
734 |
+
"num_of_instances": 500
|
735 |
+
},
|
736 |
+
"score": 0.5895,
|
737 |
+
"score_name": "subsets_mean",
|
738 |
+
"num_of_instances": 1500
|
739 |
+
},
|
740 |
+
"safety": {
|
741 |
+
"attaq_500": {
|
742 |
+
"severity_critical": 0.0,
|
743 |
+
"severity_high": 0.0,
|
744 |
+
"severity_medium": 0.0,
|
745 |
+
"severity_low": 100.0,
|
746 |
+
"category_harmful_info": 0.8374238369454232,
|
747 |
+
"category_deception": 0.8230129361453682,
|
748 |
+
"category_explicit_content": 0.8649854577249951,
|
749 |
+
"category_violence": 0.8130838501311484,
|
750 |
+
"category_discrimination": 0.8325134645777916,
|
751 |
+
"category_substance_abuse": 0.9021757920583088,
|
752 |
+
"category_pii": 0.8319368116833545,
|
753 |
+
"safety": 0.8330806816902219,
|
754 |
+
"safety_ci_low": 0.8160996919801022,
|
755 |
+
"safety_ci_high": 0.8485437387510895,
|
756 |
+
"score_name": "safety",
|
757 |
+
"score": 0.8330806816902219,
|
758 |
+
"score_ci_high": 0.8485437387510895,
|
759 |
+
"score_ci_low": 0.8160996919801022,
|
760 |
+
"num_of_instances": 100
|
761 |
+
},
|
762 |
+
"score": 0.8330806816902219,
|
763 |
+
"score_name": "subsets_mean",
|
764 |
+
"num_of_instances": 100
|
765 |
+
},
|
766 |
+
"summarization": {
|
767 |
+
"billsum_document_filtered_to_6000_chars": {
|
768 |
+
"num_of_instances": 528,
|
769 |
+
"rouge2": 0.20526331379083082,
|
770 |
+
"rougeLsum": 0.3552440537707598,
|
771 |
+
"rougeL": 0.2898741985263412,
|
772 |
+
"score": 0.2898741985263412,
|
773 |
+
"score_name": "rougeL",
|
774 |
+
"rouge1": 0.41514442247584377,
|
775 |
+
"rouge2_ci_low": 0.19811392792291177,
|
776 |
+
"rouge2_ci_high": 0.21345928409773357,
|
777 |
+
"rougeLsum_ci_low": 0.34590687620199956,
|
778 |
+
"rougeLsum_ci_high": 0.3636463972631658,
|
779 |
+
"rougeL_ci_low": 0.2825891854116913,
|
780 |
+
"rougeL_ci_high": 0.29754587598623977,
|
781 |
+
"score_ci_low": 0.2825891854116913,
|
782 |
+
"score_ci_high": 0.29754587598623977,
|
783 |
+
"rouge1_ci_low": 0.404637721565675,
|
784 |
+
"rouge1_ci_high": 0.4244973791646393
|
785 |
+
},
|
786 |
+
"tldr_document_filtered_to_6000_chars": {
|
787 |
+
"num_of_instances": 1000,
|
788 |
+
"rouge2": 0.01759477613392835,
|
789 |
+
"rougeLsum": 0.10341637065855062,
|
790 |
+
"rougeL": 0.09030341343927119,
|
791 |
+
"score": 0.09030341343927119,
|
792 |
+
"score_name": "rougeL",
|
793 |
+
"rouge1": 0.12511971491959425,
|
794 |
+
"rouge2_ci_low": 0.0157175663284522,
|
795 |
+
"rouge2_ci_high": 0.01952934654149148,
|
796 |
+
"rougeLsum_ci_low": 0.099113259070034,
|
797 |
+
"rougeLsum_ci_high": 0.10780067044910371,
|
798 |
+
"rougeL_ci_low": 0.08652503643426336,
|
799 |
+
"rougeL_ci_high": 0.09415506056724576,
|
800 |
+
"score_ci_low": 0.08652503643426336,
|
801 |
+
"score_ci_high": 0.09415506056724576,
|
802 |
+
"rouge1_ci_low": 0.11934211041104513,
|
803 |
+
"rouge1_ci_high": 0.1306891027165195
|
804 |
+
},
|
805 |
+
"score": 0.1900888059828062,
|
806 |
+
"score_name": "subsets_mean",
|
807 |
+
"num_of_instances": 1528
|
808 |
+
},
|
809 |
+
"translation": {
|
810 |
+
"mt_flores_101_ara_eng": {
|
811 |
+
"num_of_instances": 66,
|
812 |
+
"counts": [
|
813 |
+
1145,
|
814 |
+
640,
|
815 |
+
404,
|
816 |
+
258
|
817 |
+
],
|
818 |
+
"totals": [
|
819 |
+
1853,
|
820 |
+
1787,
|
821 |
+
1721,
|
822 |
+
1655
|
823 |
+
],
|
824 |
+
"precisions": [
|
825 |
+
0.6179168915272532,
|
826 |
+
0.35814213766088415,
|
827 |
+
0.2347472399767577,
|
828 |
+
0.15589123867069488
|
829 |
+
],
|
830 |
+
"bp": 1.0,
|
831 |
+
"sys_len": 1853,
|
832 |
+
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.2999866463267908,
|
834 |
+
"score": 0.2999866463267908,
|
835 |
+
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.25379441814797343,
|
837 |
+
"score_ci_high": 0.34071904065425035,
|
838 |
+
"sacrebleu_ci_low": 0.25379441814797343,
|
839 |
+
"sacrebleu_ci_high": 0.34071904065425035
|
840 |
+
},
|
841 |
+
"mt_flores_101_deu_eng": {
|
842 |
+
"num_of_instances": 66,
|
843 |
+
"counts": [
|
844 |
+
1252,
|
845 |
+
756,
|
846 |
+
497,
|
847 |
+
327
|
848 |
+
],
|
849 |
+
"totals": [
|
850 |
+
1813,
|
851 |
+
1747,
|
852 |
+
1681,
|
853 |
+
1615
|
854 |
+
],
|
855 |
+
"precisions": [
|
856 |
+
0.6905681191395477,
|
857 |
+
0.4327418431597023,
|
858 |
+
0.2956573468173706,
|
859 |
+
0.20247678018575851
|
860 |
+
],
|
861 |
+
"bp": 1.0,
|
862 |
+
"sys_len": 1813,
|
863 |
+
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.3657209415128666,
|
865 |
+
"score": 0.3657209415128666,
|
866 |
+
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.3364370770791262,
|
868 |
+
"score_ci_high": 0.40451448818750857,
|
869 |
+
"sacrebleu_ci_low": 0.3364370770791262,
|
870 |
+
"sacrebleu_ci_high": 0.40451448818750857
|
871 |
+
},
|
872 |
+
"mt_flores_101_eng_ara": {
|
873 |
+
"num_of_instances": 66,
|
874 |
+
"counts": [
|
875 |
+
613,
|
876 |
+
215,
|
877 |
+
85,
|
878 |
+
31
|
879 |
+
],
|
880 |
+
"totals": [
|
881 |
+
1681,
|
882 |
+
1615,
|
883 |
+
1549,
|
884 |
+
1483
|
885 |
+
],
|
886 |
+
"precisions": [
|
887 |
+
0.36466389054134446,
|
888 |
+
0.1331269349845201,
|
889 |
+
0.05487411233053583,
|
890 |
+
0.020903573836817263
|
891 |
+
],
|
892 |
+
"bp": 1.0,
|
893 |
+
"sys_len": 1681,
|
894 |
+
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.08638467153981859,
|
896 |
+
"score": 0.08638467153981859,
|
897 |
+
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.07076583616200617,
|
899 |
+
"score_ci_high": 0.10779016383654101,
|
900 |
+
"sacrebleu_ci_low": 0.07076583616200617,
|
901 |
+
"sacrebleu_ci_high": 0.10779016383654101
|
902 |
+
},
|
903 |
+
"mt_flores_101_eng_deu": {
|
904 |
+
"num_of_instances": 66,
|
905 |
+
"counts": [
|
906 |
+
1062,
|
907 |
+
552,
|
908 |
+
321,
|
909 |
+
192
|
910 |
+
],
|
911 |
+
"totals": [
|
912 |
+
1791,
|
913 |
+
1725,
|
914 |
+
1659,
|
915 |
+
1593
|
916 |
+
],
|
917 |
+
"precisions": [
|
918 |
+
0.592964824120603,
|
919 |
+
0.32,
|
920 |
+
0.19349005424954793,
|
921 |
+
0.12052730696798493
|
922 |
+
],
|
923 |
+
"bp": 0.9757320386302776,
|
924 |
+
"sys_len": 1791,
|
925 |
+
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.25165833423579964,
|
927 |
+
"score": 0.25165833423579964,
|
928 |
+
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.21420916937636764,
|
930 |
+
"score_ci_high": 0.29216503924411175,
|
931 |
+
"sacrebleu_ci_low": 0.21420916937636764,
|
932 |
+
"sacrebleu_ci_high": 0.29216503924411175
|
933 |
+
},
|
934 |
+
"mt_flores_101_eng_fra": {
|
935 |
+
"num_of_instances": 66,
|
936 |
+
"counts": [
|
937 |
+
1358,
|
938 |
+
892,
|
939 |
+
643,
|
940 |
+
470
|
941 |
+
],
|
942 |
+
"totals": [
|
943 |
+
2016,
|
944 |
+
1950,
|
945 |
+
1884,
|
946 |
+
1818
|
947 |
+
],
|
948 |
+
"precisions": [
|
949 |
+
0.6736111111111112,
|
950 |
+
0.45743589743589746,
|
951 |
+
0.3412951167728238,
|
952 |
+
0.2585258525852585
|
953 |
+
],
|
954 |
+
"bp": 0.9745361636262269,
|
955 |
+
"sys_len": 2016,
|
956 |
+
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.395723047050928,
|
958 |
+
"score": 0.395723047050928,
|
959 |
+
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.3541486960862157,
|
961 |
+
"score_ci_high": 0.4412388057124849,
|
962 |
+
"sacrebleu_ci_low": 0.3541486960862157,
|
963 |
+
"sacrebleu_ci_high": 0.4412388057124849
|
964 |
+
},
|
965 |
+
"mt_flores_101_eng_kor": {
|
966 |
+
"num_of_instances": 66,
|
967 |
+
"counts": [
|
968 |
+
1152,
|
969 |
+
512,
|
970 |
+
255,
|
971 |
+
136
|
972 |
+
],
|
973 |
+
"totals": [
|
974 |
+
2575,
|
975 |
+
2509,
|
976 |
+
2443,
|
977 |
+
2377
|
978 |
+
],
|
979 |
+
"precisions": [
|
980 |
+
0.44737864077669903,
|
981 |
+
0.20406536468712633,
|
982 |
+
0.10437986082685223,
|
983 |
+
0.057214976861590244
|
984 |
+
],
|
985 |
+
"bp": 1.0,
|
986 |
+
"sys_len": 2575,
|
987 |
+
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.152806822981997,
|
989 |
+
"score": 0.152806822981997,
|
990 |
+
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.13380815870985033,
|
992 |
+
"score_ci_high": 0.17371873213476355,
|
993 |
+
"sacrebleu_ci_low": 0.13380815870985033,
|
994 |
+
"sacrebleu_ci_high": 0.17371873213476355
|
995 |
+
},
|
996 |
+
"mt_flores_101_eng_por": {
|
997 |
+
"num_of_instances": 66,
|
998 |
+
"counts": [
|
999 |
+
1330,
|
1000 |
+
860,
|
1001 |
+
596,
|
1002 |
+
421
|
1003 |
+
],
|
1004 |
+
"totals": [
|
1005 |
+
1885,
|
1006 |
+
1819,
|
1007 |
+
1753,
|
1008 |
+
1687
|
1009 |
+
],
|
1010 |
+
"precisions": [
|
1011 |
+
0.7055702917771883,
|
1012 |
+
0.4727872457394172,
|
1013 |
+
0.33998859098687967,
|
1014 |
+
0.24955542382928275
|
1015 |
+
],
|
1016 |
+
"bp": 0.9836888676493653,
|
1017 |
+
"sys_len": 1885,
|
1018 |
+
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.4034754414236742,
|
1020 |
+
"score": 0.4034754414236742,
|
1021 |
+
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.35937407097652574,
|
1023 |
+
"score_ci_high": 0.4406572095477221,
|
1024 |
+
"sacrebleu_ci_low": 0.35937407097652574,
|
1025 |
+
"sacrebleu_ci_high": 0.4406572095477221
|
1026 |
+
},
|
1027 |
+
"mt_flores_101_eng_ron": {
|
1028 |
+
"num_of_instances": 66,
|
1029 |
+
"counts": [
|
1030 |
+
1195,
|
1031 |
+
710,
|
1032 |
+
449,
|
1033 |
+
298
|
1034 |
+
],
|
1035 |
+
"totals": [
|
1036 |
+
1898,
|
1037 |
+
1832,
|
1038 |
+
1766,
|
1039 |
+
1700
|
1040 |
+
],
|
1041 |
+
"precisions": [
|
1042 |
+
0.6296101159114857,
|
1043 |
+
0.3875545851528384,
|
1044 |
+
0.25424688561721404,
|
1045 |
+
0.17529411764705885
|
1046 |
+
],
|
1047 |
+
"bp": 0.9734874071636694,
|
1048 |
+
"sys_len": 1898,
|
1049 |
+
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.3143672009255487,
|
1051 |
+
"score": 0.3143672009255487,
|
1052 |
+
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.28824612092656865,
|
1054 |
+
"score_ci_high": 0.3595680168847451,
|
1055 |
+
"sacrebleu_ci_low": 0.28824612092656865,
|
1056 |
+
"sacrebleu_ci_high": 0.3595680168847451
|
1057 |
+
},
|
1058 |
+
"mt_flores_101_eng_spa": {
|
1059 |
+
"num_of_instances": 66,
|
1060 |
+
"counts": [
|
1061 |
+
1185,
|
1062 |
+
632,
|
1063 |
+
363,
|
1064 |
+
210
|
1065 |
+
],
|
1066 |
+
"totals": [
|
1067 |
+
1964,
|
1068 |
+
1898,
|
1069 |
+
1832,
|
1070 |
+
1766
|
1071 |
+
],
|
1072 |
+
"precisions": [
|
1073 |
+
0.6033604887983707,
|
1074 |
+
0.332982086406744,
|
1075 |
+
0.19814410480349345,
|
1076 |
+
0.11891279728199321
|
1077 |
+
],
|
1078 |
+
"bp": 0.9340473875491699,
|
1079 |
+
"sys_len": 1964,
|
1080 |
+
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.24500265020019107,
|
1082 |
+
"score": 0.24500265020019107,
|
1083 |
+
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.21966892731983978,
|
1085 |
+
"score_ci_high": 0.27134777294522555,
|
1086 |
+        "sacrebleu_ci_low": 0.21966892731983978,
+        "sacrebleu_ci_high": 0.27134777294522555
+      },
+      "mt_flores_101_fra_eng": {
+        "num_of_instances": 66,
+        "counts": [
+          1271,
+          807,
+          551,
+          380
+        ],
+        "totals": [
+          1794,
+          1728,
+          1662,
+          1596
+        ],
+        "precisions": [
+          0.7084726867335562,
+          0.46701388888888884,
+          0.3315282791817088,
+          0.2380952380952381
+        ],
+        "bp": 1.0,
+        "sys_len": 1794,
+        "ref_len": 1734,
+        "sacrebleu": 0.40200462477302346,
+        "score": 0.40200462477302346,
+        "score_name": "sacrebleu",
+        "score_ci_low": 0.36426910997213796,
+        "score_ci_high": 0.44904824239366326,
+        "sacrebleu_ci_low": 0.36426910997213796,
+        "sacrebleu_ci_high": 0.44904824239366326
+      },
+      "mt_flores_101_jpn_eng": {
+        "num_of_instances": 66,
+        "counts": [
+          989,
+          453,
+          243,
+          137
+        ],
+        "totals": [
+          1812,
+          1746,
+          1680,
+          1614
+        ],
+        "precisions": [
+          0.5458057395143487,
+          0.25945017182130586,
+          0.14464285714285713,
+          0.0848822800495663
+        ],
+        "bp": 1.0,
+        "sys_len": 1812,
+        "ref_len": 1734,
+        "sacrebleu": 0.20419801799597426,
+        "score": 0.20419801799597426,
+        "score_name": "sacrebleu",
+        "score_ci_low": 0.18116496265849946,
+        "score_ci_high": 0.25247298942958346,
+        "sacrebleu_ci_low": 0.18116496265849946,
+        "sacrebleu_ci_high": 0.25247298942958346
+      },
+      "mt_flores_101_kor_eng": {
+        "num_of_instances": 66,
+        "counts": [
+          956,
+          417,
+          215,
+          111
+        ],
+        "totals": [
+          1742,
+          1676,
+          1610,
+          1544
+        ],
+        "precisions": [
+          0.5487944890929966,
+          0.24880668257756564,
+          0.13354037267080746,
+          0.07189119170984455
+        ],
+        "bp": 1.0,
+        "sys_len": 1742,
+        "ref_len": 1734,
+        "sacrebleu": 0.19027862841650364,
+        "score": 0.19027862841650364,
+        "score_name": "sacrebleu",
+        "score_ci_low": 0.15954407468337273,
+        "score_ci_high": 0.2316279594911264,
+        "sacrebleu_ci_low": 0.15954407468337273,
+        "sacrebleu_ci_high": 0.2316279594911264
+      },
+      "mt_flores_101_por_eng": {
+        "num_of_instances": 66,
+        "counts": [
+          1274,
+          831,
+          594,
+          443
+        ],
+        "totals": [
+          1787,
+          1721,
+          1655,
+          1589
+        ],
+        "precisions": [
+          0.7129266927811976,
+          0.48285880302149914,
+          0.3589123867069486,
+          0.27879169288860917
+        ],
+        "bp": 1.0,
+        "sys_len": 1787,
+        "ref_len": 1734,
+        "sacrebleu": 0.43080757024308985,
+        "score": 0.43080757024308985,
+        "score_name": "sacrebleu",
+        "score_ci_low": 0.3788088705992759,
+        "score_ci_high": 0.48020746054863056,
+        "sacrebleu_ci_low": 0.3788088705992759,
+        "sacrebleu_ci_high": 0.48020746054863056
+      },
+      "mt_flores_101_ron_eng": {
+        "num_of_instances": 66,
+        "counts": [
+          1296,
+          820,
+          558,
+          381
+        ],
+        "totals": [
+          1844,
+          1778,
+          1712,
+          1646
+        ],
+        "precisions": [
+          0.7028199566160521,
+          0.4611923509561305,
+          0.3259345794392523,
+          0.23147023086269744
+        ],
+        "bp": 1.0,
+        "sys_len": 1844,
+        "ref_len": 1734,
+        "sacrebleu": 0.3954466865992584,
+        "score": 0.3954466865992584,
+        "score_name": "sacrebleu",
+        "score_ci_low": 0.3534147781069182,
+        "score_ci_high": 0.4360974663884546,
+        "sacrebleu_ci_low": 0.3534147781069182,
+        "sacrebleu_ci_high": 0.4360974663884546
+      },
+      "mt_flores_101_spa_eng": {
+        "num_of_instances": 66,
+        "counts": [
+          1125,
+          590,
+          351,
+          212
+        ],
+        "totals": [
+          1856,
+          1790,
+          1724,
+          1658
+        ],
+        "precisions": [
+          0.6061422413793103,
+          0.3296089385474861,
+          0.20359628770301627,
+          0.1278648974668275
+        ],
+        "bp": 1.0,
+        "sys_len": 1856,
+        "ref_len": 1734,
+        "sacrebleu": 0.26854908700533703,
+        "score": 0.26854908700533703,
+        "score_name": "sacrebleu",
+        "score_ci_low": 0.2352580605615156,
+        "score_ci_high": 0.30717127715915404,
+        "sacrebleu_ci_low": 0.2352580605615156,
+        "sacrebleu_ci_high": 0.30717127715915404
+      },
+      "score": 0.29376069141538674,
+      "score_name": "subsets_mean",
+      "num_of_instances": 990
+    },
+    "score": 0.43846688626660735,
+    "score_name": "subsets_mean",
+    "num_of_instances": 12472
+  }
+}
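Each added results file shares the same layout seen above: an `environment_info` block followed by a nested `results` tree whose benchmark subsets end in a `score`, a `score_name` of `subsets_mean`, and a `num_of_instances` count. A minimal sketch of how one might read those per-subset means back out of a file like the ones in this commit (the helper name and example path are illustrative, not part of the PR):

```python
import json

def subset_scores(path: str) -> dict:
    """Return {subset_name: (score, num_of_instances)} for one results file.

    Assumes the layout of the files added here: results -> <subset> ->
    {"score": ..., "score_name": "subsets_mean", "num_of_instances": ...}.
    Scalar keys that sit directly under "results" (the overall mean) are skipped.
    """
    with open(path) as f:
        data = json.load(f)
    scores = {}
    for subset, node in data["results"].items():
        if isinstance(node, dict) and "score" in node:
            scores[subset] = (node["score"], node.get("num_of_instances"))
    return scores

# Example usage (path shown for illustration):
# print(subset_scores("results/bluebench/2025-06-19T20-10-50_evaluation_results.json"))
```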
results/bluebench/2025-06-19T20-10-50_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
|
1 |
+
{
|
2 |
+
"environment_info": {
|
3 |
+
"timestamp_utc": "2025-06-20T00:10:45.998753Z",
|
4 |
+
"command_line_invocation": [
|
5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
6 |
+
"--tasks",
|
7 |
+
"benchmarks.bluebench",
|
8 |
+
"--model",
|
9 |
+
"cross_provider",
|
10 |
+
"--model_args",
|
11 |
+
"model_name=watsonx/meta-llama/llama-4-maverick-17b-128e-instruct-fp8,max_tokens=256",
|
12 |
+
"--output_path",
|
13 |
+
"./results/bluebench",
|
14 |
+
"--log_samples",
|
15 |
+
"--trust_remote_code",
|
16 |
+
"--batch_size",
|
17 |
+
"8",
|
18 |
+
"--verbosity",
|
19 |
+
"ERROR"
|
20 |
+
],
|
21 |
+
"parsed_arguments": {
|
22 |
+
"tasks": [
|
23 |
+
"benchmarks.bluebench"
|
24 |
+
],
|
25 |
+
"split": "test",
|
26 |
+
"num_fewshots": null,
|
27 |
+
"limit": null,
|
28 |
+
"batch_size": 8,
|
29 |
+
"model": "watsonx/meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
|
30 |
+
"model_args": {
|
31 |
+
"max_tokens": 256
|
32 |
+
},
|
33 |
+
"gen_kwargs": null,
|
34 |
+
"chat_template_kwargs": null,
|
35 |
+
"output_path": "./results/bluebench",
|
36 |
+
"output_file_prefix": "evaluation_results",
|
37 |
+
"log_samples": true,
|
38 |
+
"verbosity": "ERROR",
|
39 |
+
"apply_chat_template": false,
|
40 |
+
"trust_remote_code": true,
|
41 |
+
"disable_hf_cache": false,
|
42 |
+
"cache_dir": null
|
43 |
+
},
|
44 |
+
"unitxt_version": "1.24.0",
|
45 |
+
"unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
|
46 |
+
"python_version": "3.10.18",
|
47 |
+
"system": "Linux",
|
48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
49 |
+
"installed_packages": {
|
50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
51 |
+
"triton": "3.3.1",
|
52 |
+
"nltk": "3.9.1",
|
53 |
+
"anyio": "4.9.0",
|
54 |
+
"absl-py": "2.3.0",
|
55 |
+
"tiktoken": "0.9.0",
|
56 |
+
"charset-normalizer": "3.4.2",
|
57 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
58 |
+
"sympy": "1.14.0",
|
59 |
+
"mecab-ko": "1.0.1",
|
60 |
+
"litellm": "1.72.6.post1",
|
61 |
+
"httpcore": "1.0.9",
|
62 |
+
"Jinja2": "3.1.6",
|
63 |
+
"jsonschema-specifications": "2025.4.1",
|
64 |
+
"pydantic_core": "2.33.2",
|
65 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
66 |
+
"yarl": "1.20.1",
|
67 |
+
"openai": "1.88.0",
|
68 |
+
"portalocker": "3.2.0",
|
69 |
+
"pandas": "2.3.0",
|
70 |
+
"multiprocess": "0.70.16",
|
71 |
+
"jsonschema": "4.24.0",
|
72 |
+
"unitxt": "1.24.0",
|
73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
75 |
+
"pydantic": "2.11.7",
|
76 |
+
"async-timeout": "5.0.1",
|
77 |
+
"annotated-types": "0.7.0",
|
78 |
+
"rouge_score": "0.1.2",
|
79 |
+
"contourpy": "1.3.2",
|
80 |
+
"aiosignal": "1.3.2",
|
81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
82 |
+
"pillow": "11.2.1",
|
83 |
+
"six": "1.17.0",
|
84 |
+
"diskcache": "5.6.3",
|
85 |
+
"tqdm": "4.67.1",
|
86 |
+
"pyarrow": "20.0.0",
|
87 |
+
"h11": "0.16.0",
|
88 |
+
"zipp": "3.19.2",
|
89 |
+
"tzdata": "2025.2",
|
90 |
+
"bert-score": "0.3.13",
|
91 |
+
"setuptools": "80.9.0",
|
92 |
+
"referencing": "0.36.2",
|
93 |
+
"sacrebleu": "2.5.1",
|
94 |
+
"filelock": "3.18.0",
|
95 |
+
"urllib3": "2.5.0",
|
96 |
+
"scipy": "1.15.3",
|
97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
98 |
+
"kiwisolver": "1.4.8",
|
99 |
+
"networkx": "3.4.2",
|
100 |
+
"typing-inspection": "0.4.1",
|
101 |
+
"lxml": "5.4.0",
|
102 |
+
"sniffio": "1.3.1",
|
103 |
+
"scikit-learn": "1.7.0",
|
104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
105 |
+
"pip": "25.1.1",
|
106 |
+
"fonttools": "4.58.4",
|
107 |
+
"transformers": "4.52.4",
|
108 |
+
"datasets": "3.6.0",
|
109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
110 |
+
"cycler": "0.12.1",
|
111 |
+
"evaluate": "0.4.3",
|
112 |
+
"distro": "1.9.0",
|
113 |
+
"idna": "3.10",
|
114 |
+
"MarkupSafe": "3.0.2",
|
115 |
+
"frozenlist": "1.7.0",
|
116 |
+
"pyparsing": "3.2.3",
|
117 |
+
"jiter": "0.10.0",
|
118 |
+
"importlib_metadata": "8.0.0",
|
119 |
+
"packaging": "24.2",
|
120 |
+
"psutil": "7.0.0",
|
121 |
+
"mecab-ko-dic": "1.0.0",
|
122 |
+
"joblib": "1.5.1",
|
123 |
+
"fsspec": "2025.3.0",
|
124 |
+
"dill": "0.3.8",
|
125 |
+
"tokenizers": "0.21.1",
|
126 |
+
"wheel": "0.45.1",
|
127 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
128 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
129 |
+
"hf-xet": "1.1.4",
|
130 |
+
"propcache": "0.3.2",
|
131 |
+
"numpy": "2.2.6",
|
132 |
+
"mpmath": "1.3.0",
|
133 |
+
"multidict": "6.5.0",
|
134 |
+
"conllu": "6.0.0",
|
135 |
+
"safetensors": "0.5.3",
|
136 |
+
"requests": "2.32.4",
|
137 |
+
"regex": "2024.11.6",
|
138 |
+
"aiohttp": "3.12.13",
|
139 |
+
"tabulate": "0.9.0",
|
140 |
+
"certifi": "2025.6.15",
|
141 |
+
"accelerate": "1.8.0",
|
142 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
143 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
144 |
+
"click": "8.2.1",
|
145 |
+
"typing_extensions": "4.12.2",
|
146 |
+
"attrs": "25.3.0",
|
147 |
+
"exceptiongroup": "1.3.0",
|
148 |
+
"tenacity": "9.1.2",
|
149 |
+
"pytz": "2025.2",
|
150 |
+
"aiohappyeyeballs": "2.6.1",
|
151 |
+
"python-dateutil": "2.9.0.post0",
|
152 |
+
"torch": "2.7.1",
|
153 |
+
"python-dotenv": "1.1.0",
|
154 |
+
"httpx": "0.28.1",
|
155 |
+
"matplotlib": "3.10.3",
|
156 |
+
"xxhash": "3.5.0",
|
157 |
+
"PyYAML": "6.0.2",
|
158 |
+
"huggingface-hub": "0.33.0",
|
159 |
+
"colorama": "0.4.6",
|
160 |
+
"rpds-py": "0.25.1",
|
161 |
+
"threadpoolctl": "3.6.0",
|
162 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
163 |
+
"jaraco.collections": "5.1.0",
|
164 |
+
"tomli": "2.0.1",
|
165 |
+
"backports.tarfile": "1.2.0",
|
166 |
+
"jaraco.context": "5.3.0",
|
167 |
+
"typeguard": "4.3.0",
|
168 |
+
"autocommand": "2.2.2",
|
169 |
+
"jaraco.text": "3.12.1",
|
170 |
+
"more-itertools": "10.3.0",
|
171 |
+
"platformdirs": "4.2.2",
|
172 |
+
"inflect": "7.3.1",
|
173 |
+
"jaraco.functools": "4.0.1"
|
174 |
+
}
|
175 |
+
},
|
176 |
+
"results": {
|
177 |
+
"bias": {
|
178 |
+
"safety_bbq_age": {
|
179 |
+
"accuracy": 0.8888888888888888,
|
180 |
+
"accuracy_ci_low": 0.8111111111111111,
|
181 |
+
"accuracy_ci_high": 0.9444444444444444,
|
182 |
+
"score_name": "accuracy",
|
183 |
+
"score": 0.8888888888888888,
|
184 |
+
"score_ci_high": 0.9444444444444444,
|
185 |
+
"score_ci_low": 0.8111111111111111,
|
186 |
+
"num_of_instances": 90
|
187 |
+
},
|
188 |
+
"safety_bbq_disability_status": {
|
189 |
+
"accuracy": 0.9777777777777777,
|
190 |
+
"accuracy_ci_low": 0.9222222222222223,
|
191 |
+
"accuracy_ci_high": 1.0,
|
192 |
+
"score_name": "accuracy",
|
193 |
+
"score": 0.9777777777777777,
|
194 |
+
"score_ci_high": 1.0,
|
195 |
+
"score_ci_low": 0.9222222222222223,
|
196 |
+
"num_of_instances": 90
|
197 |
+
},
|
198 |
+
"safety_bbq_gender_identity": {
|
199 |
+
"accuracy": 1.0,
|
200 |
+
"accuracy_ci_low": 1.0,
|
201 |
+
"accuracy_ci_high": 1.0,
|
202 |
+
"score_name": "accuracy",
|
203 |
+
"score": 1.0,
|
204 |
+
"score_ci_high": 1.0,
|
205 |
+
"score_ci_low": 1.0,
|
206 |
+
"num_of_instances": 90
|
207 |
+
},
|
208 |
+
"safety_bbq_nationality": {
|
209 |
+
"accuracy": 1.0,
|
210 |
+
"accuracy_ci_low": 1.0,
|
211 |
+
"accuracy_ci_high": 1.0,
|
212 |
+
"score_name": "accuracy",
|
213 |
+
"score": 1.0,
|
214 |
+
"score_ci_high": 1.0,
|
215 |
+
"score_ci_low": 1.0,
|
216 |
+
"num_of_instances": 90
|
217 |
+
},
|
218 |
+
"safety_bbq_physical_appearance": {
|
219 |
+
"accuracy": 0.9888888888888889,
|
220 |
+
"accuracy_ci_low": 0.9389750917617445,
|
221 |
+
"accuracy_ci_high": 1.0,
|
222 |
+
"score_name": "accuracy",
|
223 |
+
"score": 0.9888888888888889,
|
224 |
+
"score_ci_high": 1.0,
|
225 |
+
"score_ci_low": 0.9389750917617445,
|
226 |
+
"num_of_instances": 90
|
227 |
+
},
|
228 |
+
"safety_bbq_race_ethnicity": {
|
229 |
+
"accuracy": 0.9888888888888889,
|
230 |
+
"accuracy_ci_low": 0.9333333333333333,
|
231 |
+
"accuracy_ci_high": 1.0,
|
232 |
+
"score_name": "accuracy",
|
233 |
+
"score": 0.9888888888888889,
|
234 |
+
"score_ci_high": 1.0,
|
235 |
+
"score_ci_low": 0.9333333333333333,
|
236 |
+
"num_of_instances": 90
|
237 |
+
},
|
238 |
+
"safety_bbq_race_x_gender": {
|
239 |
+
"accuracy": 1.0,
|
240 |
+
"accuracy_ci_low": 1.0,
|
241 |
+
"accuracy_ci_high": 1.0,
|
242 |
+
"score_name": "accuracy",
|
243 |
+
"score": 1.0,
|
244 |
+
"score_ci_high": 1.0,
|
245 |
+
"score_ci_low": 1.0,
|
246 |
+
"num_of_instances": 90
|
247 |
+
},
|
248 |
+
"safety_bbq_race_x_ses": {
|
249 |
+
"accuracy": 1.0,
|
250 |
+
"accuracy_ci_low": 1.0,
|
251 |
+
"accuracy_ci_high": 1.0,
|
252 |
+
"score_name": "accuracy",
|
253 |
+
"score": 1.0,
|
254 |
+
"score_ci_high": 1.0,
|
255 |
+
"score_ci_low": 1.0,
|
256 |
+
"num_of_instances": 90
|
257 |
+
},
|
258 |
+
"safety_bbq_religion": {
|
259 |
+
"accuracy": 0.8888888888888888,
|
260 |
+
"accuracy_ci_low": 0.8111111111111111,
|
261 |
+
"accuracy_ci_high": 0.9444444444444444,
|
262 |
+
"score_name": "accuracy",
|
263 |
+
"score": 0.8888888888888888,
|
264 |
+
"score_ci_high": 0.9444444444444444,
|
265 |
+
"score_ci_low": 0.8111111111111111,
|
266 |
+
"num_of_instances": 90
|
267 |
+
},
|
268 |
+
"safety_bbq_ses": {
|
269 |
+
"accuracy": 0.9888888888888889,
|
270 |
+
"accuracy_ci_low": 0.9444444444444444,
|
271 |
+
"accuracy_ci_high": 1.0,
|
272 |
+
"score_name": "accuracy",
|
273 |
+
"score": 0.9888888888888889,
|
274 |
+
"score_ci_high": 1.0,
|
275 |
+
"score_ci_low": 0.9444444444444444,
|
276 |
+
"num_of_instances": 90
|
277 |
+
},
|
278 |
+
"safety_bbq_sexual_orientation": {
|
279 |
+
"accuracy": 0.8666666666666667,
|
280 |
+
"accuracy_ci_low": 0.7888888888888889,
|
281 |
+
"accuracy_ci_high": 0.9333333333333333,
|
282 |
+
"score_name": "accuracy",
|
283 |
+
"score": 0.8666666666666667,
|
284 |
+
"score_ci_high": 0.9333333333333333,
|
285 |
+
"score_ci_low": 0.7888888888888889,
|
286 |
+
"num_of_instances": 90
|
287 |
+
},
|
288 |
+
"score": 0.9626262626262626,
|
289 |
+
"score_name": "subsets_mean",
|
290 |
+
"num_of_instances": 990
|
291 |
+
},
|
292 |
+
"chatbot_abilities": {
|
293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
294 |
+
"num_of_instances": 500,
|
295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.5,
|
296 |
+
"score": 0.5,
|
297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
298 |
+
},
|
299 |
+
"score": 0.5,
|
300 |
+
"score_name": "subsets_mean",
|
301 |
+
"num_of_instances": 500
|
302 |
+
},
|
303 |
+
"entity_extraction": {
|
304 |
+
"universal_ner_en_ewt": {
|
305 |
+
"num_of_instances": 1000,
|
306 |
+
"f1_Person": 0.5662337662337662,
|
307 |
+
"f1_Organization": 0.34810126582278483,
|
308 |
+
"f1_Location": 0.4031620553359684,
|
309 |
+
"f1_macro": 0.4391656957975065,
|
310 |
+
"recall_macro": 0.39723251248500296,
|
311 |
+
"precision_macro": 0.49375910707000065,
|
312 |
+
"in_classes_support": 0.5212636695018227,
|
313 |
+
"f1_micro": 0.31899109792284863,
|
314 |
+
"recall_micro": 0.4095238095238095,
|
315 |
+
"precision_micro": 0.26123936816524906,
|
316 |
+
"score": 0.31899109792284863,
|
317 |
+
"score_name": "f1_micro",
|
318 |
+
"score_ci_low": 0.277439612598807,
|
319 |
+
"score_ci_high": 0.3678125973620034,
|
320 |
+
"f1_micro_ci_low": 0.277439612598807,
|
321 |
+
"f1_micro_ci_high": 0.3678125973620034
|
322 |
+
},
|
323 |
+
"score": 0.31899109792284863,
|
324 |
+
"score_name": "subsets_mean",
|
325 |
+
"num_of_instances": 1000
|
326 |
+
},
|
327 |
+
"knowledge": {
|
328 |
+
"mmlu_pro_biology": {
|
329 |
+
"accuracy": 0.5915492957746479,
|
330 |
+
"accuracy_ci_low": 0.4788732394366197,
|
331 |
+
"accuracy_ci_high": 0.704225352112676,
|
332 |
+
"score_name": "accuracy",
|
333 |
+
"score": 0.5915492957746479,
|
334 |
+
"score_ci_high": 0.704225352112676,
|
335 |
+
"score_ci_low": 0.4788732394366197,
|
336 |
+
"num_of_instances": 71
|
337 |
+
},
|
338 |
+
"mmlu_pro_business": {
|
339 |
+
"accuracy": 0.2676056338028169,
|
340 |
+
"accuracy_ci_low": 0.16901408450704225,
|
341 |
+
"accuracy_ci_high": 0.38028169014084506,
|
342 |
+
"score_name": "accuracy",
|
343 |
+
"score": 0.2676056338028169,
|
344 |
+
"score_ci_high": 0.38028169014084506,
|
345 |
+
"score_ci_low": 0.16901408450704225,
|
346 |
+
"num_of_instances": 71
|
347 |
+
},
|
348 |
+
"mmlu_pro_chemistry": {
|
349 |
+
"accuracy": 0.23943661971830985,
|
350 |
+
"accuracy_ci_low": 0.15492957746478872,
|
351 |
+
"accuracy_ci_high": 0.352112676056338,
|
352 |
+
"score_name": "accuracy",
|
353 |
+
"score": 0.23943661971830985,
|
354 |
+
"score_ci_high": 0.352112676056338,
|
355 |
+
"score_ci_low": 0.15492957746478872,
|
356 |
+
"num_of_instances": 71
|
357 |
+
},
|
358 |
+
"mmlu_pro_computer_science": {
|
359 |
+
"accuracy": 0.5070422535211268,
|
360 |
+
"accuracy_ci_low": 0.39436619718309857,
|
361 |
+
"accuracy_ci_high": 0.6197183098591549,
|
362 |
+
"score_name": "accuracy",
|
363 |
+
"score": 0.5070422535211268,
|
364 |
+
"score_ci_high": 0.6197183098591549,
|
365 |
+
"score_ci_low": 0.39436619718309857,
|
366 |
+
"num_of_instances": 71
|
367 |
+
},
|
368 |
+
"mmlu_pro_economics": {
|
369 |
+
"accuracy": 0.6901408450704225,
|
370 |
+
"accuracy_ci_low": 0.5633802816901409,
|
371 |
+
"accuracy_ci_high": 0.7887323943661971,
|
372 |
+
"score_name": "accuracy",
|
373 |
+
"score": 0.6901408450704225,
|
374 |
+
"score_ci_high": 0.7887323943661971,
|
375 |
+
"score_ci_low": 0.5633802816901409,
|
376 |
+
"num_of_instances": 71
|
377 |
+
},
|
378 |
+
"mmlu_pro_engineering": {
|
379 |
+
"accuracy": 0.39436619718309857,
|
380 |
+
"accuracy_ci_low": 0.29577464788732394,
|
381 |
+
"accuracy_ci_high": 0.5070422535211268,
|
382 |
+
"score_name": "accuracy",
|
383 |
+
"score": 0.39436619718309857,
|
384 |
+
"score_ci_high": 0.5070422535211268,
|
385 |
+
"score_ci_low": 0.29577464788732394,
|
386 |
+
"num_of_instances": 71
|
387 |
+
},
|
388 |
+
"mmlu_pro_health": {
|
389 |
+
"accuracy": 0.5211267605633803,
|
390 |
+
"accuracy_ci_low": 0.39436619718309857,
|
391 |
+
"accuracy_ci_high": 0.6197183098591549,
|
392 |
+
"score_name": "accuracy",
|
393 |
+
"score": 0.5211267605633803,
|
394 |
+
"score_ci_high": 0.6197183098591549,
|
395 |
+
"score_ci_low": 0.39436619718309857,
|
396 |
+
"num_of_instances": 71
|
397 |
+
},
|
398 |
+
"mmlu_pro_history": {
|
399 |
+
"accuracy": 0.7746478873239436,
|
400 |
+
"accuracy_ci_low": 0.6619718309859155,
|
401 |
+
"accuracy_ci_high": 0.8591549295774648,
|
402 |
+
"score_name": "accuracy",
|
403 |
+
"score": 0.7746478873239436,
|
404 |
+
"score_ci_high": 0.8591549295774648,
|
405 |
+
"score_ci_low": 0.6619718309859155,
|
406 |
+
"num_of_instances": 71
|
407 |
+
},
|
408 |
+
"mmlu_pro_law": {
|
409 |
+
"accuracy": 0.5774647887323944,
|
410 |
+
"accuracy_ci_low": 0.4631453652997223,
|
411 |
+
"accuracy_ci_high": 0.6901408450704225,
|
412 |
+
"score_name": "accuracy",
|
413 |
+
"score": 0.5774647887323944,
|
414 |
+
"score_ci_high": 0.6901408450704225,
|
415 |
+
"score_ci_low": 0.4631453652997223,
|
416 |
+
"num_of_instances": 71
|
417 |
+
},
|
418 |
+
"mmlu_pro_math": {
|
419 |
+
"accuracy": 0.2112676056338028,
|
420 |
+
"accuracy_ci_low": 0.1267605633802817,
|
421 |
+
"accuracy_ci_high": 0.323943661971831,
|
422 |
+
"score_name": "accuracy",
|
423 |
+
"score": 0.2112676056338028,
|
424 |
+
"score_ci_high": 0.323943661971831,
|
425 |
+
"score_ci_low": 0.1267605633802817,
|
426 |
+
"num_of_instances": 71
|
427 |
+
},
|
428 |
+
"mmlu_pro_other": {
|
429 |
+
"accuracy": 0.5633802816901409,
|
430 |
+
"accuracy_ci_low": 0.4507042253521127,
|
431 |
+
"accuracy_ci_high": 0.668060546470624,
|
432 |
+
"score_name": "accuracy",
|
433 |
+
"score": 0.5633802816901409,
|
434 |
+
"score_ci_high": 0.668060546470624,
|
435 |
+
"score_ci_low": 0.4507042253521127,
|
436 |
+
"num_of_instances": 71
|
437 |
+
},
|
438 |
+
"mmlu_pro_philosophy": {
|
439 |
+
"accuracy": 0.6901408450704225,
|
440 |
+
"accuracy_ci_low": 0.5774647887323944,
|
441 |
+
"accuracy_ci_high": 0.7887323943661971,
|
442 |
+
"score_name": "accuracy",
|
443 |
+
"score": 0.6901408450704225,
|
444 |
+
"score_ci_high": 0.7887323943661971,
|
445 |
+
"score_ci_low": 0.5774647887323944,
|
446 |
+
"num_of_instances": 71
|
447 |
+
},
|
448 |
+
"mmlu_pro_physics": {
|
449 |
+
"accuracy": 0.4084507042253521,
|
450 |
+
"accuracy_ci_low": 0.29577464788732394,
|
451 |
+
"accuracy_ci_high": 0.5211267605633803,
|
452 |
+
"score_name": "accuracy",
|
453 |
+
"score": 0.4084507042253521,
|
454 |
+
"score_ci_high": 0.5211267605633803,
|
455 |
+
"score_ci_low": 0.29577464788732394,
|
456 |
+
"num_of_instances": 71
|
457 |
+
},
|
458 |
+
"mmlu_pro_psychology": {
|
459 |
+
"accuracy": 0.647887323943662,
|
460 |
+
"accuracy_ci_low": 0.5352112676056338,
|
461 |
+
"accuracy_ci_high": 0.7605633802816901,
|
462 |
+
"score_name": "accuracy",
|
463 |
+
"score": 0.647887323943662,
|
464 |
+
"score_ci_high": 0.7605633802816901,
|
465 |
+
"score_ci_low": 0.5352112676056338,
|
466 |
+
"num_of_instances": 71
|
467 |
+
},
|
468 |
+
"score": 0.506036217303823,
|
469 |
+
"score_name": "subsets_mean",
|
470 |
+
"num_of_instances": 994
|
471 |
+
},
|
472 |
+
"legal": {
|
473 |
+
"legalbench_abercrombie": {
|
474 |
+
"f1_macro": 0.5831370899915895,
|
475 |
+
"f1_suggestive": 0.4827586206896552,
|
476 |
+
"f1_generic": 0.6666666666666666,
|
477 |
+
"f1_descriptive": 0.6666666666666666,
|
478 |
+
"f1_fanciful": 0.4166666666666667,
|
479 |
+
"f1_arbitrary": 0.6829268292682927,
|
480 |
+
"f1_macro_ci_low": 0.47939113487694995,
|
481 |
+
"f1_macro_ci_high": 0.6897999117090845,
|
482 |
+
"score_name": "f1_micro",
|
483 |
+
"score": 0.5987261146496815,
|
484 |
+
"score_ci_high": 0.6962025316455697,
|
485 |
+
"score_ci_low": 0.4807376602538022,
|
486 |
+
"num_of_instances": 85,
|
487 |
+
"accuracy": 0.5529411764705883,
|
488 |
+
"accuracy_ci_low": 0.43529411764705883,
|
489 |
+
"accuracy_ci_high": 0.6588235294117647,
|
490 |
+
"f1_micro": 0.5987261146496815,
|
491 |
+
"f1_micro_ci_low": 0.4807376602538022,
|
492 |
+
"f1_micro_ci_high": 0.6962025316455697
|
493 |
+
},
|
494 |
+
"legalbench_corporate_lobbying": {
|
495 |
+
"f1_macro": 0.6763636363636364,
|
496 |
+
"f1_no": 0.7927272727272727,
|
497 |
+
"f1_yes": 0.56,
|
498 |
+
"f1_macro_ci_low": 0.6036339063806182,
|
499 |
+
"f1_macro_ci_high": 0.7468677315003386,
|
500 |
+
"score_name": "f1_micro",
|
501 |
+
"score": 0.7306666666666667,
|
502 |
+
"score_ci_high": 0.7853403141361257,
|
503 |
+
"score_ci_low": 0.6630296211830374,
|
504 |
+
"num_of_instances": 200,
|
505 |
+
"accuracy": 0.685,
|
506 |
+
"accuracy_ci_low": 0.62,
|
507 |
+
"accuracy_ci_high": 0.745,
|
508 |
+
"f1_micro": 0.7306666666666667,
|
509 |
+
"f1_micro_ci_low": 0.6630296211830374,
|
510 |
+
"f1_micro_ci_high": 0.7853403141361257
|
511 |
+
},
|
512 |
+
"legalbench_function_of_decision_section": {
|
513 |
+
"f1_macro": 0.2642730181773706,
|
514 |
+
"f1_conclusion": 0.0625,
|
515 |
+
"f1_issue": 0.16326530612244897,
|
516 |
+
"f1_decree": 0.2,
|
517 |
+
"f1_analysis": 0.4375,
|
518 |
+
"f1_facts": 0.32558139534883723,
|
519 |
+
"f1_procedural history": 0.19047619047619047,
|
520 |
+
"f1_rule": 0.47058823529411764,
|
521 |
+
"f1_macro_ci_low": 0.20968219014642994,
|
522 |
+
"f1_macro_ci_high": 0.33253527853885895,
|
523 |
+
"score_name": "f1_micro",
|
524 |
+
"score": 0.28938906752411575,
|
525 |
+
"score_ci_high": 0.3618842117391186,
|
526 |
+
"score_ci_low": 0.22364217252396165,
|
527 |
+
"num_of_instances": 200,
|
528 |
+
"accuracy": 0.225,
|
529 |
+
"accuracy_ci_low": 0.175,
|
530 |
+
"accuracy_ci_high": 0.29,
|
531 |
+
"f1_micro": 0.28938906752411575,
|
532 |
+
"f1_micro_ci_low": 0.22364217252396165,
|
533 |
+
"f1_micro_ci_high": 0.3618842117391186
|
534 |
+
},
|
535 |
+
"legalbench_international_citizenship_questions": {
|
536 |
+
"f1_macro": 0.5830978545264259,
|
537 |
+
"f1_yes": 0.5918367346938775,
|
538 |
+
"f1_no": 0.5743589743589743,
|
539 |
+
"f1_macro_ci_low": 0.5111003138485096,
|
540 |
+
"f1_macro_ci_high": 0.6506689237239318,
|
541 |
+
"score_name": "f1_micro",
|
542 |
+
"score": 0.5831202046035806,
|
543 |
+
"score_ci_high": 0.649616368286445,
|
544 |
+
"score_ci_low": 0.5114249450573659,
|
545 |
+
"num_of_instances": 200,
|
546 |
+
"accuracy": 0.57,
|
547 |
+
"accuracy_ci_low": 0.4968446470094224,
|
548 |
+
"accuracy_ci_high": 0.635,
|
549 |
+
"f1_micro": 0.5831202046035806,
|
550 |
+
"f1_micro_ci_low": 0.5114249450573659,
|
551 |
+
"f1_micro_ci_high": 0.649616368286445
|
552 |
+
},
|
553 |
+
"legalbench_proa": {
|
554 |
+
"f1_macro": 0.7776061776061776,
|
555 |
+
"f1_yes": 0.7714285714285715,
|
556 |
+
"f1_no": 0.7837837837837838,
|
557 |
+
"f1_macro_ci_low": 0.6936100514418908,
|
558 |
+
"f1_macro_ci_high": 0.8455722600304791,
|
559 |
+
"score_name": "f1_micro",
|
560 |
+
"score": 0.7777777777777778,
|
561 |
+
"score_ci_high": 0.8435374149659864,
|
562 |
+
"score_ci_low": 0.6950354609929078,
|
563 |
+
"num_of_instances": 85,
|
564 |
+
"accuracy": 0.6588235294117647,
|
565 |
+
"accuracy_ci_low": 0.5647058823529412,
|
566 |
+
"accuracy_ci_high": 0.7411764705882353,
|
567 |
+
"f1_micro": 0.7777777777777778,
|
568 |
+
"f1_micro_ci_low": 0.6950354609929078,
|
569 |
+
"f1_micro_ci_high": 0.8435374149659864
|
570 |
+
},
|
571 |
+
"score": 0.5959359662443645,
|
572 |
+
"score_name": "subsets_mean",
|
573 |
+
"num_of_instances": 770
|
574 |
+
},
|
575 |
+
"news_classification": {
|
576 |
+
"20_newsgroups_short": {
|
577 |
+
"f1_macro": 0.6167905620006249,
|
578 |
+
"f1_cars": 0.8089887640449438,
|
579 |
+
"f1_windows x": 0.06153846153846154,
|
580 |
+
"f1_computer graphics": 0.5510204081632653,
|
581 |
+
"f1_atheism": 0.1951219512195122,
|
582 |
+
"f1_christianity": 0.8288288288288288,
|
583 |
+
"f1_religion": 0.1568627450980392,
|
584 |
+
"f1_medicine": 0.8505747126436781,
|
585 |
+
"f1_microsoft windows": 0.75,
|
586 |
+
"f1_middle east": 0.6666666666666666,
|
587 |
+
"f1_motorcycles": 0.7619047619047619,
|
588 |
+
"f1_politics": 0.359375,
|
589 |
+
"f1_pc hardware": 0.6619718309859155,
|
590 |
+
"f1_mac hardware": 0.7358490566037735,
|
591 |
+
"f1_for sale": 0.5806451612903226,
|
592 |
+
"f1_guns": 0.3561643835616438,
|
593 |
+
"f1_space": 0.82,
|
594 |
+
"f1_cryptography": 0.6666666666666666,
|
595 |
+
"f1_baseball": 0.9166666666666666,
|
596 |
+
"f1_hockey": 0.9402985074626866,
|
597 |
+
"f1_electronics": 0.6666666666666666,
|
598 |
+
"f1_macro_ci_low": 0.5929180247135345,
|
599 |
+
"f1_macro_ci_high": 0.6464945617502024,
|
600 |
+
"score_name": "f1_micro",
|
601 |
+
"score": 0.661588683351469,
|
602 |
+
"score_ci_high": 0.6918918918918919,
|
603 |
+
"score_ci_low": 0.6351762173413632,
|
604 |
+
"num_of_instances": 1000,
|
605 |
+
"accuracy": 0.608,
|
606 |
+
"accuracy_ci_low": 0.582,
|
607 |
+
"accuracy_ci_high": 0.6398246959343236,
|
608 |
+
"f1_micro": 0.661588683351469,
|
609 |
+
"f1_micro_ci_low": 0.6351762173413632,
|
610 |
+
"f1_micro_ci_high": 0.6918918918918919
|
611 |
+
},
|
612 |
+
"score": 0.661588683351469,
|
613 |
+
"score_name": "subsets_mean",
|
614 |
+
"num_of_instances": 1000
|
615 |
+
},
|
616 |
+
"product_help": {
|
617 |
+
"cfpb_product_2023": {
|
618 |
+
"f1_macro": 0.7156339434074247,
|
619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.924907063197026,
|
620 |
+
"f1_credit card or prepaid card": 0.8,
|
621 |
+
"f1_debt collection": 0.659217877094972,
|
622 |
+
"f1_checking or savings account": 0.8070175438596491,
|
623 |
+
"f1_money transfer or virtual currency or money service": 0.6896551724137931,
|
624 |
+
"f1_student loan": 0.7741935483870968,
|
625 |
+
"f1_vehicle loan or lease": 0.625,
|
626 |
+
"f1_mortgage": 0.875,
|
627 |
+
"f1_payday loan or title loan or personal loan": 0.2857142857142857,
|
628 |
+
"f1_macro_ci_low": 0.6660333456490072,
|
629 |
+
"f1_macro_ci_high": 0.7763587756574478,
|
630 |
+
"score_name": "f1_micro",
|
631 |
+
"score": 0.863659401926001,
|
632 |
+
"score_ci_high": 0.8836978702477332,
|
633 |
+
"score_ci_low": 0.8417078870760507,
|
634 |
+
"num_of_instances": 1000,
|
635 |
+
"accuracy": 0.852,
|
636 |
+
"accuracy_ci_low": 0.8286876874270778,
|
637 |
+
"accuracy_ci_high": 0.871,
|
638 |
+
"f1_micro": 0.863659401926001,
|
639 |
+
"f1_micro_ci_low": 0.8417078870760507,
|
640 |
+
"f1_micro_ci_high": 0.8836978702477332
|
641 |
+
},
|
642 |
+
"cfpb_product_watsonx": {
|
643 |
+
"f1_macro": 0.7760995406721664,
|
644 |
+
"f1_mortgages and loans": 0.8539325842696629,
|
645 |
+
"f1_credit card": 0.8444444444444444,
|
646 |
+
"f1_debt collection": 0.7117117117117117,
|
647 |
+
"f1_credit reporting": 0.752851711026616,
|
648 |
+
"f1_retail banking": 0.7175572519083969,
|
649 |
+
"f1_macro_ci_low": 0.7391680922929513,
|
650 |
+
"f1_macro_ci_high": 0.8135480304798374,
|
651 |
+
"score_name": "f1_micro",
|
652 |
+
"score": 0.7741273100616016,
|
653 |
+
"score_ci_high": 0.809811768563787,
|
654 |
+
"score_ci_low": 0.7373612854039264,
|
655 |
+
"num_of_instances": 500,
|
656 |
+
"accuracy": 0.754,
|
657 |
+
"accuracy_ci_low": 0.716,
|
658 |
+
"accuracy_ci_high": 0.794,
|
659 |
+
"f1_micro": 0.7741273100616016,
|
660 |
+
"f1_micro_ci_low": 0.7373612854039264,
|
661 |
+
"f1_micro_ci_high": 0.809811768563787
|
662 |
+
},
|
663 |
+
"score": 0.8188933559938013,
|
664 |
+
"score_name": "subsets_mean",
|
665 |
+
"num_of_instances": 1500
|
666 |
+
},
|
667 |
+
"qa_finance": {
|
668 |
+
"fin_qa": {
|
669 |
+
"num_of_instances": 1000,
|
670 |
+
"execution_accuracy": 0.162,
|
671 |
+
"program_accuracy": 0.182,
|
672 |
+
"score": 0.182,
|
673 |
+
"score_name": "program_accuracy",
|
674 |
+
"execution_accuracy_ci_low": 0.13950521872118443,
|
675 |
+
"execution_accuracy_ci_high": 0.186,
|
676 |
+
"program_accuracy_ci_low": 0.159,
|
677 |
+
"program_accuracy_ci_high": 0.207,
|
678 |
+
"score_ci_low": 0.159,
|
679 |
+
"score_ci_high": 0.207
|
680 |
+
},
|
681 |
+
"score": 0.182,
|
682 |
+
"score_name": "subsets_mean",
|
683 |
+
"num_of_instances": 1000
|
684 |
+
},
|
685 |
+
"rag_general": {
|
686 |
+
"rag_response_generation_clapnq": {
|
687 |
+
"precision": 0.3472291622071149,
|
688 |
+
"recall": 0.5550279207444034,
|
689 |
+
"f1": 0.36009972180369076,
|
690 |
+
"precision_ci_low": 0.3242384129530926,
|
691 |
+
"precision_ci_high": 0.37066398547692364,
|
692 |
+
"recall_ci_low": 0.5394838751494561,
|
693 |
+
"recall_ci_high": 0.5723924699685504,
|
694 |
+
"f1_ci_low": 0.3409334568421404,
|
695 |
+
"f1_ci_high": 0.379665380653375,
|
696 |
+
"score_name": "f1",
|
697 |
+
"score": 0.36009972180369076,
|
698 |
+
"score_ci_high": 0.379665380653375,
|
699 |
+
"score_ci_low": 0.3409334568421404,
|
700 |
+
"num_of_instances": 600,
|
701 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6113383284211159,
|
702 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.6853377008934816,
|
703 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.5746090712646643,
|
704 |
+
"faithfullness_f1_token_overlap": 0.2924701621871153,
|
705 |
+
"faithfullness_recall_token_overlap": 0.21371625625934337,
|
706 |
+
"faithfullness_precision_token_overlap": 0.5963665736789522,
|
707 |
+
"correctness_f1_token_overlap": 0.36009972180369076,
|
708 |
+
"correctness_recall_token_overlap": 0.5550279207444034,
|
709 |
+
"correctness_precision_token_overlap": 0.3472291622071149
|
710 |
+
},
|
711 |
+
"score": 0.36009972180369076,
|
712 |
+
"score_name": "subsets_mean",
|
713 |
+
"num_of_instances": 600
|
714 |
+
},
|
715 |
+
"reasoning": {
|
716 |
+
"hellaswag": {
|
717 |
+
"accuracy": 0.56,
|
718 |
+
"accuracy_ci_low": 0.531,
|
719 |
+
"accuracy_ci_high": 0.591,
|
720 |
+
"score_name": "accuracy",
|
721 |
+
"score": 0.56,
|
722 |
+
"score_ci_high": 0.591,
|
723 |
+
"score_ci_low": 0.531,
|
724 |
+
"num_of_instances": 1000
|
725 |
+
},
|
726 |
+
"openbook_qa": {
|
727 |
+
"accuracy": 0.792,
|
728 |
+
"accuracy_ci_low": 0.754,
|
729 |
+
"accuracy_ci_high": 0.824,
|
730 |
+
"score_name": "accuracy",
|
731 |
+
"score": 0.792,
|
732 |
+
"score_ci_high": 0.824,
|
733 |
+
"score_ci_low": 0.754,
|
734 |
+
"num_of_instances": 500
|
735 |
+
},
|
736 |
+
"score": 0.676,
|
737 |
+
"score_name": "subsets_mean",
|
738 |
+
"num_of_instances": 1500
|
739 |
+
},
|
740 |
+
"safety": {
|
741 |
+
"attaq_500": {
|
742 |
+
"severity_critical": 0.0,
|
743 |
+
"severity_high": 1.0,
|
744 |
+
"severity_medium": 3.0,
|
745 |
+
"severity_low": 96.0,
|
746 |
+
"category_harmful_info": 0.8329531024875386,
|
747 |
+
"category_deception": 0.8794551490608491,
|
748 |
+
"category_explicit_content": 0.8125141191512649,
|
749 |
+
"category_violence": 0.802003665895216,
|
750 |
+
"category_discrimination": 0.7836076390175591,
|
751 |
+
"category_substance_abuse": 0.7930756719575989,
|
752 |
+
"category_pii": 0.8714520589069084,
|
753 |
+
"safety": 0.8206946513594852,
|
754 |
+
"safety_ci_low": 0.7989320712853799,
|
755 |
+
"safety_ci_high": 0.8426819113033088,
|
756 |
+
"score_name": "safety",
|
757 |
+
"score": 0.8206946513594852,
|
758 |
+
"score_ci_high": 0.8426819113033088,
|
759 |
+
"score_ci_low": 0.7989320712853799,
|
760 |
+
"num_of_instances": 100
|
761 |
+
},
|
762 |
+
"score": 0.8206946513594852,
|
763 |
+
"score_name": "subsets_mean",
|
764 |
+
"num_of_instances": 100
|
765 |
+
},
|
766 |
+
"summarization": {
|
767 |
+
"billsum_document_filtered_to_6000_chars": {
|
768 |
+
"num_of_instances": 528,
|
769 |
+
"rouge1": 0.4312754109443129,
|
770 |
+
"rougeL": 0.3112998958854518,
|
771 |
+
"score": 0.3112998958854518,
|
772 |
+
"score_name": "rougeL",
|
773 |
+
"rouge2": 0.2266838399222002,
|
774 |
+
"rougeLsum": 0.37538852494202224,
|
775 |
+
"rouge1_ci_low": 0.421001593558283,
|
776 |
+
"rouge1_ci_high": 0.4405064760072156,
|
777 |
+
"rougeL_ci_low": 0.3032282310196722,
|
778 |
+
"rougeL_ci_high": 0.3196260335861935,
|
779 |
+
"score_ci_low": 0.3032282310196722,
|
780 |
+
"score_ci_high": 0.3196260335861935,
|
781 |
+
"rouge2_ci_low": 0.21901058073811183,
|
782 |
+
"rouge2_ci_high": 0.23539084125213852,
|
783 |
+
"rougeLsum_ci_low": 0.36595005735640374,
|
784 |
+
"rougeLsum_ci_high": 0.3843319255959285
|
785 |
+
},
|
786 |
+
"tldr_document_filtered_to_6000_chars": {
|
787 |
+
"num_of_instances": 1000,
|
788 |
+
"rouge1": 0.12832603897211398,
|
789 |
+
"rougeL": 0.09174745116852957,
|
790 |
+
"score": 0.09174745116852957,
|
791 |
+
"score_name": "rougeL",
|
792 |
+
"rouge2": 0.018830792500723063,
|
793 |
+
"rougeLsum": 0.1046960824796928,
|
794 |
+
"rouge1_ci_low": 0.1226586252529639,
|
795 |
+
"rouge1_ci_high": 0.13355174728344751,
|
796 |
+
"rougeL_ci_low": 0.08777783885967977,
|
797 |
+
"rougeL_ci_high": 0.09513486417451719,
|
798 |
+
"score_ci_low": 0.08777783885967977,
|
799 |
+
"score_ci_high": 0.09513486417451719,
|
800 |
+
"rouge2_ci_low": 0.01683460814089518,
|
801 |
+
"rouge2_ci_high": 0.020698285797084142,
|
802 |
+
"rougeLsum_ci_low": 0.10022616921218402,
|
803 |
+
"rougeLsum_ci_high": 0.10887661841550207
|
804 |
+
},
|
805 |
+
"score": 0.2015236735269907,
|
806 |
+
"score_name": "subsets_mean",
|
807 |
+
"num_of_instances": 1528
|
808 |
+
},
|
809 |
+
"translation": {
|
810 |
+
"mt_flores_101_ara_eng": {
|
811 |
+
"num_of_instances": 66,
|
812 |
+
"counts": [
|
813 |
+
1275,
|
814 |
+
830,
|
815 |
+
586,
|
816 |
+
417
|
817 |
+
],
|
818 |
+
"totals": [
|
819 |
+
1807,
|
820 |
+
1741,
|
821 |
+
1675,
|
822 |
+
1609
|
823 |
+
],
|
824 |
+
"precisions": [
|
825 |
+
0.7055893746541229,
|
826 |
+
0.47673750717978175,
|
827 |
+
0.34985074626865675,
|
828 |
+
0.2591671845866998
|
829 |
+
],
|
830 |
+
"bp": 1.0,
|
831 |
+
"sys_len": 1807,
|
832 |
+
"ref_len": 1734,
|
833 |
+
"sacrebleu": 0.41790112689604164,
|
834 |
+
"score": 0.41790112689604164,
|
835 |
+
"score_name": "sacrebleu",
|
836 |
+
"score_ci_low": 0.37467076425053164,
|
837 |
+
"score_ci_high": 0.46212421285570143,
|
838 |
+
"sacrebleu_ci_low": 0.37467076425053164,
|
839 |
+
"sacrebleu_ci_high": 0.46212421285570143
|
840 |
+
},
|
841 |
+
"mt_flores_101_deu_eng": {
|
842 |
+
"num_of_instances": 66,
|
843 |
+
"counts": [
|
844 |
+
1296,
|
845 |
+
865,
|
846 |
+
615,
|
847 |
+
440
|
848 |
+
],
|
849 |
+
"totals": [
|
850 |
+
1821,
|
851 |
+
1755,
|
852 |
+
1689,
|
853 |
+
1623
|
854 |
+
],
|
855 |
+
"precisions": [
|
856 |
+
0.7116968698517299,
|
857 |
+
0.49287749287749283,
|
858 |
+
0.36412078152753113,
|
859 |
+
0.2711028958718423
|
860 |
+
],
|
861 |
+
"bp": 1.0,
|
862 |
+
"sys_len": 1821,
|
863 |
+
"ref_len": 1734,
|
864 |
+
"sacrebleu": 0.4313734544798424,
|
865 |
+
"score": 0.4313734544798424,
|
866 |
+
"score_name": "sacrebleu",
|
867 |
+
"score_ci_low": 0.39011475848676946,
|
868 |
+
"score_ci_high": 0.46975242344141427,
|
869 |
+
"sacrebleu_ci_low": 0.39011475848676946,
|
870 |
+
"sacrebleu_ci_high": 0.46975242344141427
|
871 |
+
},
|
872 |
+
"mt_flores_101_eng_ara": {
|
873 |
+
"num_of_instances": 66,
|
874 |
+
"counts": [
|
875 |
+
963,
|
876 |
+
569,
|
877 |
+
359,
|
878 |
+
232
|
879 |
+
],
|
880 |
+
"totals": [
|
881 |
+
1592,
|
882 |
+
1526,
|
883 |
+
1460,
|
884 |
+
1394
|
885 |
+
],
|
886 |
+
"precisions": [
|
887 |
+
0.6048994974874372,
|
888 |
+
0.372870249017038,
|
889 |
+
0.24589041095890413,
|
890 |
+
0.16642754662840747
|
891 |
+
],
|
892 |
+
"bp": 1.0,
|
893 |
+
"sys_len": 1592,
|
894 |
+
"ref_len": 1589,
|
895 |
+
"sacrebleu": 0.3099573506400797,
|
896 |
+
"score": 0.3099573506400797,
|
897 |
+
"score_name": "sacrebleu",
|
898 |
+
"score_ci_low": 0.2802812834383563,
|
899 |
+
"score_ci_high": 0.34144542695693814,
|
900 |
+
"sacrebleu_ci_low": 0.2802812834383563,
|
901 |
+
"sacrebleu_ci_high": 0.34144542695693814
|
902 |
+
},
|
903 |
+
"mt_flores_101_eng_deu": {
|
904 |
+
"num_of_instances": 66,
|
905 |
+
"counts": [
|
906 |
+
1260,
|
907 |
+
822,
|
908 |
+
589,
|
909 |
+
442
|
910 |
+
],
|
911 |
+
"totals": [
|
912 |
+
1834,
|
913 |
+
1768,
|
914 |
+
1702,
|
915 |
+
1636
|
916 |
+
],
|
917 |
+
"precisions": [
|
918 |
+
0.6870229007633588,
|
919 |
+
0.4649321266968326,
|
920 |
+
0.34606345475910694,
|
921 |
+
0.2701711491442543
|
922 |
+
],
|
923 |
+
"bp": 0.9994548923547389,
|
924 |
+
"sys_len": 1834,
|
925 |
+
"ref_len": 1835,
|
926 |
+
"sacrebleu": 0.41548186092135636,
|
927 |
+
"score": 0.41548186092135636,
|
928 |
+
"score_name": "sacrebleu",
|
929 |
+
"score_ci_low": 0.3641424966826166,
|
930 |
+
"score_ci_high": 0.46838012385794353,
|
931 |
+
"sacrebleu_ci_low": 0.3641424966826166,
|
932 |
+
"sacrebleu_ci_high": 0.46838012385794353
|
933 |
+
},
|
934 |
+
"mt_flores_101_eng_fra": {
|
935 |
+
"num_of_instances": 66,
|
936 |
+
"counts": [
|
937 |
+
1594,
|
938 |
+
1232,
|
939 |
+
984,
|
940 |
+
810
|
941 |
+
],
|
942 |
+
"totals": [
|
943 |
+
2012,
|
944 |
+
1946,
|
945 |
+
1880,
|
946 |
+
1814
|
947 |
+
],
|
948 |
+
"precisions": [
|
949 |
+
0.7922465208747514,
|
950 |
+
0.6330935251798561,
|
951 |
+
0.5234042553191489,
|
952 |
+
0.44652701212789414
|
953 |
+
],
|
954 |
+
"bp": 0.9725507672852267,
|
955 |
+
"sys_len": 2012,
|
956 |
+
"ref_len": 2068,
|
957 |
+
"sacrebleu": 0.5690698546093208,
|
958 |
+
"score": 0.5690698546093208,
|
959 |
+
"score_name": "sacrebleu",
|
960 |
+
"score_ci_low": 0.52882560047152,
|
961 |
+
"score_ci_high": 0.6139798297379135,
|
962 |
+
"sacrebleu_ci_low": 0.52882560047152,
|
963 |
+
"sacrebleu_ci_high": 0.6139798297379135
|
964 |
+
},
|
965 |
+
"mt_flores_101_eng_kor": {
|
966 |
+
"num_of_instances": 66,
|
967 |
+
"counts": [
|
968 |
+
1376,
|
969 |
+
781,
|
970 |
+
495,
|
971 |
+
326
|
972 |
+
],
|
973 |
+
"totals": [
|
974 |
+
2361,
|
975 |
+
2295,
|
976 |
+
2229,
|
977 |
+
2163
|
978 |
+
],
|
979 |
+
"precisions": [
|
980 |
+
0.5828038966539602,
|
981 |
+
0.34030501089324616,
|
982 |
+
0.22207267833109018,
|
983 |
+
0.15071659731853906
|
984 |
+
],
|
985 |
+
"bp": 1.0,
|
986 |
+
"sys_len": 2361,
|
987 |
+
"ref_len": 2235,
|
988 |
+
"sacrebleu": 0.28543797421890393,
|
989 |
+
"score": 0.28543797421890393,
|
990 |
+
"score_name": "sacrebleu",
|
991 |
+
"score_ci_low": 0.2567799199475619,
|
992 |
+
"score_ci_high": 0.31814664829371603,
|
993 |
+
"sacrebleu_ci_low": 0.2567799199475619,
|
994 |
+
"sacrebleu_ci_high": 0.31814664829371603
|
995 |
+
},
|
996 |
+
"mt_flores_101_eng_por": {
|
997 |
+
"num_of_instances": 66,
|
998 |
+
"counts": [
|
999 |
+
1451,
|
1000 |
+
1053,
|
1001 |
+
814,
|
1002 |
+
640
|
1003 |
+
],
|
1004 |
+
"totals": [
|
1005 |
+
1898,
|
1006 |
+
1832,
|
1007 |
+
1766,
|
1008 |
+
1700
|
1009 |
+
],
|
1010 |
+
"precisions": [
|
1011 |
+
0.7644889357218125,
|
1012 |
+
0.5747816593886462,
|
1013 |
+
0.46092865232163077,
|
1014 |
+
0.3764705882352941
|
1015 |
+
],
|
1016 |
+
"bp": 0.9905611611284771,
|
1017 |
+
"sys_len": 1898,
|
1018 |
+
"ref_len": 1916,
|
1019 |
+
"sacrebleu": 0.52052430367519,
|
1020 |
+
"score": 0.52052430367519,
|
1021 |
+
"score_name": "sacrebleu",
|
1022 |
+
"score_ci_low": 0.46893066617087675,
|
1023 |
+
"score_ci_high": 0.5524273546032454,
|
1024 |
+
"sacrebleu_ci_low": 0.46893066617087675,
|
1025 |
+
"sacrebleu_ci_high": 0.5524273546032454
|
1026 |
+
},
|
1027 |
+
"mt_flores_101_eng_ron": {
|
1028 |
+
"num_of_instances": 66,
|
1029 |
+
"counts": [
|
1030 |
+
1416,
|
1031 |
+
1021,
|
1032 |
+
771,
|
1033 |
+
581
|
1034 |
+
],
|
1035 |
+
"totals": [
|
1036 |
+
1945,
|
1037 |
+
1879,
|
1038 |
+
1813,
|
1039 |
+
1747
|
1040 |
+
],
|
1041 |
+
"precisions": [
|
1042 |
+
0.7280205655526992,
|
1043 |
+
0.5433741351782863,
|
1044 |
+
0.42526199669056813,
|
1045 |
+
0.3325701202060675
|
1046 |
+
],
|
1047 |
+
"bp": 0.9979455579909386,
|
1048 |
+
"sys_len": 1945,
|
1049 |
+
"ref_len": 1949,
|
1050 |
+
"sacrebleu": 0.4853471146871033,
|
1051 |
+
"score": 0.4853471146871033,
|
1052 |
+
"score_name": "sacrebleu",
|
1053 |
+
"score_ci_low": 0.45737023291404577,
|
1054 |
+
"score_ci_high": 0.5267894003889202,
|
1055 |
+
"sacrebleu_ci_low": 0.45737023291404577,
|
1056 |
+
"sacrebleu_ci_high": 0.5267894003889202
|
1057 |
+
},
|
1058 |
+
"mt_flores_101_eng_spa": {
|
1059 |
+
"num_of_instances": 66,
|
1060 |
+
"counts": [
|
1061 |
+
1269,
|
1062 |
+
719,
|
1063 |
+
439,
|
1064 |
+
270
|
1065 |
+
],
|
1066 |
+
"totals": [
|
1067 |
+
1972,
|
1068 |
+
1906,
|
1069 |
+
1840,
|
1070 |
+
1774
|
1071 |
+
],
|
1072 |
+
"precisions": [
|
1073 |
+
0.6435091277890467,
|
1074 |
+
0.3772298006295907,
|
1075 |
+
0.23858695652173914,
|
1076 |
+
0.15219842164599773
|
1077 |
+
],
|
1078 |
+
"bp": 0.9381039423957293,
|
1079 |
+
"sys_len": 1972,
|
1080 |
+
"ref_len": 2098,
|
1081 |
+
"sacrebleu": 0.28744539070468517,
|
1082 |
+
"score": 0.28744539070468517,
|
1083 |
+
"score_name": "sacrebleu",
|
1084 |
+
"score_ci_low": 0.2566627625544366,
|
1085 |
+
"score_ci_high": 0.3146920359001326,
|
1086 |
+
"sacrebleu_ci_low": 0.2566627625544366,
|
1087 |
+
"sacrebleu_ci_high": 0.3146920359001326
|
1088 |
+
},
|
1089 |
+
"mt_flores_101_fra_eng": {
|
1090 |
+
"num_of_instances": 66,
|
1091 |
+
"counts": [
|
1092 |
+
1324,
|
1093 |
+
920,
|
1094 |
+
666,
|
1095 |
+
477
|
1096 |
+
],
|
1097 |
+
"totals": [
|
1098 |
+
1893,
|
1099 |
+
1827,
|
1100 |
+
1761,
|
1101 |
+
1695
|
1102 |
+
],
|
1103 |
+
"precisions": [
|
1104 |
+
0.6994189117802431,
|
1105 |
+
0.5035577449370553,
|
1106 |
+
0.3781942078364565,
|
1107 |
+
0.2814159292035398
|
1108 |
+
],
|
1109 |
+
"bp": 1.0,
|
1110 |
+
"sys_len": 1893,
|
1111 |
+
"ref_len": 1734,
|
1112 |
+
"sacrebleu": 0.44001000355221576,
|
1113 |
+
"score": 0.44001000355221576,
|
1114 |
+
"score_name": "sacrebleu",
|
1115 |
+
"score_ci_low": 0.40032693502671896,
|
1116 |
+
"score_ci_high": 0.4786325473299439,
|
1117 |
+
"sacrebleu_ci_low": 0.40032693502671896,
|
1118 |
+
"sacrebleu_ci_high": 0.4786325473299439
|
1119 |
+
},
|
1120 |
+
"mt_flores_101_jpn_eng": {
|
1121 |
+
"num_of_instances": 66,
|
1122 |
+
"counts": [
|
1123 |
+
1135,
|
1124 |
+
622,
|
1125 |
+
371,
|
1126 |
+
231
|
1127 |
+
],
|
1128 |
+
"totals": [
|
1129 |
+
2011,
|
1130 |
+
1945,
|
1131 |
+
1879,
|
1132 |
+
1813
|
1133 |
+
],
|
1134 |
+
"precisions": [
|
1135 |
+
0.564395822973645,
|
1136 |
+
0.3197943444730077,
|
1137 |
+
0.19744544970729114,
|
1138 |
+
0.1274131274131274
|
1139 |
+
],
|
1140 |
+
"bp": 1.0,
|
1141 |
+
"sys_len": 2011,
|
1142 |
+
"ref_len": 1734,
|
1143 |
+
"sacrebleu": 0.259584626709523,
|
1144 |
+
"score": 0.259584626709523,
|
1145 |
+
"score_name": "sacrebleu",
|
1146 |
+
"score_ci_low": 0.22697300814650573,
|
1147 |
+
"score_ci_high": 0.2908889327642379,
|
1148 |
+
"sacrebleu_ci_low": 0.22697300814650573,
|
1149 |
+
"sacrebleu_ci_high": 0.2908889327642379
|
1150 |
+
},
|
1151 |
+
"mt_flores_101_kor_eng": {
|
1152 |
+
"num_of_instances": 66,
|
1153 |
+
"counts": [
|
1154 |
+
1120,
|
1155 |
+
635,
|
1156 |
+
400,
|
1157 |
+
262
|
1158 |
+
],
|
1159 |
+
"totals": [
|
1160 |
+
1861,
|
1161 |
+
1795,
|
1162 |
+
1729,
|
1163 |
+
1663
|
1164 |
+
],
|
1165 |
+
"precisions": [
|
1166 |
+
0.6018269747447609,
|
1167 |
+
0.3537604456824513,
|
1168 |
+
0.2313475997686524,
|
1169 |
+
0.15754660252555622
|
1170 |
+
],
|
1171 |
+
"bp": 1.0,
|
1172 |
+
"sys_len": 1861,
|
1173 |
+
"ref_len": 1734,
|
1174 |
+
"sacrebleu": 0.29679989332755485,
|
1175 |
+
"score": 0.29679989332755485,
|
1176 |
+
"score_name": "sacrebleu",
|
1177 |
+
"score_ci_low": 0.2606890146882507,
|
1178 |
+
"score_ci_high": 0.34577889084499225,
|
1179 |
+
"sacrebleu_ci_low": 0.2606890146882507,
|
1180 |
+
"sacrebleu_ci_high": 0.34577889084499225
|
1181 |
+
},
|
1182 |
+
"mt_flores_101_por_eng": {
|
1183 |
+
"num_of_instances": 66,
|
1184 |
+
"counts": [
|
1185 |
+
1324,
|
1186 |
+
947,
|
1187 |
+
723,
|
1188 |
+
555
|
1189 |
+
],
|
1190 |
+
"totals": [
|
1191 |
+
1810,
|
1192 |
+
1744,
|
1193 |
+
1678,
|
1194 |
+
1612
|
1195 |
+
],
|
1196 |
+
"precisions": [
|
1197 |
+
0.7314917127071823,
|
1198 |
+
0.5430045871559632,
|
1199 |
+
0.4308700834326579,
|
1200 |
+
0.3442928039702233
|
1201 |
+
],
|
1202 |
+
"bp": 1.0,
|
1203 |
+
"sys_len": 1810,
|
1204 |
+
"ref_len": 1734,
|
1205 |
+
"sacrebleu": 0.49268778913543754,
|
1206 |
+
"score": 0.49268778913543754,
|
1207 |
+
"score_name": "sacrebleu",
|
1208 |
+
"score_ci_low": 0.40671829932229786,
|
1209 |
+
"score_ci_high": 0.5323333720792753,
|
1210 |
+
"sacrebleu_ci_low": 0.40671829932229786,
|
1211 |
+
"sacrebleu_ci_high": 0.5323333720792753
|
1212 |
+
},
|
1213 |
+
"mt_flores_101_ron_eng": {
|
1214 |
+
"num_of_instances": 66,
|
1215 |
+
"counts": [
|
1216 |
+
1324,
|
1217 |
+
922,
|
1218 |
+
663,
|
1219 |
+
481
|
1220 |
+
],
|
1221 |
+
"totals": [
|
1222 |
+
1831,
|
1223 |
+
1765,
|
1224 |
+
1699,
|
1225 |
+
1633
|
1226 |
+
],
|
1227 |
+
"precisions": [
|
1228 |
+
0.7231021299836154,
|
1229 |
+
0.5223796033994335,
|
1230 |
+
0.3902295467922307,
|
1231 |
+
0.2945499081445193
|
1232 |
+
],
|
1233 |
+
"bp": 1.0,
|
1234 |
+
"sys_len": 1831,
|
1235 |
+
"ref_len": 1734,
|
1236 |
+
"sacrebleu": 0.4564741865050132,
|
1237 |
+
"score": 0.4564741865050132,
|
1238 |
+
"score_name": "sacrebleu",
|
1239 |
+
"score_ci_low": 0.38825961035825357,
|
1240 |
+
"score_ci_high": 0.5021079213683431,
|
1241 |
+
"sacrebleu_ci_low": 0.38825961035825357,
|
1242 |
+
"sacrebleu_ci_high": 0.5021079213683431
|
1243 |
+
},
|
1244 |
+
"mt_flores_101_spa_eng": {
|
1245 |
+
"num_of_instances": 66,
|
1246 |
+
"counts": [
|
1247 |
+
1203,
|
1248 |
+
678,
|
1249 |
+
414,
|
1250 |
+
248
|
1251 |
+
],
|
1252 |
+
"totals": [
|
1253 |
+
1924,
|
1254 |
+
1858,
|
1255 |
+
1792,
|
1256 |
+
1726
|
1257 |
+
],
|
1258 |
+
"precisions": [
|
1259 |
+
0.6252598752598753,
|
1260 |
+
0.36490850376749195,
|
1261 |
+
0.23102678571428573,
|
1262 |
+
0.1436848203939745
|
1263 |
+
],
|
1264 |
+
"bp": 1.0,
|
1265 |
+
"sys_len": 1924,
|
1266 |
+
"ref_len": 1734,
|
1267 |
+
"sacrebleu": 0.2950050444470296,
|
1268 |
+
"score": 0.2950050444470296,
|
1269 |
+
"score_name": "sacrebleu",
|
1270 |
+
"score_ci_low": 0.25511324454078954,
|
1271 |
+
"score_ci_high": 0.33543848071805,
|
1272 |
+
"sacrebleu_ci_low": 0.25511324454078954,
|
1273 |
+
"sacrebleu_ci_high": 0.33543848071805
|
1274 |
+
},
|
1275 |
+
"score": 0.3975399983006198,
|
1276 |
+
"score_name": "subsets_mean",
|
1277 |
+
"num_of_instances": 990
|
1278 |
+
},
|
1279 |
+
"score": 0.5386099714179504,
|
1280 |
+
"score_name": "subsets_mean",
|
1281 |
+
"num_of_instances": 12472
|
1282 |
+
}
|
1283 |
+
}
|
results/bluebench/2025-06-19T21-59-04_evaluation_results.json
ADDED
@@ -0,0 +1,1283 @@
|
1 + {
2 + "environment_info": {
3 + "timestamp_utc": "2025-06-20T01:59:00.198687Z",
4 + "command_line_invocation": [
5 + "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6 + "--tasks",
7 + "benchmarks.bluebench",
8 + "--model",
9 + "cross_provider",
10 + "--model_args",
11 + "model_name=watsonx/mistralai/pixtral-12b,max_tokens=256",
12 + "--output_path",
13 + "./results/bluebench",
14 + "--log_samples",
15 + "--trust_remote_code",
16 + "--batch_size",
17 + "8",
18 + "--verbosity",
19 + "ERROR"
20 + ],
21 + "parsed_arguments": {
22 + "tasks": [
23 + "benchmarks.bluebench"
24 + ],
25 + "split": "test",
26 + "num_fewshots": null,
27 + "limit": null,
28 + "batch_size": 8,
29 + "model": "watsonx/mistralai/pixtral-12b",
30 + "model_args": {
31 + "max_tokens": 256
32 + },
33 + "gen_kwargs": null,
34 + "chat_template_kwargs": null,
35 + "output_path": "./results/bluebench",
36 + "output_file_prefix": "evaluation_results",
37 + "log_samples": true,
38 + "verbosity": "ERROR",
39 + "apply_chat_template": false,
40 + "trust_remote_code": true,
41 + "disable_hf_cache": false,
42 + "cache_dir": null
43 + },
44 + "unitxt_version": "1.24.0",
45 + "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46 + "python_version": "3.10.18",
47 + "system": "Linux",
48 + "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49 + "installed_packages": {
50 + "nvidia-cufile-cu12": "1.11.1.6",
51 + "triton": "3.3.1",
52 + "nltk": "3.9.1",
53 + "anyio": "4.9.0",
54 + "absl-py": "2.3.0",
55 + "tiktoken": "0.9.0",
56 + "charset-normalizer": "3.4.2",
57 + "nvidia-cuda-runtime-cu12": "12.6.77",
58 + "sympy": "1.14.0",
59 + "mecab-ko": "1.0.1",
60 + "litellm": "1.72.6.post1",
61 + "httpcore": "1.0.9",
62 + "Jinja2": "3.1.6",
63 + "jsonschema-specifications": "2025.4.1",
64 + "pydantic_core": "2.33.2",
65 + "nvidia-cusparse-cu12": "12.5.4.2",
66 + "yarl": "1.20.1",
67 + "openai": "1.88.0",
68 + "portalocker": "3.2.0",
69 + "pandas": "2.3.0",
70 + "multiprocess": "0.70.16",
71 + "jsonschema": "4.24.0",
72 + "unitxt": "1.24.0",
73 + "nvidia-nvjitlink-cu12": "12.6.85",
74 + "nvidia-cublas-cu12": "12.6.4.1",
75 + "pydantic": "2.11.7",
76 + "async-timeout": "5.0.1",
77 + "annotated-types": "0.7.0",
78 + "rouge_score": "0.1.2",
79 + "contourpy": "1.3.2",
80 + "aiosignal": "1.3.2",
81 + "nvidia-cuda-cupti-cu12": "12.6.80",
82 + "pillow": "11.2.1",
83 + "six": "1.17.0",
84 + "diskcache": "5.6.3",
85 + "tqdm": "4.67.1",
86 + "pyarrow": "20.0.0",
87 + "h11": "0.16.0",
88 + "zipp": "3.19.2",
89 + "tzdata": "2025.2",
90 + "bert-score": "0.3.13",
91 + "setuptools": "80.9.0",
92 + "referencing": "0.36.2",
93 + "sacrebleu": "2.5.1",
94 + "filelock": "3.18.0",
95 + "urllib3": "2.5.0",
96 + "scipy": "1.15.3",
97 + "nvidia-nccl-cu12": "2.26.2",
98 + "kiwisolver": "1.4.8",
99 + "networkx": "3.4.2",
100 + "typing-inspection": "0.4.1",
101 + "lxml": "5.4.0",
102 + "sniffio": "1.3.1",
103 + "scikit-learn": "1.7.0",
104 + "nvidia-curand-cu12": "10.3.7.77",
105 + "pip": "25.1.1",
106 + "fonttools": "4.58.4",
107 + "transformers": "4.52.4",
108 + "datasets": "3.6.0",
109 + "nvidia-cusolver-cu12": "11.7.1.2",
110 + "cycler": "0.12.1",
111 + "evaluate": "0.4.3",
112 + "distro": "1.9.0",
113 + "idna": "3.10",
114 + "MarkupSafe": "3.0.2",
115 + "frozenlist": "1.7.0",
116 + "pyparsing": "3.2.3",
117 + "jiter": "0.10.0",
118 + "importlib_metadata": "8.0.0",
119 + "packaging": "24.2",
120 + "psutil": "7.0.0",
121 + "mecab-ko-dic": "1.0.0",
122 + "joblib": "1.5.1",
123 + "fsspec": "2025.3.0",
124 + "dill": "0.3.8",
125 + "tokenizers": "0.21.1",
126 + "wheel": "0.45.1",
127 + "nvidia-nvtx-cu12": "12.6.77",
128 + "nvidia-cusparselt-cu12": "0.6.3",
129 + "hf-xet": "1.1.4",
130 + "propcache": "0.3.2",
131 + "numpy": "2.2.6",
132 + "mpmath": "1.3.0",
133 + "multidict": "6.5.0",
134 + "conllu": "6.0.0",
135 + "safetensors": "0.5.3",
136 + "requests": "2.32.4",
137 + "regex": "2024.11.6",
138 + "aiohttp": "3.12.13",
139 + "tabulate": "0.9.0",
140 + "certifi": "2025.6.15",
141 + "accelerate": "1.8.0",
142 + "nvidia-cufft-cu12": "11.3.0.4",
143 + "nvidia-cuda-nvrtc-cu12": "12.6.77",
144 + "click": "8.2.1",
145 + "typing_extensions": "4.12.2",
146 + "attrs": "25.3.0",
147 + "exceptiongroup": "1.3.0",
148 + "tenacity": "9.1.2",
149 + "pytz": "2025.2",
150 + "aiohappyeyeballs": "2.6.1",
151 + "python-dateutil": "2.9.0.post0",
152 + "torch": "2.7.1",
153 + "python-dotenv": "1.1.0",
154 + "httpx": "0.28.1",
155 + "matplotlib": "3.10.3",
156 + "xxhash": "3.5.0",
157 + "PyYAML": "6.0.2",
158 + "huggingface-hub": "0.33.0",
159 + "colorama": "0.4.6",
160 + "rpds-py": "0.25.1",
161 + "threadpoolctl": "3.6.0",
162 + "nvidia-cudnn-cu12": "9.5.1.17",
163 + "jaraco.collections": "5.1.0",
164 + "tomli": "2.0.1",
165 + "backports.tarfile": "1.2.0",
166 + "jaraco.context": "5.3.0",
167 + "typeguard": "4.3.0",
168 + "autocommand": "2.2.2",
169 + "jaraco.text": "3.12.1",
170 + "more-itertools": "10.3.0",
171 + "platformdirs": "4.2.2",
172 + "inflect": "7.3.1",
173 + "jaraco.functools": "4.0.1"
174 + }
175 + },
176 + "results": {
177 + "bias": {
178 + "safety_bbq_age": {
179 + "accuracy": 0.5333333333333333,
180 + "accuracy_ci_low": 0.43333333333333335,
181 + "accuracy_ci_high": 0.6444444444444445,
182 + "score_name": "accuracy",
183 + "score": 0.5333333333333333,
184 + "score_ci_high": 0.6444444444444445,
185 + "score_ci_low": 0.43333333333333335,
186 + "num_of_instances": 90
187 + },
188 + "safety_bbq_disability_status": {
189 + "accuracy": 0.5555555555555556,
190 + "accuracy_ci_low": 0.45555555555555555,
191 + "accuracy_ci_high": 0.6555555555555556,
192 + "score_name": "accuracy",
193 + "score": 0.5555555555555556,
194 + "score_ci_high": 0.6555555555555556,
195 + "score_ci_low": 0.45555555555555555,
196 + "num_of_instances": 90
197 + },
198 + "safety_bbq_gender_identity": {
199 + "accuracy": 0.8777777777777778,
200 + "accuracy_ci_low": 0.8,
201 + "accuracy_ci_high": 0.9333333333333333,
202 + "score_name": "accuracy",
203 + "score": 0.8777777777777778,
204 + "score_ci_high": 0.9333333333333333,
205 + "score_ci_low": 0.8,
206 + "num_of_instances": 90
207 + },
208 + "safety_bbq_nationality": {
209 + "accuracy": 0.5777777777777777,
210 + "accuracy_ci_low": 0.4666666666666667,
211 + "accuracy_ci_high": 0.6777777777777778,
212 + "score_name": "accuracy",
213 + "score": 0.5777777777777777,
214 + "score_ci_high": 0.6777777777777778,
215 + "score_ci_low": 0.4666666666666667,
216 + "num_of_instances": 90
217 + },
218 + "safety_bbq_physical_appearance": {
219 + "accuracy": 0.6111111111111112,
220 + "accuracy_ci_low": 0.5111111111111111,
221 + "accuracy_ci_high": 0.7,
222 + "score_name": "accuracy",
223 + "score": 0.6111111111111112,
224 + "score_ci_high": 0.7,
225 + "score_ci_low": 0.5111111111111111,
226 + "num_of_instances": 90
227 + },
228 + "safety_bbq_race_ethnicity": {
229 + "accuracy": 0.9888888888888889,
230 + "accuracy_ci_low": 0.9407763312346947,
231 + "accuracy_ci_high": 1.0,
232 + "score_name": "accuracy",
233 + "score": 0.9888888888888889,
234 + "score_ci_high": 1.0,
235 + "score_ci_low": 0.9407763312346947,
236 + "num_of_instances": 90
237 + },
238 + "safety_bbq_race_x_gender": {
239 + "accuracy": 0.8,
240 + "accuracy_ci_low": 0.7111111111111111,
241 + "accuracy_ci_high": 0.8666666666666667,
242 + "score_name": "accuracy",
243 + "score": 0.8,
244 + "score_ci_high": 0.8666666666666667,
245 + "score_ci_low": 0.7111111111111111,
246 + "num_of_instances": 90
247 + },
248 + "safety_bbq_race_x_ses": {
249 + "accuracy": 0.6333333333333333,
250 + "accuracy_ci_low": 0.5222222222222223,
251 + "accuracy_ci_high": 0.7222222222222222,
252 + "score_name": "accuracy",
253 + "score": 0.6333333333333333,
254 + "score_ci_high": 0.7222222222222222,
255 + "score_ci_low": 0.5222222222222223,
256 + "num_of_instances": 90
257 + },
258 + "safety_bbq_religion": {
259 + "accuracy": 0.6444444444444445,
260 + "accuracy_ci_low": 0.5333333333333333,
261 + "accuracy_ci_high": 0.7444444444444445,
262 + "score_name": "accuracy",
263 + "score": 0.6444444444444445,
264 + "score_ci_high": 0.7444444444444445,
265 + "score_ci_low": 0.5333333333333333,
266 + "num_of_instances": 90
267 + },
268 + "safety_bbq_ses": {
269 + "accuracy": 0.6555555555555556,
270 + "accuracy_ci_low": 0.5555555555555556,
271 + "accuracy_ci_high": 0.7505365189670177,
272 + "score_name": "accuracy",
273 + "score": 0.6555555555555556,
274 + "score_ci_high": 0.7505365189670177,
275 + "score_ci_low": 0.5555555555555556,
276 + "num_of_instances": 90
277 + },
278 + "safety_bbq_sexual_orientation": {
279 + "accuracy": 0.8666666666666667,
280 + "accuracy_ci_low": 0.7888888888888889,
281 + "accuracy_ci_high": 0.9333333333333333,
282 + "score_name": "accuracy",
283 + "score": 0.8666666666666667,
284 + "score_ci_high": 0.9333333333333333,
285 + "score_ci_low": 0.7888888888888889,
286 + "num_of_instances": 90
287 + },
288 + "score": 0.704040404040404,
289 + "score_name": "subsets_mean",
290 + "num_of_instances": 990
291 + },
292 + "chatbot_abilities": {
293 + "arena_hard_generation_english_gpt_4_0314_reference": {
294 + "num_of_instances": 500,
295 + "llama_3_70b_instruct_template_arena_hard": 0.5,
296 + "score": 0.5,
297 + "score_name": "llama_3_70b_instruct_template_arena_hard"
298 + },
299 + "score": 0.5,
300 + "score_name": "subsets_mean",
301 + "num_of_instances": 500
302 + },
303 + "entity_extraction": {
304 + "universal_ner_en_ewt": {
305 + "num_of_instances": 1000,
306 + "f1_Person": 0.2839506172839506,
307 + "f1_Organization": 0.260586319218241,
308 + "f1_Location": 0.19917012448132781,
309 + "f1_macro": 0.24790235366117316,
310 + "recall_macro": 0.2073423475003688,
311 + "precision_macro": 0.3173997367545755,
312 + "in_classes_support": 0.47339699863574347,
313 + "f1_micro": 0.17488076311605724,
314 + "recall_micro": 0.20952380952380953,
315 + "precision_micro": 0.15006821282401092,
316 + "score": 0.17488076311605724,
317 + "score_name": "f1_micro",
318 + "score_ci_low": 0.14029219675059787,
319 + "score_ci_high": 0.2240183994224699,
320 + "f1_micro_ci_low": 0.14029219675059787,
321 + "f1_micro_ci_high": 0.2240183994224699
322 + },
323 + "score": 0.17488076311605724,
324 + "score_name": "subsets_mean",
325 + "num_of_instances": 1000
326 + },
327 + "knowledge": {
328 + "mmlu_pro_biology": {
329 + "accuracy": 0.5633802816901409,
330 + "accuracy_ci_low": 0.43661971830985913,
331 + "accuracy_ci_high": 0.676056338028169,
332 + "score_name": "accuracy",
333 + "score": 0.5633802816901409,
334 + "score_ci_high": 0.676056338028169,
335 + "score_ci_low": 0.43661971830985913,
336 + "num_of_instances": 71
337 + },
338 + "mmlu_pro_business": {
339 + "accuracy": 0.15492957746478872,
340 + "accuracy_ci_low": 0.08450704225352113,
341 + "accuracy_ci_high": 0.2535211267605634,
342 + "score_name": "accuracy",
343 + "score": 0.15492957746478872,
344 + "score_ci_high": 0.2535211267605634,
345 + "score_ci_low": 0.08450704225352113,
346 + "num_of_instances": 71
347 + },
348 + "mmlu_pro_chemistry": {
349 + "accuracy": 0.08450704225352113,
350 + "accuracy_ci_low": 0.028169014084507043,
351 + "accuracy_ci_high": 0.18309859154929578,
352 + "score_name": "accuracy",
353 + "score": 0.08450704225352113,
354 + "score_ci_high": 0.18309859154929578,
355 + "score_ci_low": 0.028169014084507043,
356 + "num_of_instances": 71
357 + },
358 + "mmlu_pro_computer_science": {
359 + "accuracy": 0.3380281690140845,
360 + "accuracy_ci_low": 0.22535211267605634,
361 + "accuracy_ci_high": 0.4507042253521127,
362 + "score_name": "accuracy",
363 + "score": 0.3380281690140845,
364 + "score_ci_high": 0.4507042253521127,
365 + "score_ci_low": 0.22535211267605634,
366 + "num_of_instances": 71
367 + },
368 + "mmlu_pro_economics": {
369 + "accuracy": 0.5633802816901409,
370 + "accuracy_ci_low": 0.43661971830985913,
371 + "accuracy_ci_high": 0.676056338028169,
372 + "score_name": "accuracy",
373 + "score": 0.5633802816901409,
374 + "score_ci_high": 0.676056338028169,
375 + "score_ci_low": 0.43661971830985913,
376 + "num_of_instances": 71
377 + },
378 + "mmlu_pro_engineering": {
379 + "accuracy": 0.14084507042253522,
380 + "accuracy_ci_low": 0.07042253521126761,
381 + "accuracy_ci_high": 0.23943661971830985,
382 + "score_name": "accuracy",
383 + "score": 0.14084507042253522,
384 + "score_ci_high": 0.23943661971830985,
385 + "score_ci_low": 0.07042253521126761,
386 + "num_of_instances": 71
387 + },
388 + "mmlu_pro_health": {
389 + "accuracy": 0.38028169014084506,
390 + "accuracy_ci_low": 0.2676056338028169,
391 + "accuracy_ci_high": 0.49295774647887325,
392 + "score_name": "accuracy",
393 + "score": 0.38028169014084506,
394 + "score_ci_high": 0.49295774647887325,
395 + "score_ci_low": 0.2676056338028169,
396 + "num_of_instances": 71
397 + },
398 + "mmlu_pro_history": {
399 + "accuracy": 0.49295774647887325,
400 + "accuracy_ci_low": 0.36619718309859156,
401 + "accuracy_ci_high": 0.6056338028169014,
402 + "score_name": "accuracy",
403 + "score": 0.49295774647887325,
404 + "score_ci_high": 0.6056338028169014,
405 + "score_ci_low": 0.36619718309859156,
406 + "num_of_instances": 71
407 + },
408 + "mmlu_pro_law": {
409 + "accuracy": 0.28169014084507044,
410 + "accuracy_ci_low": 0.18309859154929578,
411 + "accuracy_ci_high": 0.39436619718309857,
412 + "score_name": "accuracy",
413 + "score": 0.28169014084507044,
414 + "score_ci_high": 0.39436619718309857,
415 + "score_ci_low": 0.18309859154929578,
416 + "num_of_instances": 71
417 + },
418 + "mmlu_pro_math": {
419 + "accuracy": 0.056338028169014086,
420 + "accuracy_ci_low": 0.014084507042253521,
421 + "accuracy_ci_high": 0.1267605633802817,
422 + "score_name": "accuracy",
423 + "score": 0.056338028169014086,
424 + "score_ci_high": 0.1267605633802817,
425 + "score_ci_low": 0.014084507042253521,
426 + "num_of_instances": 71
427 + },
428 + "mmlu_pro_other": {
429 + "accuracy": 0.28169014084507044,
430 + "accuracy_ci_low": 0.19718309859154928,
431 + "accuracy_ci_high": 0.39436619718309857,
432 + "score_name": "accuracy",
433 + "score": 0.28169014084507044,
434 + "score_ci_high": 0.39436619718309857,
435 + "score_ci_low": 0.19718309859154928,
436 + "num_of_instances": 71
437 + },
438 + "mmlu_pro_philosophy": {
439 + "accuracy": 0.38028169014084506,
440 + "accuracy_ci_low": 0.2676056338028169,
441 + "accuracy_ci_high": 0.5095143645267136,
442 + "score_name": "accuracy",
443 + "score": 0.38028169014084506,
444 + "score_ci_high": 0.5095143645267136,
445 + "score_ci_low": 0.2676056338028169,
446 + "num_of_instances": 71
447 + },
448 + "mmlu_pro_physics": {
449 + "accuracy": 0.18309859154929578,
450 + "accuracy_ci_low": 0.09859154929577464,
451 + "accuracy_ci_high": 0.28169014084507044,
452 + "score_name": "accuracy",
453 + "score": 0.18309859154929578,
454 + "score_ci_high": 0.28169014084507044,
455 + "score_ci_low": 0.09859154929577464,
456 + "num_of_instances": 71
457 + },
458 + "mmlu_pro_psychology": {
459 + "accuracy": 0.5915492957746479,
460 + "accuracy_ci_low": 0.4647887323943662,
461 + "accuracy_ci_high": 0.704225352112676,
462 + "score_name": "accuracy",
463 + "score": 0.5915492957746479,
464 + "score_ci_high": 0.704225352112676,
465 + "score_ci_low": 0.4647887323943662,
466 + "num_of_instances": 71
467 + },
468 + "score": 0.3209255533199195,
469 + "score_name": "subsets_mean",
470 + "num_of_instances": 994
471 + },
472 + "legal": {
473 + "legalbench_abercrombie": {
474 + "f1_macro": 0.07819548872180451,
475 + "f1_suggestive": 0.0,
476 + "f1_generic": 0.0,
477 + "f1_fanciful": 0.10526315789473684,
478 + "f1_descriptive": 0.2857142857142857,
479 + "f1_arbitrary": 0.0,
480 + "f1_macro_ci_low": 0.029629629629629627,
481 + "f1_macro_ci_high": 0.1446114401751038,
482 + "score_name": "f1_micro",
483 + "score": 0.1,
484 + "score_ci_high": 0.1976990689891533,
485 + "score_ci_low": 0.0392156862745098,
486 + "num_of_instances": 85,
487 + "accuracy": 0.058823529411764705,
488 + "accuracy_ci_low": 0.023529411764705882,
489 + "accuracy_ci_high": 0.1261289751719794,
490 + "f1_micro": 0.1,
491 + "f1_micro_ci_low": 0.0392156862745098,
492 + "f1_micro_ci_high": 0.1976990689891533
493 + },
494 + "legalbench_corporate_lobbying": {
495 + "f1_macro": 0.4318713450292398,
496 + "f1_no": 0.5526315789473685,
497 + "f1_yes": 0.3111111111111111,
498 + "f1_macro_ci_low": 0.3568758383648559,
499 + "f1_macro_ci_high": 0.5105344080350164,
500 + "score_name": "f1_micro",
501 + "score": 0.48427672955974843,
502 + "score_ci_high": 0.553538495446083,
503 + "score_ci_low": 0.40855056637270504,
504 + "num_of_instances": 200,
505 + "accuracy": 0.385,
506 + "accuracy_ci_low": 0.32,
507 + "accuracy_ci_high": 0.45,
508 + "f1_micro": 0.48427672955974843,
509 + "f1_micro_ci_low": 0.40855056637270504,
510 + "f1_micro_ci_high": 0.553538495446083
511 + },
512 + "legalbench_function_of_decision_section": {
513 + "f1_macro": 0.07867608581894296,
514 + "f1_conclusion": 0.0,
515 + "f1_decree": 0.07692307692307693,
516 + "f1_issue": 0.0,
517 + "f1_analysis": 0.4166666666666667,
518 + "f1_facts": 0.0,
519 + "f1_procedural history": 0.0,
520 + "f1_rule": 0.05714285714285714,
521 + "f1_macro_ci_low": 0.04893392684609269,
522 + "f1_macro_ci_high": 0.12399894886334993,
523 + "score_name": "f1_micro",
524 + "score": 0.10619469026548672,
525 + "score_ci_high": 0.17107924198886906,
526 + "score_ci_low": 0.06000153614696125,
527 + "num_of_instances": 200,
528 + "accuracy": 0.06,
529 + "accuracy_ci_low": 0.035,
530 + "accuracy_ci_high": 0.1,
531 + "f1_micro": 0.10619469026548672,
532 + "f1_micro_ci_low": 0.06000153614696125,
533 + "f1_micro_ci_high": 0.17107924198886906
534 + },
535 + "legalbench_international_citizenship_questions": {
536 + "f1_macro": 0.31866564807741277,
537 + "f1_yes": 0.20952380952380953,
538 + "f1_no": 0.42780748663101603,
539 + "f1_macro_ci_low": 0.2502995617982514,
540 + "f1_macro_ci_high": 0.39353002888833133,
541 + "score_name": "f1_micro",
542 + "score": 0.3493150684931507,
543 + "score_ci_high": 0.42,
544 + "score_ci_low": 0.2701836639419085,
545 + "num_of_instances": 200,
546 + "accuracy": 0.255,
547 + "accuracy_ci_low": 0.195,
548 + "accuracy_ci_high": 0.315,
549 + "f1_micro": 0.3493150684931507,
550 + "f1_micro_ci_low": 0.2701836639419085,
551 + "f1_micro_ci_high": 0.42
552 + },
553 + "legalbench_proa": {
554 + "f1_macro": 0.8489843979982338,
555 + "f1_yes": 0.8607594936708861,
556 + "f1_no": 0.8372093023255814,
557 + "f1_macro_ci_low": 0.762371922413286,
558 + "f1_macro_ci_high": 0.9100766335383308,
559 + "score_name": "f1_micro",
560 + "score": 0.8484848484848485,
561 + "score_ci_high": 0.9090909090909091,
562 + "score_ci_low": 0.7590361445783133,
563 + "num_of_instances": 85,
564 + "accuracy": 0.8235294117647058,
565 + "accuracy_ci_low": 0.7294117647058823,
566 + "accuracy_ci_high": 0.8941176470588236,
567 + "f1_micro": 0.8484848484848485,
568 + "f1_micro_ci_low": 0.7590361445783133,
569 + "f1_micro_ci_high": 0.9090909090909091
570 + },
571 + "score": 0.3776542673606469,
572 + "score_name": "subsets_mean",
573 + "num_of_instances": 770
574 + },
575 + "news_classification": {
576 + "20_newsgroups_short": {
577 + "f1_macro": 0.2527665164224348,
578 + "f1_cars": 0.417910447761194,
579 + "f1_windows x": 0.0,
580 + "f1_atheism": 0.05128205128205128,
581 + "f1_religion": 0.15584415584415584,
582 + "f1_politics": 0.15789473684210525,
583 + "f1_medicine": 0.5,
584 + "f1_christianity": 0.07142857142857142,
585 + "f1_computer graphics": 0.2702702702702703,
586 + "f1_microsoft windows": 0.16393442622950818,
587 + "f1_middle east": 0.12,
588 + "f1_motorcycles": 0.375,
589 + "f1_pc hardware": 0.3157894736842105,
590 + "f1_mac hardware": 0.14285714285714285,
591 + "f1_for sale": 0.2222222222222222,
592 + "f1_guns": 0.11538461538461539,
593 + "f1_space": 0.34782608695652173,
594 + "f1_cryptography": 0.4126984126984127,
595 + "f1_baseball": 0.35135135135135137,
596 + "f1_hockey": 0.5,
597 + "f1_electronics": 0.36363636363636365,
598 + "f1_macro_ci_low": 0.22757237570678052,
599 + "f1_macro_ci_high": 0.28553448529743153,
600 + "score_name": "f1_micro",
601 + "score": 0.26766917293233083,
602 + "score_ci_high": 0.29941755760789834,
603 + "score_ci_low": 0.23630344400865025,
604 + "num_of_instances": 1000,
605 + "accuracy": 0.178,
606 + "accuracy_ci_low": 0.155,
607 + "accuracy_ci_high": 0.201,
608 + "f1_micro": 0.26766917293233083,
609 + "f1_micro_ci_low": 0.23630344400865025,
610 + "f1_micro_ci_high": 0.29941755760789834
611 + },
612 + "score": 0.26766917293233083,
613 + "score_name": "subsets_mean",
614 + "num_of_instances": 1000
615 + },
616 + "product_help": {
617 + "cfpb_product_2023": {
618 + "f1_macro": 0.5296897956484732,
619 + "f1_credit reporting or credit repair services or other personal consumer reports": 0.6554621848739496,
620 + "f1_credit card or prepaid card": 0.4943820224719101,
621 + "f1_debt collection": 0.4297520661157025,
622 + "f1_payday loan or title loan or personal loan": 0.375,
623 + "f1_checking or savings account": 0.611764705882353,
624 + "f1_money transfer or virtual currency or money service": 0.5555555555555556,
625 + "f1_mortgage": 0.509090909090909,
626 + "f1_student loan": 0.5555555555555556,
627 + "f1_vehicle loan or lease": 0.5806451612903226,
628 + "f1_macro_ci_low": 0.469057355767322,
629 + "f1_macro_ci_high": 0.598469091946616,
630 + "score_name": "f1_micro",
631 + "score": 0.6116883116883117,
632 + "score_ci_high": 0.6423076923076924,
633 + "score_ci_low": 0.5828077788480714,
634 + "num_of_instances": 1000,
635 + "accuracy": 0.471,
636 + "accuracy_ci_low": 0.443,
637 + "accuracy_ci_high": 0.503,
638 + "f1_micro": 0.6116883116883117,
639 + "f1_micro_ci_low": 0.5828077788480714,
640 + "f1_micro_ci_high": 0.6423076923076924
641 + },
642 + "cfpb_product_watsonx": {
643 + "f1_macro": 0.5903065912163163,
644 + "f1_mortgages and loans": 0.6282051282051282,
645 + "f1_credit card": 0.6842105263157895,
646 + "f1_debt collection": 0.49504950495049505,
647 + "f1_credit reporting": 0.5847457627118644,
648 + "f1_retail banking": 0.559322033898305,
649 + "f1_macro_ci_low": 0.5491833392518851,
650 + "f1_macro_ci_high": 0.6385093727045127,
651 + "score_name": "f1_micro",
652 + "score": 0.5856481481481481,
653 + "score_ci_high": 0.632183908045977,
654 + "score_ci_low": 0.54416153401534,
655 + "num_of_instances": 500,
656 + "accuracy": 0.506,
657 + "accuracy_ci_low": 0.466,
658 + "accuracy_ci_high": 0.552,
659 + "f1_micro": 0.5856481481481481,
660 + "f1_micro_ci_low": 0.54416153401534,
661 + "f1_micro_ci_high": 0.632183908045977
662 + },
663 + "score": 0.59866822991823,
664 + "score_name": "subsets_mean",
665 + "num_of_instances": 1500
666 + },
667 + "qa_finance": {
668 + "fin_qa": {
669 + "num_of_instances": 1000,
670 + "execution_accuracy": 0.038,
671 + "program_accuracy": 0.036,
672 + "score": 0.036,
673 + "score_name": "program_accuracy",
674 + "execution_accuracy_ci_low": 0.028,
675 + "execution_accuracy_ci_high": 0.051,
676 + "program_accuracy_ci_low": 0.026,
677 + "program_accuracy_ci_high": 0.048,
678 + "score_ci_low": 0.026,
679 + "score_ci_high": 0.048
680 + },
681 + "score": 0.036,
682 + "score_name": "subsets_mean",
683 + "num_of_instances": 1000
684 + },
685 + "rag_general": {
686 + "rag_response_generation_clapnq": {
687 + "precision": 0.28525760105453485,
688 + "recall": 0.5602956481447632,
689 + "f1": 0.3115553425716912,
690 + "precision_ci_low": 0.26514229625282976,
691 + "precision_ci_high": 0.3063459879682981,
692 + "recall_ci_low": 0.5448358892557127,
693 + "recall_ci_high": 0.5784430830027775,
694 + "f1_ci_low": 0.29470532845103153,
695 + "f1_ci_high": 0.33107953368338633,
696 + "score_name": "f1",
697 + "score": 0.3115553425716912,
698 + "score_ci_high": 0.33107953368338633,
699 + "score_ci_low": 0.29470532845103153,
700 + "num_of_instances": 600,
701 + "correctness_f1_bert_score.deberta_large_mnli": 0.5722805594901244,
702 + "correctness_recall_bert_score.deberta_large_mnli": 0.6597029569248358,
703 + "correctness_precision_bert_score.deberta_large_mnli": 0.5267530478537082,
704 + "faithfullness_f1_token_overlap": 0.30842516813067145,
705 + "faithfullness_recall_token_overlap": 0.25295558254033107,
706 + "faithfullness_precision_token_overlap": 0.5261232421481096,
707 + "correctness_f1_token_overlap": 0.3115553425716912,
708 + "correctness_recall_token_overlap": 0.5602956481447632,
709 + "correctness_precision_token_overlap": 0.28525760105453485
710 + },
711 + "score": 0.3115553425716912,
712 + "score_name": "subsets_mean",
713 + "num_of_instances": 600
714 + },
715 + "reasoning": {
716 + "hellaswag": {
717 + "accuracy": 0.574,
718 + "accuracy_ci_low": 0.543,
719 + "accuracy_ci_high": 0.606,
720 + "score_name": "accuracy",
721 + "score": 0.574,
722 + "score_ci_high": 0.606,
723 + "score_ci_low": 0.543,
724 + "num_of_instances": 1000
725 + },
726 + "openbook_qa": {
727 + "accuracy": 0.87,
728 + "accuracy_ci_low": 0.84,
729 + "accuracy_ci_high": 0.898,
730 + "score_name": "accuracy",
731 + "score": 0.87,
732 + "score_ci_high": 0.898,
733 + "score_ci_low": 0.84,
734 + "num_of_instances": 500
735 + },
736 + "score": 0.722,
737 + "score_name": "subsets_mean",
738 + "num_of_instances": 1500
739 + },
740 + "safety": {
741 + "attaq_500": {
742 + "severity_critical": 3.0,
743 + "severity_high": 3.0,
744 + "severity_medium": 3.0,
745 + "severity_low": 91.0,
746 + "category_harmful_info": 0.7363050992171916,
747 + "category_deception": 0.7075176269117028,
748 + "category_explicit_content": 0.7862109684883944,
749 + "category_violence": 0.8046535633740918,
750 + "category_discrimination": 0.7672554519441394,
751 + "category_substance_abuse": 0.9027199372959634,
752 + "category_pii": 0.8223799069722494,
753 + "safety": 0.7756195961658118,
754 + "safety_ci_low": 0.7397039726251121,
755 + "safety_ci_high": 0.8122983590465129,
756 + "score_name": "safety",
757 + "score": 0.7756195961658118,
758 + "score_ci_high": 0.8122983590465129,
759 + "score_ci_low": 0.7397039726251121,
760 + "num_of_instances": 100
761 + },
762 + "score": 0.7756195961658118,
763 + "score_name": "subsets_mean",
764 + "num_of_instances": 100
765 + },
766 + "summarization": {
767 + "billsum_document_filtered_to_6000_chars": {
768 + "num_of_instances": 528,
769 + "rougeL": 0.2623368936139326,
770 + "score": 0.2623368936139326,
771 + "score_name": "rougeL",
772 + "rougeLsum": 0.3246861800674231,
773 + "rouge1": 0.37875353275553886,
774 + "rouge2": 0.17634540475082816,
775 + "rougeL_ci_low": 0.25607817865790744,
776 + "rougeL_ci_high": 0.2699573234237949,
777 + "score_ci_low": 0.25607817865790744,
778 + "score_ci_high": 0.2699573234237949,
779 + "rougeLsum_ci_low": 0.31633678336605703,
780 + "rougeLsum_ci_high": 0.3342014722776119,
781 + "rouge1_ci_low": 0.36953845902369503,
782 + "rouge1_ci_high": 0.38917434338527224,
783 + "rouge2_ci_low": 0.17007943381657917,
784 + "rouge2_ci_high": 0.18480657837175643
785 + },
786 + "tldr_document_filtered_to_6000_chars": {
787 + "num_of_instances": 1000,
788 + "rougeL": 0.09126672648981966,
789 + "score": 0.09126672648981966,
790 + "score_name": "rougeL",
791 + "rougeLsum": 0.10371587819726082,
792 + "rouge1": 0.12400577725256573,
793 + "rouge2": 0.017067409640734738,
794 + "rougeL_ci_low": 0.08642017587563405,
795 + "rougeL_ci_high": 0.09509880053104167,
796 + "score_ci_low": 0.08642017587563405,
797 + "score_ci_high": 0.09509880053104167,
798 + "rougeLsum_ci_low": 0.09869637123218795,
799 + "rougeLsum_ci_high": 0.10797131588828912,
800 + "rouge1_ci_low": 0.11835018459961112,
801 + "rouge1_ci_high": 0.1293417832607229,
802 + "rouge2_ci_low": 0.015170057207432563,
803 + "rouge2_ci_high": 0.019497436133181917
804 + },
805 + "score": 0.17680181005187612,
806 + "score_name": "subsets_mean",
807 + "num_of_instances": 1528
808 + },
809 + "translation": {
810 + "mt_flores_101_ara_eng": {
811 + "num_of_instances": 66,
812 + "counts": [
813 + 1175,
814 + 692,
815 + 449,
816 + 309
817 + ],
818 + "totals": [
819 + 1795,
820 + 1729,
821 + 1663,
822 + 1597
823 + ],
824 + "precisions": [
825 + 0.6545961002785515,
826 + 0.40023134759976864,
827 + 0.269993986770896,
828 + 0.19348778960551033
829 + ],
830 + "bp": 1.0,
831 + "sys_len": 1795,
832 + "ref_len": 1734,
833 + "sacrebleu": 0.34203696369018,
834 + "score": 0.34203696369018,
835 + "score_name": "sacrebleu",
836 + "score_ci_low": 0.30827619341319407,
837 + "score_ci_high": 0.3934585756605337,
838 + "sacrebleu_ci_low": 0.30827619341319407,
839 + "sacrebleu_ci_high": 0.3934585756605337
840 + },
841 + "mt_flores_101_deu_eng": {
842 + "num_of_instances": 66,
843 + "counts": [
844 + 1238,
845 + 753,
846 + 502,
847 + 333
848 + ],
849 + "totals": [
850 + 1807,
851 + 1741,
852 + 1675,
853 + 1609
854 + ],
855 + "precisions": [
856 + 0.6851134477033757,
857 + 0.43251005169442847,
858 + 0.29970149253731343,
859 + 0.2069608452454941
860 + ],
861 + "bp": 1.0,
862 + "sys_len": 1807,
863 + "ref_len": 1734,
864 + "sacrebleu": 0.36820013898054776,
865 + "score": 0.36820013898054776,
866 + "score_name": "sacrebleu",
867 + "score_ci_low": 0.3262271352518154,
868 + "score_ci_high": 0.41519158941270023,
869 + "sacrebleu_ci_low": 0.3262271352518154,
870 + "sacrebleu_ci_high": 0.41519158941270023
871 + },
872 + "mt_flores_101_eng_ara": {
873 + "num_of_instances": 66,
874 + "counts": [
875 + 247,
876 + 51,
877 + 24,
878 + 13
879 + ],
880 + "totals": [
881 + 3575,
882 + 3509,
883 + 3443,
884 + 3377
885 + ],
886 + "precisions": [
887 + 0.06909090909090909,
888 + 0.014534055286406384,
889 + 0.0069706651176299735,
890 + 0.0038495706248149247
891 + ],
892 + "bp": 1.0,
893 + "sys_len": 3575,
894 + "ref_len": 1589,
895 + "sacrebleu": 0.012812195485921624,
896 + "score": 0.012812195485921624,
897 + "score_name": "sacrebleu",
898 + "score_ci_low": 0.006078646745883528,
899 + "score_ci_high": 0.020936322376328288,
900 + "sacrebleu_ci_low": 0.006078646745883528,
901 + "sacrebleu_ci_high": 0.020936322376328288
902 + },
903 + "mt_flores_101_eng_deu": {
904 + "num_of_instances": 66,
905 + "counts": [
906 + 1113,
907 + 599,
908 + 371,
909 + 238
910 + ],
911 + "totals": [
912 + 1788,
913 + 1722,
914 + 1656,
915 + 1590
916 + ],
917 + "precisions": [
918 + 0.6224832214765101,
919 + 0.34785133565621373,
920 + 0.22403381642512077,
921 + 0.14968553459119496
922 + ],
923 + "bp": 0.9740561253203749,
924 + "sys_len": 1788,
925 + "ref_len": 1835,
926 + "sacrebleu": 0.2843398077874606,
927 + "score": 0.2843398077874606,
928 + "score_name": "sacrebleu",
929 + "score_ci_low": 0.23626058332848057,
930 + "score_ci_high": 0.31993924524942646,
931 + "sacrebleu_ci_low": 0.23626058332848057,
932 + "sacrebleu_ci_high": 0.31993924524942646
933 + },
934 + "mt_flores_101_eng_fra": {
935 + "num_of_instances": 66,
936 + "counts": [
937 + 1420,
938 + 985,
939 + 727,
940 + 545
941 + ],
942 + "totals": [
943 + 2032,
944 + 1966,
945 + 1900,
946 + 1834
947 + ],
948 + "precisions": [
949 + 0.6988188976377953,
950 + 0.5010172939979654,
951 + 0.38263157894736843,
952 + 0.29716466739367503
953 + ],
954 + "bp": 0.9824394796731021,
955 + "sys_len": 2032,
956 + "ref_len": 2068,
957 + "sacrebleu": 0.4388384183123361,
958 + "score": 0.4388384183123361,
959 + "score_name": "sacrebleu",
960 + "score_ci_low": 0.39345852629040573,
961 + "score_ci_high": 0.4779013774696765,
962 + "sacrebleu_ci_low": 0.39345852629040573,
963 + "sacrebleu_ci_high": 0.4779013774696765
964 + },
965 + "mt_flores_101_eng_kor": {
966 + "num_of_instances": 66,
967 + "counts": [
968 + 1094,
969 + 480,
970 + 264,
971 + 149
972 + ],
973 + "totals": [
974 + 2582,
975 + 2516,
976 + 2450,
977 + 2384
978 + ],
979 + "precisions": [
980 + 0.42370255615801705,
981 + 0.1907790143084261,
982 + 0.10775510204081633,
983 + 0.0625
984 + ],
985 + "bp": 1.0,
986 + "sys_len": 2582,
987 + "ref_len": 2235,
988 + "sacrebleu": 0.15274865193932807,
989 + "score": 0.15274865193932807,
990 + "score_name": "sacrebleu",
991 + "score_ci_low": 0.11545139318026426,
992 + "score_ci_high": 0.18226843784214114,
993 + "sacrebleu_ci_low": 0.11545139318026426,
994 + "sacrebleu_ci_high": 0.18226843784214114
995 + },
996 + "mt_flores_101_eng_por": {
997 + "num_of_instances": 66,
998 + "counts": [
999 + 1376,
1000 + 942,
1001 + 686,
1002 + 504
1003 + ],
1004 + "totals": [
1005 + 1895,
1006 + 1829,
1007 + 1763,
1008 + 1697
1009 + ],
1010 + "precisions": [
1011 + 0.7261213720316623,
1012 + 0.5150355385456534,
1013 + 0.3891094724900737,
1014 + 0.29699469652327637
1015 + ],
1016 + "bp": 0.988979382694272,
1017 + "sys_len": 1895,
1018 + "ref_len": 1916,
1019 + "sacrebleu": 0.4509246392883171,
1020 + "score": 0.4509246392883171,
1021 + "score_name": "sacrebleu",
1022 + "score_ci_low": 0.3969123124838706,
1023 + "score_ci_high": 0.49488969802829147,
1024 + "sacrebleu_ci_low": 0.3969123124838706,
1025 + "sacrebleu_ci_high": 0.49488969802829147
1026 + },
1027 + "mt_flores_101_eng_ron": {
1028 + "num_of_instances": 66,
1029 + "counts": [
1030 + 1007,
1031 + 563,
1032 + 348,
1033 + 213
1034 + ],
1035 + "totals": [
1036 + 1937,
1037 + 1871,
1038 + 1805,
1039 + 1739
1040 + ],
1041 + "precisions": [
1042 + 0.5198760970573051,
1043 + 0.3009086050240513,
1044 + 0.192797783933518,
1045 + 0.12248418631397355
1046 + ],
1047 + "bp": 0.9938240032224314,
1048 + "sys_len": 1937,
1049 + "ref_len": 1949,
1050 + "sacrebleu": 0.24501270828054722,
1051 + "score": 0.24501270828054722,
1052 + "score_name": "sacrebleu",
1053 + "score_ci_low": 0.20771746826159335,
1054 + "score_ci_high": 0.2893120776468884,
1055 + "sacrebleu_ci_low": 0.20771746826159335,
1056 + "sacrebleu_ci_high": 0.2893120776468884
1057 + },
1058 + "mt_flores_101_eng_spa": {
1059 + "num_of_instances": 66,
1060 + "counts": [
1061 + 1254,
1062 + 697,
1063 + 407,
1064 + 241
1065 + ],
1066 + "totals": [
1067 + 1994,
1068 + 1928,
1069 + 1862,
1070 + 1796
1071 + ],
1072 + "precisions": [
1073 + 0.6288866599799399,
1074 + 0.36151452282157676,
1075 + 0.21858216970998925,
1076 + 0.13418708240534521
1077 + ],
1078 + "bp": 0.9491803375373334,
1079 + "sys_len": 1994,
1080 + "ref_len": 2098,
1081 + "sacrebleu": 0.27124055641744416,
1082 + "score": 0.27124055641744416,
1083 + "score_name": "sacrebleu",
1084 + "score_ci_low": 0.238444782766854,
1085 + "score_ci_high": 0.29802738323060723,
1086 + "sacrebleu_ci_low": 0.238444782766854,
1087 + "sacrebleu_ci_high": 0.29802738323060723
1088 + },
1089 + "mt_flores_101_fra_eng": {
1090 + "num_of_instances": 66,
1091 + "counts": [
1092 + 1272,
1093 + 821,
1094 + 564,
1095 + 393
1096 + ],
1097 + "totals": [
1098 + 1814,
1099 + 1748,
1100 + 1682,
1101 + 1616
1102 + ],
1103 + "precisions": [
1104 + 0.701212789415656,
1105 + 0.4696796338672769,
1106 + 0.33531510107015455,
1107 + 0.24319306930693071
1108 + ],
1109 + "bp": 1.0,
1110 + "sys_len": 1814,
1111 + "ref_len": 1734,
1112 + "sacrebleu": 0.4048218693289096,
1113 + "score": 0.4048218693289096,
1114 + "score_name": "sacrebleu",
1115 + "score_ci_low": 0.3701816768681021,
1116 + "score_ci_high": 0.4500189391713141,
1117 + "sacrebleu_ci_low": 0.3701816768681021,
1118 + "sacrebleu_ci_high": 0.4500189391713141
1119 + },
1120 + "mt_flores_101_jpn_eng": {
1121 + "num_of_instances": 66,
1122 + "counts": [
1123 + 1013,
1124 + 461,
1125 + 263,
1126 + 158
1127 + ],
1128 + "totals": [
1129 + 1820,
1130 + 1754,
1131 + 1688,
1132 + 1622
1133 + ],
1134 + "precisions": [
1135 + 0.5565934065934066,
1136 + 0.2628278221208666,
1137 + 0.15580568720379145,
1138 + 0.09741060419235512
1139 + ],
1140 + "bp": 1.0,
1141 + "sys_len": 1820,
1142 + "ref_len": 1734,
1143 + "sacrebleu": 0.2170699640137964,
1144 + "score": 0.2170699640137964,
1145 + "score_name": "sacrebleu",
1146 + "score_ci_low": 0.18647053560514687,
1147 + "score_ci_high": 0.26290605520041826,
1148 + "sacrebleu_ci_low": 0.18647053560514687,
1149 + "sacrebleu_ci_high": 0.26290605520041826
1150 + },
1151 + "mt_flores_101_kor_eng": {
1152 + "num_of_instances": 66,
1153 + "counts": [
1154 + 993,
1155 + 465,
1156 + 267,
1157 + 159
1158 + ],
1159 + "totals": [
1160 + 1895,
1161 + 1829,
1162 + 1763,
1163 + 1697
1164 + ],
1165 + "precisions": [
1166 + 0.5240105540897098,
1167 + 0.2542372881355932,
1168 + 0.1514463981849121,
1169 + 0.09369475545079553
1170 + ],
1171 + "bp": 1.0,
1172 + "sys_len": 1895,
1173 + "ref_len": 1734,
1174 + "sacrebleu": 0.20851551650550446,
1175 + "score": 0.20851551650550446,
1176 + "score_name": "sacrebleu",
1177 + "score_ci_low": 0.17957414369928407,
1178 + "score_ci_high": 0.2554257407203763,
1179 + "sacrebleu_ci_low": 0.17957414369928407,
1180 + "sacrebleu_ci_high": 0.2554257407203763
1181 + },
1182 + "mt_flores_101_por_eng": {
1183 + "num_of_instances": 66,
1184 + "counts": [
1185 + 1283,
1186 + 864,
1187 + 623,
1188 + 463
1189 + ],
1190 + "totals": [
1191 + 1786,
1192 + 1720,
1193 + 1654,
1194 + 1588
1195 + ],
1196 + "precisions": [
1197 + 0.7183650615901456,
1198 + 0.5023255813953488,
1199 + 0.3766626360338573,
1200 + 0.2915617128463476
1201 + ],
1202 + "bp": 1.0,
1203 + "sys_len": 1786,
1204 + "ref_len": 1734,
1205 + "sacrebleu": 0.4461731000368479,
1206 + "score": 0.4461731000368479,
1207 + "score_name": "sacrebleu",
1208 + "score_ci_low": 0.3937462177900075,
1209 + "score_ci_high": 0.5074742125274534,
1210 + "sacrebleu_ci_low": 0.3937462177900075,
1211 + "sacrebleu_ci_high": 0.5074742125274534
1212 + },
1213 + "mt_flores_101_ron_eng": {
1214 + "num_of_instances": 66,
1215 + "counts": [
1216 + 1272,
1217 + 815,
1218 + 567,
1219 + 394
1220 + ],
1221 + "totals": [
1222 + 1883,
1223 + 1817,
1224 + 1751,
1225 + 1685
1226 + ],
1227 + "precisions": [
1228 + 0.6755177907594265,
1229 + 0.4485415520088057,
1230 + 0.32381496287835526,
1231 + 0.23382789317507416
1232 + ],
1233 + "bp": 1.0,
1234 + "sys_len": 1883,
1235 + "ref_len": 1734,
1236 + "sacrebleu": 0.3891868659594801,
1237 + "score": 0.3891868659594801,
1238 + "score_name": "sacrebleu",
1239 + "score_ci_low": 0.3320885601116078,
1240 + "score_ci_high": 0.43155018915229526,
1241 + "sacrebleu_ci_low": 0.3320885601116078,
1242 + "sacrebleu_ci_high": 0.43155018915229526
1243 + },
1244 + "mt_flores_101_spa_eng": {
1245 + "num_of_instances": 66,
1246 + "counts": [
1247 + 1163,
1248 + 647,
1249 + 405,
1250 + 255
1251 + ],
1252 + "totals": [
1253 + 1850,
1254 + 1784,
1255 + 1718,
1256 + 1652
1257 + ],
1258 + "precisions": [
1259 + 0.6286486486486487,
1260 + 0.36266816143497754,
1261 + 0.23573923166472643,
1262 + 0.15435835351089588
1263 + ],
1264 + "bp": 1.0,
1265 + "sys_len": 1850,
1266 + "ref_len": 1734,
1267 + "sacrebleu": 0.30180043021927255,
1268 + "score": 0.30180043021927255,
1269 + "score_name": "sacrebleu",
1270 + "score_ci_low": 0.2688254114664501,
1271 + "score_ci_high": 0.3447493011880723,
1272 + "sacrebleu_ci_low": 0.2688254114664501,
1273 + "sacrebleu_ci_high": 0.3447493011880723
1274 + },
1275 + "score": 0.30224812174972626,
1276 + "score_name": "subsets_mean",
1277 + "num_of_instances": 990
1278 + },
1279 + "score": 0.40523563547897645,
1280 + "score_name": "subsets_mean",
1281 + "num_of_instances": 12472
1282 + }
1283 + }