jbnayahu committed
Commit 189956e · unverified · 1 Parent(s): 6066b5d

New results files.

Signed-off-by: Jonathan Bnayahu <[email protected]>

results/bluebench/2025-06-16T17-40-01_evaluation_results.json DELETED
@@ -1,580 +0,0 @@
1
- {
2
- "environment_info": {
3
- "timestamp_utc": "2025-06-16T14:40:01.560857Z",
4
- "command_line_invocation": [
5
- "/home/bnayahu/miniforge3/envs/unitxt/bin/unitxt-evaluate",
6
- "--tasks",
7
- "benchmarks.bluebench",
8
- "--model",
9
- "cross_provider",
10
- "--model_args",
11
- "model_name=watsonx/meta-llama/llama-3-3-70b-instruct,max_tokens=256",
12
- "--output_path",
13
- "./results/bluebench",
14
- "--log_samples",
15
- "--trust_remote_code",
16
- "--batch_size",
17
- "8",
18
- "--verbosity",
19
- "ERROR",
20
- "--limit",
21
- "100"
22
- ],
23
- "parsed_arguments": {
24
- "tasks": [
25
- "benchmarks.bluebench"
26
- ],
27
- "split": "test",
28
- "num_fewshots": null,
29
- "limit": 100,
30
- "batch_size": 8,
31
- "model": "watsonx/meta-llama/llama-3-3-70b-instruct",
32
- "model_args": {
33
- "max_tokens": 256
34
- },
35
- "gen_kwargs": null,
36
- "chat_template_kwargs": null,
37
- "output_path": "./results/bluebench",
38
- "output_file_prefix": "evaluation_results",
39
- "log_samples": true,
40
- "verbosity": "ERROR",
41
- "apply_chat_template": false,
42
- "trust_remote_code": true,
43
- "disable_hf_cache": false,
44
- "cache_dir": null
45
- },
46
- "unitxt_version": "1.24.0",
47
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
48
- "python_version": "3.11.12",
49
- "system": "Linux",
50
- "system_version": "#1 SMP PREEMPT_DYNAMIC Mon Apr 21 17:08:54 UTC 2025",
51
- "installed_packages": {
52
- "tqdm": "4.67.1",
53
- "httpretty": "1.1.4",
54
- "evaluate": "0.4.3",
55
- "ruff": "0.11.10",
56
- "virtualenv": "20.31.2",
57
- "urllib3": "2.4.0",
58
- "httpcore": "1.0.9",
59
- "mecab-ko-dic": "1.0.0",
60
- "mecab-ko": "1.0.1",
61
- "identify": "2.6.10",
62
- "bert-score": "0.3.13",
63
- "lxml": "5.4.0",
64
- "python-dotenv": "1.1.0",
65
- "accelerate": "1.7.0",
66
- "httpx-sse": "0.4.0",
67
- "pillow": "11.2.1",
68
- "certifi": "2025.4.26",
69
- "pyparsing": "3.2.3",
70
- "nvidia-cusparselt-cu12": "0.6.3",
71
- "tzdata": "2025.2",
72
- "torch": "2.7.0",
73
- "MarkupSafe": "3.0.2",
74
- "setuptools": "80.1.0",
75
- "pydantic": "2.11.4",
76
- "yarl": "1.20.0",
77
- "importlib_metadata": "8.0.0",
78
- "pydantic_core": "2.33.2",
79
- "scipy": "1.15.3",
80
- "annotated-types": "0.7.0",
81
- "portalocker": "3.1.1",
82
- "packaging": "24.2",
83
- "Deprecated": "1.2.18",
84
- "typing_extensions": "4.12.2",
85
- "ibm-cos-sdk-s3transfer": "2.14.1",
86
- "nvidia-cufft-cu12": "11.3.0.4",
87
- "nvidia-cusolver-cu12": "11.7.1.2",
88
- "diskcache": "5.6.3",
89
- "fsspec": "2025.3.0",
90
- "transformers": "4.51.3",
91
- "platformdirs": "4.2.2",
92
- "nvidia-cublas-cu12": "12.6.4.1",
93
- "threadpoolctl": "3.6.0",
94
- "jsonschema-specifications": "2025.4.1",
95
- "tenacity": "9.1.2",
96
- "propcache": "0.3.1",
97
- "ibm-cos-sdk": "2.14.1",
98
- "mpmath": "1.3.0",
99
- "jiter": "0.9.0",
100
- "filelock": "3.18.0",
101
- "tomli": "2.0.1",
102
- "nvidia-nvjitlink-cu12": "12.6.85",
103
- "cfgv": "3.4.0",
104
- "ibm_watsonx_ai": "1.3.13",
105
- "ibm-generative-ai": "3.0.0",
106
- "wheel": "0.45.1",
107
- "sympy": "1.14.0",
108
- "requests": "2.32.2",
109
- "charset-normalizer": "3.4.2",
110
- "psutil": "7.0.0",
111
- "pre_commit": "4.2.0",
112
- "nodeenv": "1.9.1",
113
- "colorama": "0.4.6",
114
- "absl-py": "2.2.2",
115
- "rouge_score": "0.1.2",
116
- "scikit-learn": "1.6.1",
117
- "multiprocess": "0.70.16",
118
- "xxhash": "3.5.0",
119
- "detect-secrets": "1.5.0",
120
- "aiohttp": "3.11.18",
121
- "frozenlist": "1.6.0",
122
- "tabulate": "0.9.0",
123
- "triton": "3.3.0",
124
- "idna": "3.10",
125
- "PyYAML": "6.0.2",
126
- "ibm-cos-sdk-core": "2.14.1",
127
- "nvidia-curand-cu12": "10.3.7.77",
128
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
129
- "tiktoken": "0.9.0",
130
- "aiosignal": "1.3.2",
131
- "attrs": "25.3.0",
132
- "h11": "0.16.0",
133
- "anyio": "4.9.0",
134
- "wrapt": "1.17.2",
135
- "kiwisolver": "1.4.8",
136
- "nvidia-cudnn-cu12": "9.5.1.17",
137
- "matplotlib": "3.10.3",
138
- "aiolimiter": "1.2.1",
139
- "codespell": "2.4.1",
140
- "jmespath": "1.0.1",
141
- "nltk": "3.9.1",
142
- "unitxt": "1.24.0",
143
- "dill": "0.3.8",
144
- "multidict": "6.4.3",
145
- "conllu": "6.0.0",
146
- "litellm": "1.69.3",
147
- "joblib": "1.5.0",
148
- "cycler": "0.12.1",
149
- "pip": "25.1.1",
150
- "nvidia-nccl-cu12": "2.26.2",
151
- "click": "8.2.0",
152
- "fonttools": "4.58.0",
153
- "datasets": "3.6.0",
154
- "six": "1.17.0",
155
- "numpy": "2.2.5",
156
- "nvidia-cuda-runtime-cu12": "12.6.77",
157
- "huggingface-hub": "0.31.2",
158
- "aiohappyeyeballs": "2.6.1",
159
- "sacrebleu": "2.5.1",
160
- "pyarrow": "20.0.0",
161
- "openai": "1.75.0",
162
- "python-dateutil": "2.9.0.post0",
163
- "pytz": "2025.2",
164
- "contourpy": "1.3.2",
165
- "pandas": "2.2.3",
166
- "distro": "1.9.0",
167
- "httpx": "0.27.2",
168
- "rpds-py": "0.25.0",
169
- "Jinja2": "3.1.6",
170
- "nvidia-cusparse-cu12": "12.5.4.2",
171
- "nvidia-nvtx-cu12": "12.6.77",
172
- "fuzzywuzzy": "0.18.0",
173
- "tokenizers": "0.21.1",
174
- "lomond": "0.3.3",
175
- "nvidia-cufile-cu12": "1.11.1.6",
176
- "typing-inspection": "0.4.0",
177
- "safetensors": "0.5.3",
178
- "nvidia-cuda-cupti-cu12": "12.6.80",
179
- "referencing": "0.36.2",
180
- "networkx": "3.4.2",
181
- "jsonschema": "4.23.0",
182
- "zipp": "3.19.2",
183
- "regex": "2024.11.6",
184
- "distlib": "0.3.9",
185
- "sniffio": "1.3.1",
186
- "autocommand": "2.2.2",
187
- "jaraco.collections": "5.1.0",
188
- "typeguard": "4.3.0",
189
- "jaraco.text": "3.12.1",
190
- "jaraco.context": "5.3.0",
191
- "jaraco.functools": "4.0.1",
192
- "more-itertools": "10.3.0",
193
- "backports.tarfile": "1.2.0",
194
- "inflect": "7.3.1"
195
- }
196
- },
197
- "results": {
198
- "bias": {
199
- "safety_bbq_age": {
200
- "accuracy": 0.7888888888888889,
201
- "accuracy_ci_low": 0.7,
202
- "accuracy_ci_high": 0.8555555555555555,
203
- "score_name": "accuracy",
204
- "score": 0.7888888888888889,
205
- "score_ci_high": 0.8555555555555555,
206
- "score_ci_low": 0.7,
207
- "num_of_instances": 90
208
- },
209
- "safety_bbq_disability_status": {
210
- "accuracy": 1.0,
211
- "accuracy_ci_low": 1.0,
212
- "accuracy_ci_high": 1.0,
213
- "score_name": "accuracy",
214
- "score": 1.0,
215
- "score_ci_high": 1.0,
216
- "score_ci_low": 1.0,
217
- "num_of_instances": 10
218
- },
219
- "score": 0.8944444444444444,
220
- "score_name": "subsets_mean",
221
- "num_of_instances": 100
222
- },
223
- "chatbot_abilities": {
224
- "arena_hard_generation_english_gpt_4_0314_reference": {
225
- "num_of_instances": 100,
226
- "llama_3_70b_instruct_template_arena_hard": 0.5,
227
- "score": 0.5,
228
- "score_name": "llama_3_70b_instruct_template_arena_hard"
229
- },
230
- "score": 0.5,
231
- "score_name": "subsets_mean",
232
- "num_of_instances": 100
233
- },
234
- "entity_extraction": {
235
- "universal_ner_en_ewt": {
236
- "num_of_instances": 100,
237
- "f1_Person": 0.5294117647058824,
238
- "f1_Organization": 0.4489795918367347,
239
- "f1_Location": 0.3076923076923077,
240
- "f1_macro": 0.4286945547449749,
241
- "recall_macro": 0.3447204968944099,
242
- "precision_macro": 0.5806637806637807,
243
- "in_classes_support": 0.6266666666666667,
244
- "f1_micro": 0.3466666666666667,
245
- "recall_micro": 0.3466666666666667,
246
- "precision_micro": 0.3466666666666667,
247
- "score": 0.3466666666666667,
248
- "score_name": "f1_micro",
249
- "score_ci_low": 0.2410871556202116,
250
- "score_ci_high": 0.45611092451496155,
251
- "f1_micro_ci_low": 0.2410871556202116,
252
- "f1_micro_ci_high": 0.45611092451496155
253
- },
254
- "score": 0.3466666666666667,
255
- "score_name": "subsets_mean",
256
- "num_of_instances": 100
257
- },
258
- "knowledge": {
259
- "mmlu_pro_biology": {
260
- "accuracy": 0.704225352112676,
261
- "accuracy_ci_low": 0.5915492957746479,
262
- "accuracy_ci_high": 0.8028169014084507,
263
- "score_name": "accuracy",
264
- "score": 0.704225352112676,
265
- "score_ci_high": 0.8028169014084507,
266
- "score_ci_low": 0.5915492957746479,
267
- "num_of_instances": 71
268
- },
269
- "mmlu_pro_business": {
270
- "accuracy": 0.13793103448275862,
271
- "accuracy_ci_low": 0.034482758620689655,
272
- "accuracy_ci_high": 0.3103448275862069,
273
- "score_name": "accuracy",
274
- "score": 0.13793103448275862,
275
- "score_ci_high": 0.3103448275862069,
276
- "score_ci_low": 0.034482758620689655,
277
- "num_of_instances": 29
278
- },
279
- "score": 0.42107819329771734,
280
- "score_name": "subsets_mean",
281
- "num_of_instances": 100
282
- },
283
- "legal": {
284
- "legalbench_abercrombie": {
285
- "f1_macro": 0.6635397677258142,
286
- "f1_suggestive": 0.5555555555555556,
287
- "f1_generic": 0.7692307692307693,
288
- "f1_descriptive": 0.6976744186046512,
289
- "f1_fanciful": 0.6666666666666666,
290
- "f1_arbitrary": 0.6285714285714286,
291
- "f1_macro_ci_low": 0.5625965245413472,
292
- "f1_macro_ci_high": 0.7723256077319486,
293
- "score_name": "f1_micro",
294
- "score": 0.6586826347305389,
295
- "score_ci_high": 0.7515151515151515,
296
- "score_ci_low": 0.5524327906405184,
297
- "num_of_instances": 85,
298
- "accuracy": 0.6470588235294118,
299
- "accuracy_ci_low": 0.5411764705882353,
300
- "accuracy_ci_high": 0.7411764705882353,
301
- "f1_micro": 0.6586826347305389,
302
- "f1_micro_ci_low": 0.5524327906405184,
303
- "f1_micro_ci_high": 0.7515151515151515
304
- },
305
- "legalbench_corporate_lobbying": {
306
- "f1_macro": 0.5357142857142857,
307
- "f1_no": 0.5,
308
- "f1_yes": 0.5714285714285714,
309
- "f1_macro_ci_low": 0.2833333333333333,
310
- "f1_macro_ci_high": 0.7999279223515758,
311
- "score_name": "f1_micro",
312
- "score": 0.5384615384615384,
313
- "score_ci_high": 0.7857142857142857,
314
- "score_ci_low": 0.26917373942421613,
315
- "num_of_instances": 15,
316
- "accuracy": 0.4666666666666667,
317
- "accuracy_ci_low": 0.2,
318
- "accuracy_ci_high": 0.7333333333333333,
319
- "f1_micro": 0.5384615384615384,
320
- "f1_micro_ci_low": 0.26917373942421613,
321
- "f1_micro_ci_high": 0.7857142857142857
322
- },
323
- "score": 0.5985720865960387,
324
- "score_name": "subsets_mean",
325
- "num_of_instances": 100
326
- },
327
- "news_classification": {
328
- "20_newsgroups_short": {
329
- "f1_macro": 0.6443434343434343,
330
- "f1_cars": 0.9090909090909091,
331
- "f1_windows x": 0.5714285714285714,
332
- "f1_computer graphics": 0.6666666666666666,
333
- "f1_atheism": 0.5714285714285714,
334
- "f1_religion": 0.0,
335
- "f1_medicine": 1.0,
336
- "f1_christianity": 0.8571428571428571,
337
- "f1_microsoft windows": 0.6666666666666666,
338
- "f1_middle east": 0.5,
339
- "f1_motorcycles": 0.6,
340
- "f1_pc hardware": 0.8,
341
- "f1_mac hardware": 0.8,
342
- "f1_for sale": 0.5,
343
- "f1_guns": 0.4444444444444444,
344
- "f1_space": 0.75,
345
- "f1_cryptography": 0.3333333333333333,
346
- "f1_baseball": 1.0,
347
- "f1_politics": 0.5,
348
- "f1_hockey": 0.75,
349
- "f1_electronics": 0.6666666666666666,
350
- "f1_macro_ci_low": 0.5605248203581513,
351
- "f1_macro_ci_high": 0.7498000775037662,
352
- "score_name": "f1_micro",
353
- "score": 0.6740331491712708,
354
- "score_ci_high": 0.7567567567567568,
355
- "score_ci_low": 0.5654571096096505,
356
- "num_of_instances": 100,
357
- "accuracy": 0.61,
358
- "accuracy_ci_low": 0.5,
359
- "accuracy_ci_high": 0.7,
360
- "f1_micro": 0.6740331491712708,
361
- "f1_micro_ci_low": 0.5654571096096505,
362
- "f1_micro_ci_high": 0.7567567567567568
363
- },
364
- "score": 0.6740331491712708,
365
- "score_name": "subsets_mean",
366
- "num_of_instances": 100
367
- },
368
- "product_help": {
369
- "cfpb_product_2023": {
370
- "f1_macro": 0.8637383872166481,
371
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.927536231884058,
372
- "f1_credit card or prepaid card": 1.0,
373
- "f1_debt collection": 0.64,
374
- "f1_checking or savings account": 0.9230769230769231,
375
- "f1_mortgage": 0.8888888888888888,
376
- "f1_vehicle loan or lease": 0.6666666666666666,
377
- "f1_money transfer or virtual currency or money service": 1.0,
378
- "f1_macro_ci_low": 0.7066777160591827,
379
- "f1_macro_ci_high": 0.9300773607822144,
380
- "score_name": "f1_micro",
381
- "score": 0.8888888888888888,
382
- "score_ci_high": 0.9393939393939394,
383
- "score_ci_low": 0.8163265306122449,
384
- "num_of_instances": 100,
385
- "accuracy": 0.88,
386
- "accuracy_ci_low": 0.81,
387
- "accuracy_ci_high": 0.93,
388
- "f1_micro": 0.8888888888888888,
389
- "f1_micro_ci_low": 0.8163265306122449,
390
- "f1_micro_ci_high": 0.9393939393939394
391
- },
392
- "score": 0.8888888888888888,
393
- "score_name": "subsets_mean",
394
- "num_of_instances": 100
395
- },
396
- "qa_finance": {
397
- "fin_qa": {
398
- "num_of_instances": 100,
399
- "program_accuracy": 0.2,
400
- "score": 0.2,
401
- "score_name": "program_accuracy",
402
- "execution_accuracy": 0.2,
403
- "program_accuracy_ci_low": 0.13,
404
- "program_accuracy_ci_high": 0.29,
405
- "score_ci_low": 0.13,
406
- "score_ci_high": 0.29,
407
- "execution_accuracy_ci_low": 0.13,
408
- "execution_accuracy_ci_high": 0.29
409
- },
410
- "score": 0.2,
411
- "score_name": "subsets_mean",
412
- "num_of_instances": 100
413
- },
414
- "rag_general": {
415
- "rag_response_generation_clapnq": {
416
- "precision": 0.4639242544792729,
417
- "recall": 0.6403509065582018,
418
- "f1": 0.4976247962897783,
419
- "precision_ci_low": 0.42748095786992185,
420
- "precision_ci_high": 0.5047660147148081,
421
- "recall_ci_low": 0.6019930525471148,
422
- "recall_ci_high": 0.6774368805520771,
423
- "f1_ci_low": 0.4683673012782413,
424
- "f1_ci_high": 0.5304072668948948,
425
- "score_name": "f1",
426
- "score": 0.4976247962897783,
427
- "score_ci_high": 0.5304072668948948,
428
- "score_ci_low": 0.4683673012782413,
429
- "num_of_instances": 100,
430
- "correctness_f1_bert_score.deberta_large_mnli": 0.6864667510986329,
431
- "correctness_recall_bert_score.deberta_large_mnli": 0.7247687041759491,
432
- "correctness_precision_bert_score.deberta_large_mnli": 0.6621749752759933,
433
- "faithfullness_f1_token_overlap": 0.412101720837311,
434
- "faithfullness_recall_token_overlap": 0.32402585790821237,
435
- "faithfullness_precision_token_overlap": 0.6786708572055166,
436
- "correctness_f1_token_overlap": 0.4976247962897783,
437
- "correctness_recall_token_overlap": 0.6403509065582018,
438
- "correctness_precision_token_overlap": 0.4639242544792729
439
- },
440
- "score": 0.4976247962897783,
441
- "score_name": "subsets_mean",
442
- "num_of_instances": 100
443
- },
444
- "reasoning": {
445
- "hellaswag": {
446
- "accuracy": 0.57,
447
- "accuracy_ci_low": 0.47,
448
- "accuracy_ci_high": 0.66,
449
- "score_name": "accuracy",
450
- "score": 0.57,
451
- "score_ci_high": 0.66,
452
- "score_ci_low": 0.47,
453
- "num_of_instances": 100
454
- },
455
- "score": 0.57,
456
- "score_name": "subsets_mean",
457
- "num_of_instances": 100
458
- },
459
- "safety": {
460
- "attaq_500": {
461
- "severity_critical": 0.0,
462
- "severity_high": 0.0,
463
- "severity_medium": 1.0,
464
- "severity_low": 99.0,
465
- "category_harmful_info": 0.8407948520514996,
466
- "category_deception": 0.8690199783316466,
467
- "category_explicit_content": 0.8879365721962067,
468
- "category_violence": 0.8437370418910942,
469
- "category_discrimination": 0.7742161632530273,
470
- "category_substance_abuse": 0.8879393110672632,
471
- "category_pii": 0.7969612302603545,
472
- "safety": 0.8464966776832524,
473
- "safety_ci_low": 0.8296491155827732,
474
- "safety_ci_high": 0.8642379209192678,
475
- "score_name": "safety",
476
- "score": 0.8464966776832524,
477
- "score_ci_high": 0.8642379209192678,
478
- "score_ci_low": 0.8296491155827732,
479
- "num_of_instances": 100
480
- },
481
- "score": 0.8464966776832524,
482
- "score_name": "subsets_mean",
483
- "num_of_instances": 100
484
- },
485
- "summarization": {
486
- "billsum_document_filtered_to_6000_chars": {
487
- "num_of_instances": 100,
488
- "rouge1": 0.43029845221947843,
489
- "rougeL": 0.2955165700225417,
490
- "score": 0.2955165700225417,
491
- "score_name": "rougeL",
492
- "rouge2": 0.20799738817238542,
493
- "rougeLsum": 0.37091272315340484,
494
- "rouge1_ci_low": 0.40762860443579957,
495
- "rouge1_ci_high": 0.45046632483836146,
496
- "rougeL_ci_low": 0.2802994422178466,
497
- "rougeL_ci_high": 0.31441983596023754,
498
- "score_ci_low": 0.2802994422178466,
499
- "score_ci_high": 0.31441983596023754,
500
- "rouge2_ci_low": 0.193214668225847,
501
- "rouge2_ci_high": 0.22420116008616867,
502
- "rougeLsum_ci_low": 0.35057685960681195,
503
- "rougeLsum_ci_high": 0.3911461732163174
504
- },
505
- "score": 0.2955165700225417,
506
- "score_name": "subsets_mean",
507
- "num_of_instances": 100
508
- },
509
- "translation": {
510
- "mt_flores_101_ara_eng": {
511
- "num_of_instances": 66,
512
- "counts": [
513
- 1308,
514
- 854,
515
- 606,
516
- 437
517
- ],
518
- "totals": [
519
- 1801,
520
- 1735,
521
- 1669,
522
- 1603
523
- ],
524
- "precisions": [
525
- 0.7262631871182677,
526
- 0.49221902017291064,
527
- 0.36309167165967643,
528
- 0.272613849033063
529
- ],
530
- "bp": 1.0,
531
- "sys_len": 1801,
532
- "ref_len": 1734,
533
- "sacrebleu": 0.4337147141407253,
534
- "score": 0.4337147141407253,
535
- "score_name": "sacrebleu",
536
- "score_ci_low": 0.3842057657729977,
537
- "score_ci_high": 0.4730390019325389,
538
- "sacrebleu_ci_low": 0.3842057657729977,
539
- "sacrebleu_ci_high": 0.4730390019325389
540
- },
541
- "mt_flores_101_deu_eng": {
542
- "num_of_instances": 34,
543
- "counts": [
544
- 718,
545
- 461,
546
- 323,
547
- 234
548
- ],
549
- "totals": [
550
- 1016,
551
- 982,
552
- 948,
553
- 914
554
- ],
555
- "precisions": [
556
- 0.7066929133858268,
557
- 0.4694501018329939,
558
- 0.3407172995780591,
559
- 0.25601750547045954
560
- ],
561
- "bp": 1.0,
562
- "sys_len": 1016,
563
- "ref_len": 960,
564
- "sacrebleu": 0.4124497124322012,
565
- "score": 0.4124497124322012,
566
- "score_name": "sacrebleu",
567
- "score_ci_low": 0.3505214366395574,
568
- "score_ci_high": 0.4751525306662991,
569
- "sacrebleu_ci_low": 0.3505214366395574,
570
- "sacrebleu_ci_high": 0.4751525306662991
571
- },
572
- "score": 0.4230822132864632,
573
- "score_name": "subsets_mean",
574
- "num_of_instances": 100
575
- },
576
- "score": 0.5504925912574663,
577
- "score_name": "subsets_mean",
578
- "num_of_instances": 1300
579
- }
580
- }
results/bluebench/{2025-06-16T11-59-29_evaluation_results.json → 2025-06-19T11-21-54_evaluation_results.json} RENAMED
@@ -1,14 +1,14 @@
1
  {
2
  "environment_info": {
3
- "timestamp_utc": "2025-06-16T08:59:29.752699Z",
4
  "command_line_invocation": [
5
- "/home/bnayahu/miniforge3/envs/unitxt/bin/unitxt-evaluate",
6
  "--tasks",
7
  "benchmarks.bluebench",
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
- "model_name=granite-3-3-8b-instruct,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
@@ -42,176 +42,157 @@
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "4cafeaa09eea146f2e2d0609974999a64dfffbbe",
46
- "python_version": "3.11.12",
47
  "system": "Linux",
48
- "system_version": "#1 SMP PREEMPT_DYNAMIC Mon Apr 21 17:08:54 UTC 2025",
49
  "installed_packages": {
50
- "tqdm": "4.67.1",
51
- "httpretty": "1.1.4",
52
- "evaluate": "0.4.3",
53
- "ruff": "0.11.10",
54
- "virtualenv": "20.31.2",
55
- "urllib3": "2.4.0",
56
- "httpcore": "1.0.9",
57
- "mecab-ko-dic": "1.0.0",
 
58
  "mecab-ko": "1.0.1",
59
- "identify": "2.6.10",
60
- "bert-score": "0.3.13",
61
- "lxml": "5.4.0",
62
- "python-dotenv": "1.1.0",
63
- "accelerate": "1.7.0",
64
- "httpx-sse": "0.4.0",
65
- "pillow": "11.2.1",
66
- "certifi": "2025.4.26",
67
- "pyparsing": "3.2.3",
68
- "nvidia-cusparselt-cu12": "0.6.3",
69
- "tzdata": "2025.2",
70
- "torch": "2.7.0",
71
- "MarkupSafe": "3.0.2",
72
- "setuptools": "80.1.0",
73
- "pydantic": "2.11.4",
74
- "yarl": "1.20.0",
75
- "importlib_metadata": "8.0.0",
76
- "pydantic_core": "2.33.2",
77
- "scipy": "1.15.3",
78
- "annotated-types": "0.7.0",
79
- "portalocker": "3.1.1",
80
- "packaging": "24.2",
81
- "Deprecated": "1.2.18",
82
- "typing_extensions": "4.12.2",
83
- "ibm-cos-sdk-s3transfer": "2.14.1",
84
- "nvidia-cufft-cu12": "11.3.0.4",
85
- "nvidia-cusolver-cu12": "11.7.1.2",
86
- "diskcache": "5.6.3",
87
- "fsspec": "2025.3.0",
88
- "transformers": "4.51.3",
89
- "platformdirs": "4.2.2",
90
- "nvidia-cublas-cu12": "12.6.4.1",
91
- "threadpoolctl": "3.6.0",
92
  "jsonschema-specifications": "2025.4.1",
93
- "tenacity": "9.1.2",
94
- "propcache": "0.3.1",
95
- "ibm-cos-sdk": "2.14.1",
96
- "mpmath": "1.3.0",
97
- "jiter": "0.9.0",
98
- "filelock": "3.18.0",
99
- "tomli": "2.0.1",
100
  "nvidia-nvjitlink-cu12": "12.6.85",
101
- "cfgv": "3.4.0",
102
- "ibm_watsonx_ai": "1.3.13",
103
- "ibm-generative-ai": "3.0.0",
104
- "wheel": "0.45.1",
105
- "sympy": "1.14.0",
106
- "requests": "2.32.2",
107
- "charset-normalizer": "3.4.2",
108
- "psutil": "7.0.0",
109
- "pre_commit": "4.2.0",
110
- "nodeenv": "1.9.1",
111
- "colorama": "0.4.6",
112
- "absl-py": "2.2.2",
113
  "rouge_score": "0.1.2",
114
- "scikit-learn": "1.6.1",
115
- "multiprocess": "0.70.16",
116
- "xxhash": "3.5.0",
117
- "detect-secrets": "1.5.0",
118
- "aiohttp": "3.11.18",
119
- "frozenlist": "1.6.0",
120
- "tabulate": "0.9.0",
121
- "triton": "3.3.0",
122
- "idna": "3.10",
123
- "PyYAML": "6.0.2",
124
- "ibm-cos-sdk-core": "2.14.1",
125
- "nvidia-curand-cu12": "10.3.7.77",
126
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
127
- "tiktoken": "0.9.0",
128
  "aiosignal": "1.3.2",
129
- "attrs": "25.3.0",
130
  "h11": "0.16.0",
131
- "anyio": "4.9.0",
132
- "wrapt": "1.17.2",
133
  "kiwisolver": "1.4.8",
134
- "nvidia-cudnn-cu12": "9.5.1.17",
135
- "matplotlib": "3.10.3",
136
- "aiolimiter": "1.2.1",
137
- "codespell": "2.4.1",
138
- "jmespath": "1.0.1",
139
- "nltk": "3.9.1",
140
- "unitxt": "1.24.0",
141
- "dill": "0.3.8",
142
- "multidict": "6.4.3",
143
- "conllu": "6.0.0",
144
- "litellm": "1.69.3",
145
- "joblib": "1.5.0",
146
- "cycler": "0.12.1",
147
  "pip": "25.1.1",
148
- "nvidia-nccl-cu12": "2.26.2",
149
- "click": "8.2.0",
150
- "fonttools": "4.58.0",
151
  "datasets": "3.6.0",
152
- "six": "1.17.0",
153
- "numpy": "2.2.5",
154
- "nvidia-cuda-runtime-cu12": "12.6.77",
155
- "huggingface-hub": "0.31.2",
156
- "aiohappyeyeballs": "2.6.1",
157
- "sacrebleu": "2.5.1",
158
- "pyarrow": "20.0.0",
159
- "openai": "1.75.0",
160
- "python-dateutil": "2.9.0.post0",
161
- "pytz": "2025.2",
162
- "contourpy": "1.3.2",
163
- "pandas": "2.2.3",
164
  "distro": "1.9.0",
165
- "httpx": "0.27.2",
166
- "rpds-py": "0.25.0",
167
- "Jinja2": "3.1.6",
168
- "nvidia-cusparse-cu12": "12.5.4.2",
169
- "nvidia-nvtx-cu12": "12.6.77",
170
- "fuzzywuzzy": "0.18.0",
171
  "tokenizers": "0.21.1",
172
- "lomond": "0.3.3",
173
- "nvidia-cufile-cu12": "1.11.1.6",
174
- "typing-inspection": "0.4.0",
175
  "safetensors": "0.5.3",
176
- "nvidia-cuda-cupti-cu12": "12.6.80",
177
- "referencing": "0.36.2",
178
- "networkx": "3.4.2",
179
- "jsonschema": "4.23.0",
180
- "zipp": "3.19.2",
181
  "regex": "2024.11.6",
182
- "distlib": "0.3.9",
183
- "sniffio": "1.3.1",
184
- "autocommand": "2.2.2",
185
  "jaraco.collections": "5.1.0",
186
  "typeguard": "4.3.0",
 
187
  "jaraco.text": "3.12.1",
188
- "jaraco.context": "5.3.0",
189
- "jaraco.functools": "4.0.1",
190
  "more-itertools": "10.3.0",
191
- "backports.tarfile": "1.2.0",
192
- "inflect": "7.3.1"
 
193
  }
194
  },
195
  "results": {
196
  "bias": {
197
  "safety_bbq_age": {
198
- "accuracy": 0.5111111111111111,
199
- "accuracy_ci_low": 0.4111111111111111,
200
- "accuracy_ci_high": 0.6111111111111112,
201
  "score_name": "accuracy",
202
- "score": 0.5111111111111111,
203
- "score_ci_high": 0.6111111111111112,
204
- "score_ci_low": 0.4111111111111111,
205
  "num_of_instances": 90
206
  },
207
  "safety_bbq_disability_status": {
208
- "accuracy": 0.6555555555555556,
209
- "accuracy_ci_low": 0.5444444444444444,
210
- "accuracy_ci_high": 0.7444444444444445,
211
  "score_name": "accuracy",
212
- "score": 0.6555555555555556,
213
- "score_ci_high": 0.7444444444444445,
214
- "score_ci_low": 0.5444444444444444,
215
  "num_of_instances": 90
216
  },
217
  "safety_bbq_gender_identity": {
@@ -225,86 +206,86 @@
225
  "num_of_instances": 90
226
  },
227
  "safety_bbq_nationality": {
228
- "accuracy": 0.6555555555555556,
229
- "accuracy_ci_low": 0.5555555555555556,
230
- "accuracy_ci_high": 0.7555555555555555,
231
  "score_name": "accuracy",
232
- "score": 0.6555555555555556,
233
- "score_ci_high": 0.7555555555555555,
234
- "score_ci_low": 0.5555555555555556,
235
  "num_of_instances": 90
236
  },
237
  "safety_bbq_physical_appearance": {
238
- "accuracy": 0.7333333333333333,
239
- "accuracy_ci_low": 0.6222222222222222,
240
- "accuracy_ci_high": 0.8222222222222222,
241
  "score_name": "accuracy",
242
- "score": 0.7333333333333333,
243
- "score_ci_high": 0.8222222222222222,
244
- "score_ci_low": 0.6222222222222222,
245
  "num_of_instances": 90
246
  },
247
  "safety_bbq_race_ethnicity": {
248
- "accuracy": 0.9,
249
- "accuracy_ci_low": 0.8222222222222222,
250
- "accuracy_ci_high": 0.9444444444444444,
251
  "score_name": "accuracy",
252
- "score": 0.9,
253
- "score_ci_high": 0.9444444444444444,
254
- "score_ci_low": 0.8222222222222222,
255
  "num_of_instances": 90
256
  },
257
  "safety_bbq_race_x_gender": {
258
- "accuracy": 0.9,
259
- "accuracy_ci_low": 0.8316765653997056,
260
  "accuracy_ci_high": 0.9444444444444444,
261
  "score_name": "accuracy",
262
- "score": 0.9,
263
  "score_ci_high": 0.9444444444444444,
264
- "score_ci_low": 0.8316765653997056,
265
  "num_of_instances": 90
266
  },
267
  "safety_bbq_race_x_ses": {
268
- "accuracy": 0.8555555555555555,
269
- "accuracy_ci_low": 0.7666666666666667,
270
- "accuracy_ci_high": 0.9111111111111111,
271
  "score_name": "accuracy",
272
- "score": 0.8555555555555555,
273
- "score_ci_high": 0.9111111111111111,
274
- "score_ci_low": 0.7666666666666667,
275
  "num_of_instances": 90
276
  },
277
  "safety_bbq_religion": {
278
- "accuracy": 0.7555555555555555,
279
- "accuracy_ci_low": 0.6666666666666666,
280
  "accuracy_ci_high": 0.8444444444444444,
281
  "score_name": "accuracy",
282
- "score": 0.7555555555555555,
283
  "score_ci_high": 0.8444444444444444,
284
- "score_ci_low": 0.6666666666666666,
285
  "num_of_instances": 90
286
  },
287
  "safety_bbq_ses": {
288
- "accuracy": 0.6777777777777778,
289
- "accuracy_ci_low": 0.5777777777777777,
290
- "accuracy_ci_high": 0.7666666666666667,
291
  "score_name": "accuracy",
292
- "score": 0.6777777777777778,
293
- "score_ci_high": 0.7666666666666667,
294
- "score_ci_low": 0.5777777777777777,
295
  "num_of_instances": 90
296
  },
297
  "safety_bbq_sexual_orientation": {
298
- "accuracy": 0.7888888888888889,
299
- "accuracy_ci_low": 0.7,
300
- "accuracy_ci_high": 0.8666666666666667,
301
  "score_name": "accuracy",
302
- "score": 0.7888888888888889,
303
- "score_ci_high": 0.8666666666666667,
304
- "score_ci_low": 0.7,
305
  "num_of_instances": 90
306
  },
307
- "score": 0.7555555555555555,
308
  "score_name": "subsets_mean",
309
  "num_of_instances": 990
310
  },
@@ -322,59 +303,69 @@
322
  "entity_extraction": {
323
  "universal_ner_en_ewt": {
324
  "num_of_instances": 1000,
325
- "f1_Person": 0.48648648648648646,
326
- "f1_Organization": 0.410958904109589,
327
- "f1_Location": 0.3448275862068966,
328
- "f1_macro": 0.4140909922676574,
329
- "recall_macro": 0.338801872982567,
330
- "precision_macro": 0.5399801587301587,
331
- "in_classes_support": 0.5876106194690265,
332
- "f1_micro": 0.3321100917431193,
333
- "recall_micro": 0.34476190476190477,
334
- "precision_micro": 0.32035398230088497,
335
- "score": 0.3321100917431193,
336
  "score_name": "f1_micro",
337
- "score_ci_low": 0.2855436018468573,
338
- "score_ci_high": 0.38276564122852924,
339
- "f1_micro_ci_low": 0.2855436018468573,
340
- "f1_micro_ci_high": 0.38276564122852924
341
  },
342
- "score": 0.3321100917431193,
343
  "score_name": "subsets_mean",
344
  "num_of_instances": 1000
345
  },
346
  "knowledge": {
347
  "mmlu_pro_biology": {
348
- "accuracy": 0.5352112676056338,
349
- "accuracy_ci_low": 0.4225352112676056,
350
- "accuracy_ci_high": 0.6510866959365942,
351
  "score_name": "accuracy",
352
- "score": 0.5352112676056338,
353
- "score_ci_high": 0.6510866959365942,
354
- "score_ci_low": 0.4225352112676056,
355
  "num_of_instances": 71
356
  },
357
  "mmlu_pro_business": {
358
  "accuracy": 0.19718309859154928,
359
  "accuracy_ci_low": 0.11267605633802817,
360
- "accuracy_ci_high": 0.30985915492957744,
361
  "score_name": "accuracy",
362
  "score": 0.19718309859154928,
363
- "score_ci_high": 0.30985915492957744,
364
  "score_ci_low": 0.11267605633802817,
365
  "num_of_instances": 71
366
  },
367
  "mmlu_pro_chemistry": {
368
- "accuracy": 0.19718309859154928,
369
- "accuracy_ci_low": 0.11267605633802817,
370
- "accuracy_ci_high": 0.29577464788732394,
371
  "score_name": "accuracy",
372
- "score": 0.19718309859154928,
373
- "score_ci_high": 0.29577464788732394,
374
- "score_ci_low": 0.11267605633802817,
375
  "num_of_instances": 71
376
  },
377
  "mmlu_pro_computer_science": {
378
  "accuracy": 0.38028169014084506,
379
  "accuracy_ci_low": 0.2676056338028169,
380
  "accuracy_ci_high": 0.49295774647887325,
@@ -384,375 +375,365 @@
384
  "score_ci_low": 0.2676056338028169,
385
  "num_of_instances": 71
386
  },
387
- "mmlu_pro_economics": {
388
- "accuracy": 0.4084507042253521,
389
- "accuracy_ci_low": 0.30985915492957744,
390
- "accuracy_ci_high": 0.5211267605633803,
391
- "score_name": "accuracy",
392
- "score": 0.4084507042253521,
393
- "score_ci_high": 0.5211267605633803,
394
- "score_ci_low": 0.30985915492957744,
395
- "num_of_instances": 71
396
- },
397
  "mmlu_pro_engineering": {
398
- "accuracy": 0.22535211267605634,
399
- "accuracy_ci_low": 0.1267605633802817,
400
- "accuracy_ci_high": 0.323943661971831,
401
  "score_name": "accuracy",
402
- "score": 0.22535211267605634,
403
- "score_ci_high": 0.323943661971831,
404
- "score_ci_low": 0.1267605633802817,
405
  "num_of_instances": 71
406
  },
407
  "mmlu_pro_health": {
408
- "accuracy": 0.3380281690140845,
409
- "accuracy_ci_low": 0.23943661971830985,
410
- "accuracy_ci_high": 0.4507042253521127,
411
  "score_name": "accuracy",
412
- "score": 0.3380281690140845,
413
- "score_ci_high": 0.4507042253521127,
414
- "score_ci_low": 0.23943661971830985,
415
  "num_of_instances": 71
416
  },
417
  "mmlu_pro_history": {
418
- "accuracy": 0.352112676056338,
419
- "accuracy_ci_low": 0.23943661971830985,
420
  "accuracy_ci_high": 0.4788732394366197,
421
  "score_name": "accuracy",
422
- "score": 0.352112676056338,
423
  "score_ci_high": 0.4788732394366197,
424
- "score_ci_low": 0.23943661971830985,
425
  "num_of_instances": 71
426
  },
427
  "mmlu_pro_law": {
428
- "accuracy": 0.39436619718309857,
429
- "accuracy_ci_low": 0.29577464788732394,
430
- "accuracy_ci_high": 0.5070422535211268,
431
  "score_name": "accuracy",
432
- "score": 0.39436619718309857,
433
- "score_ci_high": 0.5070422535211268,
434
- "score_ci_low": 0.29577464788732394,
435
  "num_of_instances": 71
436
  },
437
  "mmlu_pro_math": {
438
- "accuracy": 0.14084507042253522,
439
- "accuracy_ci_low": 0.07042253521126761,
440
- "accuracy_ci_high": 0.23943661971830985,
441
  "score_name": "accuracy",
442
- "score": 0.14084507042253522,
443
- "score_ci_high": 0.23943661971830985,
444
- "score_ci_low": 0.07042253521126761,
445
  "num_of_instances": 71
446
  },
447
  "mmlu_pro_other": {
448
- "accuracy": 0.28169014084507044,
449
- "accuracy_ci_low": 0.18309859154929578,
450
- "accuracy_ci_high": 0.38028169014084506,
451
  "score_name": "accuracy",
452
- "score": 0.28169014084507044,
453
- "score_ci_high": 0.38028169014084506,
454
- "score_ci_low": 0.18309859154929578,
455
  "num_of_instances": 71
456
  },
457
  "mmlu_pro_philosophy": {
458
- "accuracy": 0.4507042253521127,
459
- "accuracy_ci_low": 0.3272644997208875,
460
- "accuracy_ci_high": 0.5774647887323944,
461
  "score_name": "accuracy",
462
- "score": 0.4507042253521127,
463
- "score_ci_high": 0.5774647887323944,
464
- "score_ci_low": 0.3272644997208875,
465
  "num_of_instances": 71
466
  },
467
  "mmlu_pro_physics": {
468
- "accuracy": 0.2112676056338028,
469
- "accuracy_ci_low": 0.1267605633802817,
470
- "accuracy_ci_high": 0.323943661971831,
471
  "score_name": "accuracy",
472
- "score": 0.2112676056338028,
473
- "score_ci_high": 0.323943661971831,
474
- "score_ci_low": 0.1267605633802817,
475
  "num_of_instances": 71
476
  },
477
  "mmlu_pro_psychology": {
478
- "accuracy": 0.5633802816901409,
479
- "accuracy_ci_low": 0.43661971830985913,
480
- "accuracy_ci_high": 0.6619718309859155,
481
  "score_name": "accuracy",
482
- "score": 0.5633802816901409,
483
- "score_ci_high": 0.6619718309859155,
484
- "score_ci_low": 0.43661971830985913,
485
  "num_of_instances": 71
486
  },
487
- "score": 0.3340040241448692,
488
  "score_name": "subsets_mean",
489
  "num_of_instances": 994
490
  },
491
  "legal": {
492
  "legalbench_abercrombie": {
493
- "f1_macro": 0.3091896407685881,
494
- "f1_suggestive": 0.2857142857142857,
495
- "f1_arbitrary": 0.4444444444444444,
496
- "f1_generic": 0.3157894736842105,
497
- "f1_fanciful": 0.0,
498
- "f1_descriptive": 0.5,
499
- "f1_macro_ci_low": 0.22288382556382555,
500
- "f1_macro_ci_high": 0.40529088046834727,
501
  "score_name": "f1_micro",
502
- "score": 0.3717948717948718,
503
- "score_ci_high": 0.4807113986939902,
504
- "score_ci_low": 0.2631578947368421,
505
  "num_of_instances": 85,
506
- "accuracy": 0.3411764705882353,
507
- "accuracy_ci_low": 0.23529411764705882,
508
- "accuracy_ci_high": 0.4470588235294118,
509
- "f1_micro": 0.3717948717948718,
510
- "f1_micro_ci_low": 0.2631578947368421,
511
- "f1_micro_ci_high": 0.4807113986939902
512
  },
513
  "legalbench_corporate_lobbying": {
514
- "f1_macro": 0.5386002886002885,
515
- "f1_no": 0.7676767676767676,
516
- "f1_yes": 0.30952380952380953,
517
- "f1_macro_ci_low": 0.46173823746740844,
518
- "f1_macro_ci_high": 0.6159447239385368,
519
  "score_name": "f1_micro",
520
- "score": 0.6666666666666666,
521
- "score_ci_high": 0.7237928752902334,
522
- "score_ci_low": 0.5925925925925926,
523
  "num_of_instances": 200,
524
- "accuracy": 0.635,
525
- "accuracy_ci_low": 0.56,
526
- "accuracy_ci_high": 0.695,
527
- "f1_micro": 0.6666666666666666,
528
- "f1_micro_ci_low": 0.5925925925925926,
529
- "f1_micro_ci_high": 0.7237928752902334
530
  },
531
  "legalbench_function_of_decision_section": {
532
- "f1_macro": 0.2803267774022364,
533
- "f1_conclusion": 0.08695652173913043,
534
- "f1_decree": 0.3333333333333333,
535
- "f1_issue": 0.24561403508771928,
536
- "f1_analysis": 0.3076923076923077,
 
537
  "f1_facts": 0.21621621621621623,
538
- "f1_procedural history": 0.3018867924528302,
539
- "f1_rule": 0.47058823529411764,
540
- "f1_macro_ci_low": 0.21723272868885718,
541
- "f1_macro_ci_high": 0.3464281083472716,
542
  "score_name": "f1_micro",
543
- "score": 0.2922636103151863,
544
- "score_ci_high": 0.3563218390804598,
545
- "score_ci_low": 0.22030548535299002,
546
  "num_of_instances": 200,
547
- "accuracy": 0.255,
548
- "accuracy_ci_low": 0.195,
549
- "accuracy_ci_high": 0.31,
550
- "f1_micro": 0.2922636103151863,
551
- "f1_micro_ci_low": 0.22030548535299002,
552
- "f1_micro_ci_high": 0.3563218390804598
553
  },
554
  "legalbench_international_citizenship_questions": {
555
- "f1_macro": 0.501342318650011,
556
- "f1_yes": 0.5648148148148148,
557
- "f1_no": 0.4378698224852071,
558
- "f1_macro_ci_low": 0.43005283833620406,
559
- "f1_macro_ci_high": 0.5697757173860767,
560
  "score_name": "f1_micro",
561
- "score": 0.509090909090909,
562
- "score_ci_high": 0.5735035597182048,
563
- "score_ci_low": 0.4339558606291127,
564
  "num_of_instances": 200,
565
- "accuracy": 0.49,
566
- "accuracy_ci_low": 0.415,
567
- "accuracy_ci_high": 0.555,
568
- "f1_micro": 0.509090909090909,
569
- "f1_micro_ci_low": 0.4339558606291127,
570
- "f1_micro_ci_high": 0.5735035597182048
571
  },
572
  "legalbench_proa": {
573
- "f1_macro": 0.8456121343445286,
574
- "f1_yes": 0.8450704225352113,
575
  "f1_no": 0.8461538461538461,
576
- "f1_macro_ci_low": 0.7755496746888861,
577
- "f1_macro_ci_high": 0.9030851777330651,
578
  "score_name": "f1_micro",
579
- "score": 0.8456375838926175,
580
- "score_ci_high": 0.9032258064516129,
581
- "score_ci_low": 0.770569043574124,
582
  "num_of_instances": 85,
583
- "accuracy": 0.7411764705882353,
584
- "accuracy_ci_low": 0.6396825906719896,
585
- "accuracy_ci_high": 0.8235294117647058,
586
- "f1_micro": 0.8456375838926175,
587
- "f1_micro_ci_low": 0.770569043574124,
588
- "f1_micro_ci_high": 0.9032258064516129
589
  },
590
- "score": 0.5370907283520503,
591
  "score_name": "subsets_mean",
592
  "num_of_instances": 770
593
  },
594
  "news_classification": {
595
  "20_newsgroups_short": {
596
- "f1_macro": 0.4301866659512963,
597
- "f1_cars": 0.693069306930693,
598
- "f1_pc hardware": 0.35454545454545455,
599
- "f1_windows x": 0.0,
600
- "f1_atheism": 0.26666666666666666,
601
- "f1_christianity": 0.16129032258064516,
602
- "f1_religion": 0.21359223300970873,
603
- "f1_medicine": 0.8051948051948052,
604
- "f1_computer graphics": 0.45454545454545453,
605
  "f1_microsoft windows": 0.39436619718309857,
606
- "f1_middle east": 0.4166666666666667,
607
- "f1_motorcycles": 0.5116279069767442,
608
- "f1_mac hardware": 0.03125,
609
- "f1_for sale": 0.5757575757575758,
610
- "f1_guns": 0.27586206896551724,
611
- "f1_politics": 0.3235294117647059,
612
  "f1_space": 0.5569620253164557,
613
- "f1_cryptography": 0.45901639344262296,
614
- "f1_baseball": 0.8440366972477065,
615
  "f1_hockey": 0.859504132231405,
616
- "f1_electronics": 0.40625,
617
- "f1_macro_ci_low": 0.40471791977084814,
618
- "f1_macro_ci_high": 0.463234774838722,
619
  "score_name": "f1_micro",
620
- "score": 0.45168667810177243,
621
- "score_ci_high": 0.48422465158112066,
622
- "score_ci_low": 0.4219481399548598,
623
  "num_of_instances": 1000,
624
- "accuracy": 0.395,
625
- "accuracy_ci_low": 0.366,
626
- "accuracy_ci_high": 0.4246748738033053,
627
- "f1_micro": 0.45168667810177243,
628
- "f1_micro_ci_low": 0.4219481399548598,
629
- "f1_micro_ci_high": 0.48422465158112066
630
  },
631
- "score": 0.45168667810177243,
632
  "score_name": "subsets_mean",
633
  "num_of_instances": 1000
634
  },
635
  "product_help": {
636
  "cfpb_product_2023": {
637
- "f1_macro": 0.632121660327464,
638
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.8923533778767632,
639
- "f1_credit card or prepaid card": 0.5736434108527132,
640
- "f1_debt collection": 0.593939393939394,
641
- "f1_checking or savings account": 0.7526881720430108,
642
- "f1_mortgage": 0.8,
643
- "f1_vehicle loan or lease": 0.5,
644
- "f1_money transfer or virtual currency or money service": 0.5882352941176471,
645
- "f1_payday loan or title loan or personal loan": 0.4,
646
- "f1_student loan": 0.5882352941176471,
647
- "f1_macro_ci_low": 0.5726966985491608,
648
- "f1_macro_ci_high": 0.7037376869442199,
649
  "score_name": "f1_micro",
650
- "score": 0.8152004164497657,
651
- "score_ci_high": 0.8381933866103766,
652
- "score_ci_low": 0.7918410041841004,
653
  "num_of_instances": 1000,
654
- "accuracy": 0.783,
655
- "accuracy_ci_low": 0.758,
656
- "accuracy_ci_high": 0.808,
657
- "f1_micro": 0.8152004164497657,
658
- "f1_micro_ci_low": 0.7918410041841004,
659
- "f1_micro_ci_high": 0.8381933866103766
660
  },
661
  "cfpb_product_watsonx": {
662
- "f1_macro": 0.6993262696245249,
663
- "f1_mortgages and loans": 0.7441860465116279,
664
- "f1_credit card": 0.7425149700598802,
665
- "f1_debt collection": 0.6952380952380952,
666
- "f1_retail banking": 0.5901639344262295,
667
- "f1_credit reporting": 0.7245283018867924,
668
- "f1_macro_ci_low": 0.6582490219544427,
669
- "f1_macro_ci_high": 0.7402017475442481,
670
  "score_name": "f1_micro",
671
- "score": 0.7072649572649573,
672
- "score_ci_high": 0.7466308129162856,
673
- "score_ci_low": 0.667389078497867,
674
  "num_of_instances": 500,
675
- "accuracy": 0.662,
676
- "accuracy_ci_low": 0.622,
677
- "accuracy_ci_high": 0.704,
678
- "f1_micro": 0.7072649572649573,
679
- "f1_micro_ci_low": 0.667389078497867,
680
- "f1_micro_ci_high": 0.7466308129162856
681
  },
682
- "score": 0.7612326868573616,
683
  "score_name": "subsets_mean",
684
  "num_of_instances": 1500
685
  },
686
  "qa_finance": {
687
  "fin_qa": {
688
  "num_of_instances": 1000,
689
- "execution_accuracy": 0.081,
690
- "program_accuracy": 0.097,
691
- "score": 0.097,
692
  "score_name": "program_accuracy",
693
- "execution_accuracy_ci_low": 0.065,
694
- "execution_accuracy_ci_high": 0.099,
695
- "program_accuracy_ci_low": 0.08,
696
- "program_accuracy_ci_high": 0.116,
697
- "score_ci_low": 0.08,
698
- "score_ci_high": 0.116
699
  },
700
- "score": 0.097,
701
  "score_name": "subsets_mean",
702
  "num_of_instances": 1000
703
  },
704
  "rag_general": {
705
  "rag_response_generation_clapnq": {
706
- "precision": 0.30206806077427395,
707
- "recall": 0.5816362067038935,
708
- "f1": 0.3356223026170467,
709
- "precision_ci_low": 0.2814417986773037,
710
- "precision_ci_high": 0.3229447355634418,
711
- "recall_ci_low": 0.5644812495018089,
712
- "recall_ci_high": 0.5979433202487962,
713
- "f1_ci_low": 0.3158251342416766,
714
- "f1_ci_high": 0.3520825144808485,
715
  "score_name": "f1",
716
- "score": 0.3356223026170467,
717
- "score_ci_high": 0.3520825144808485,
718
- "score_ci_low": 0.3158251342416766,
719
  "num_of_instances": 600,
720
- "correctness_f1_bert_score.deberta_large_mnli": 0.5992281610767046,
721
- "correctness_recall_bert_score.deberta_large_mnli": 0.6849613479276498,
722
- "correctness_precision_bert_score.deberta_large_mnli": 0.5520117846379677,
723
- "faithfullness_f1_token_overlap": 0.3362020072357691,
724
- "faithfullness_recall_token_overlap": 0.271110248692323,
725
- "faithfullness_precision_token_overlap": 0.560394767420604,
726
- "correctness_f1_token_overlap": 0.3356223026170467,
727
- "correctness_recall_token_overlap": 0.5816362067038935,
728
- "correctness_precision_token_overlap": 0.30206806077427395
729
  },
730
- "score": 0.3356223026170467,
731
  "score_name": "subsets_mean",
732
  "num_of_instances": 600
733
  },
734
  "reasoning": {
735
  "hellaswag": {
736
- "accuracy": 0.444,
737
- "accuracy_ci_low": 0.412,
738
- "accuracy_ci_high": 0.474,
739
  "score_name": "accuracy",
740
- "score": 0.444,
741
- "score_ci_high": 0.474,
742
- "score_ci_low": 0.412,
743
  "num_of_instances": 1000
744
  },
745
  "openbook_qa": {
746
- "accuracy": 0.778,
747
- "accuracy_ci_low": 0.738,
748
- "accuracy_ci_high": 0.812,
749
  "score_name": "accuracy",
750
- "score": 0.778,
751
- "score_ci_high": 0.812,
752
- "score_ci_low": 0.738,
753
  "num_of_instances": 500
754
  },
755
- "score": 0.611,
756
  "score_name": "subsets_mean",
757
  "num_of_instances": 1500
758
  },
@@ -762,66 +743,66 @@
762
  "severity_high": 0.0,
763
  "severity_medium": 0.0,
764
  "severity_low": 100.0,
765
- "category_harmful_info": 0.8762445710728775,
766
- "category_deception": 0.8653640456714036,
767
- "category_explicit_content": 0.8295791004643295,
768
- "category_violence": 0.8484539006408008,
769
- "category_discrimination": 0.8543881776905249,
770
- "category_substance_abuse": 0.9145642853445477,
771
- "category_pii": 0.8098305000199212,
772
- "safety": 0.8585813929057784,
773
- "safety_ci_low": 0.8429344770949673,
774
- "safety_ci_high": 0.8748245522188953,
775
  "score_name": "safety",
776
- "score": 0.8585813929057784,
777
- "score_ci_high": 0.8748245522188953,
778
- "score_ci_low": 0.8429344770949673,
779
  "num_of_instances": 100
780
  },
781
- "score": 0.8585813929057784,
782
  "score_name": "subsets_mean",
783
  "num_of_instances": 100
784
  },
785
  "summarization": {
786
  "billsum_document_filtered_to_6000_chars": {
787
  "num_of_instances": 528,
788
- "rouge2": 0.1970371669553263,
789
- "rougeLsum": 0.3501126102752881,
790
- "rougeL": 0.2846394886664465,
791
- "score": 0.2846394886664465,
792
  "score_name": "rougeL",
793
- "rouge1": 0.41761700537983565,
794
- "rouge2_ci_low": 0.18974866066979085,
795
- "rouge2_ci_high": 0.2039076063263163,
796
- "rougeLsum_ci_low": 0.3417084656758268,
797
- "rougeLsum_ci_high": 0.35798610023418986,
798
- "rougeL_ci_low": 0.27743853819473024,
799
- "rougeL_ci_high": 0.29146357040869636,
800
- "score_ci_low": 0.27743853819473024,
801
- "score_ci_high": 0.29146357040869636,
802
- "rouge1_ci_low": 0.4080893419540732,
803
- "rouge1_ci_high": 0.42587062241350043
804
  },
805
  "tldr_document_filtered_to_6000_chars": {
806
  "num_of_instances": 1000,
807
- "rouge2": 0.013386366389249846,
808
- "rougeLsum": 0.09129053607147332,
809
- "rougeL": 0.07976722752092735,
810
- "score": 0.07976722752092735,
811
  "score_name": "rougeL",
812
- "rouge1": 0.11205823965005557,
813
- "rouge2_ci_low": 0.011778285117435726,
814
- "rouge2_ci_high": 0.015025647426455475,
815
- "rougeLsum_ci_low": 0.08722354399820309,
816
- "rougeLsum_ci_high": 0.09499919628325168,
817
- "rougeL_ci_low": 0.07634372485273437,
818
- "rougeL_ci_high": 0.0827839048129559,
819
- "score_ci_low": 0.07634372485273437,
820
- "score_ci_high": 0.0827839048129559,
821
- "rouge1_ci_low": 0.10678148406842189,
822
- "rouge1_ci_high": 0.11674098323235016
823
  },
824
- "score": 0.18220335809368693,
825
  "score_name": "subsets_mean",
826
  "num_of_instances": 1528
827
  },
@@ -829,473 +810,473 @@
829
  "mt_flores_101_ara_eng": {
830
  "num_of_instances": 66,
831
  "counts": [
832
- 1120,
833
- 591,
834
- 346,
835
- 205
836
  ],
837
  "totals": [
838
- 2708,
839
- 2642,
840
- 2576,
841
- 2510
842
  ],
843
  "precisions": [
844
- 0.413589364844904,
845
- 0.22369417108251327,
846
- 0.1343167701863354,
847
- 0.08167330677290836
848
  ],
849
  "bp": 1.0,
850
- "sys_len": 2708,
851
  "ref_len": 1734,
852
- "sacrebleu": 0.17848782709335775,
853
- "score": 0.17848782709335775,
854
  "score_name": "sacrebleu",
855
- "score_ci_low": 0.12553105401151735,
856
- "score_ci_high": 0.23257206528815122,
857
- "sacrebleu_ci_low": 0.12553105401151735,
858
- "sacrebleu_ci_high": 0.23257206528815122
859
  },
860
  "mt_flores_101_deu_eng": {
861
  "num_of_instances": 66,
862
  "counts": [
863
- 1238,
864
- 714,
865
- 444,
866
- 283
867
  ],
868
  "totals": [
869
- 3974,
870
- 3908,
871
- 3842,
872
- 3776
873
  ],
874
  "precisions": [
875
- 0.31152491192752896,
876
- 0.1827021494370522,
877
- 0.11556480999479439,
878
- 0.0749470338983051
879
  ],
880
  "bp": 1.0,
881
- "sys_len": 3974,
882
  "ref_len": 1734,
883
- "sacrebleu": 0.14900612629588061,
884
- "score": 0.14900612629588061,
885
  "score_name": "sacrebleu",
886
- "score_ci_low": 0.12104205384160306,
887
- "score_ci_high": 0.17208001279821492,
888
- "sacrebleu_ci_low": 0.12104205384160306,
889
- "sacrebleu_ci_high": 0.17208001279821492
890
  },
891
  "mt_flores_101_eng_ara": {
892
  "num_of_instances": 66,
893
  "counts": [
894
- 695,
895
- 280,
896
- 122,
897
- 54
898
  ],
899
  "totals": [
900
- 3398,
901
- 3332,
902
- 3266,
903
- 3200
904
  ],
905
  "precisions": [
906
- 0.20453207769276044,
907
- 0.08403361344537816,
908
- 0.03735456215554195,
909
- 0.016875
910
  ],
911
  "bp": 1.0,
912
- "sys_len": 3398,
913
  "ref_len": 1589,
914
- "sacrebleu": 0.057372064118917265,
915
- "score": 0.057372064118917265,
916
  "score_name": "sacrebleu",
917
- "score_ci_low": 0.030141531652344938,
918
- "score_ci_high": 0.09128806282306597,
919
- "sacrebleu_ci_low": 0.030141531652344938,
920
- "sacrebleu_ci_high": 0.09128806282306597
921
  },
922
  "mt_flores_101_eng_deu": {
923
  "num_of_instances": 66,
924
  "counts": [
925
- 1081,
926
- 583,
927
- 367,
928
- 236
929
  ],
930
  "totals": [
931
- 2698,
932
- 2632,
933
- 2566,
934
- 2500
935
  ],
936
  "precisions": [
937
- 0.4006671608598962,
938
- 0.2215045592705167,
939
- 0.1430241621200312,
940
- 0.0944
941
  ],
942
  "bp": 1.0,
943
- "sys_len": 2698,
944
  "ref_len": 1835,
945
- "sacrebleu": 0.18605311955362527,
946
- "score": 0.18605311955362527,
947
  "score_name": "sacrebleu",
948
- "score_ci_low": 0.14535553546673896,
949
- "score_ci_high": 0.23518227086475807,
950
- "sacrebleu_ci_low": 0.14535553546673896,
951
- "sacrebleu_ci_high": 0.23518227086475807
952
  },
953
  "mt_flores_101_eng_fra": {
954
  "num_of_instances": 66,
955
  "counts": [
956
- 1369,
957
- 908,
958
- 653,
959
- 480
960
  ],
961
  "totals": [
962
- 2838,
963
- 2772,
964
- 2706,
965
- 2640
966
  ],
967
  "precisions": [
968
- 0.4823819591261451,
969
- 0.32756132756132755,
970
- 0.24131559497413158,
971
- 0.18181818181818182
972
  ],
973
  "bp": 1.0,
974
- "sys_len": 2838,
975
  "ref_len": 2068,
976
- "sacrebleu": 0.28855366500271445,
977
- "score": 0.28855366500271445,
978
  "score_name": "sacrebleu",
979
- "score_ci_low": 0.23842554781291075,
980
- "score_ci_high": 0.3295060649249719,
981
- "sacrebleu_ci_low": 0.23842554781291075,
982
- "sacrebleu_ci_high": 0.3295060649249719
983
  },
984
  "mt_flores_101_eng_kor": {
985
  "num_of_instances": 66,
986
  "counts": [
987
- 1105,
988
- 464,
989
- 237,
990
- 129
991
  ],
992
  "totals": [
993
- 4614,
994
- 4548,
995
- 4482,
996
- 4416
997
  ],
998
  "precisions": [
999
- 0.23948851322063283,
1000
- 0.10202286719437115,
1001
- 0.05287817938420348,
1002
- 0.029211956521739132
1003
  ],
1004
  "bp": 1.0,
1005
- "sys_len": 4614,
1006
  "ref_len": 2235,
1007
- "sacrebleu": 0.07837992398310743,
1008
- "score": 0.07837992398310743,
1009
  "score_name": "sacrebleu",
1010
- "score_ci_low": 0.0624749618885479,
1011
- "score_ci_high": 0.0978985560656709,
1012
- "sacrebleu_ci_low": 0.0624749618885479,
1013
- "sacrebleu_ci_high": 0.0978985560656709
1014
  },
1015
  "mt_flores_101_eng_por": {
1016
  "num_of_instances": 66,
1017
  "counts": [
1018
- 1337,
1019
- 872,
1020
- 615,
1021
- 443
1022
  ],
1023
  "totals": [
1024
- 3197,
1025
- 3131,
1026
- 3065,
1027
- 2999
1028
  ],
1029
  "precisions": [
1030
- 0.41820456678135753,
1031
- 0.2785052698818269,
1032
- 0.200652528548124,
1033
- 0.14771590530176726
1034
  ],
1035
  "bp": 1.0,
1036
- "sys_len": 3197,
1037
  "ref_len": 1916,
1038
- "sacrebleu": 0.2423949242277413,
1039
- "score": 0.2423949242277413,
1040
  "score_name": "sacrebleu",
1041
- "score_ci_low": 0.19031908380666443,
1042
- "score_ci_high": 0.29285841291168263,
1043
- "sacrebleu_ci_low": 0.19031908380666443,
1044
- "sacrebleu_ci_high": 0.29285841291168263
1045
  },
1046
  "mt_flores_101_eng_ron": {
1047
  "num_of_instances": 66,
1048
  "counts": [
1049
- 916,
1050
- 408,
1051
- 215,
1052
- 118
1053
  ],
1054
  "totals": [
1055
- 3065,
1056
- 2999,
1057
- 2933,
1058
- 2867
1059
  ],
1060
  "precisions": [
1061
- 0.29885807504078304,
1062
- 0.13604534844948316,
1063
- 0.0733037845209683,
1064
- 0.04115800488315312
1065
  ],
1066
  "bp": 1.0,
1067
- "sys_len": 3065,
1068
  "ref_len": 1949,
1069
- "sacrebleu": 0.10524036626034926,
1070
- "score": 0.10524036626034926,
1071
  "score_name": "sacrebleu",
1072
- "score_ci_low": 0.08576114841329757,
1073
- "score_ci_high": 0.138434331243868,
1074
- "sacrebleu_ci_low": 0.08576114841329757,
1075
- "sacrebleu_ci_high": 0.138434331243868
1076
  },
1077
  "mt_flores_101_eng_spa": {
1078
  "num_of_instances": 66,
1079
  "counts": [
1080
- 1218,
1081
- 627,
1082
- 360,
1083
- 200
1084
  ],
1085
  "totals": [
1086
- 3325,
1087
- 3259,
1088
- 3193,
1089
- 3127
1090
  ],
1091
  "precisions": [
1092
- 0.36631578947368415,
1093
- 0.19239030377416383,
1094
- 0.11274663326025681,
1095
- 0.06395906619763352
1096
  ],
1097
  "bp": 1.0,
1098
- "sys_len": 3325,
1099
  "ref_len": 2098,
1100
- "sacrebleu": 0.15014508803615928,
1101
- "score": 0.15014508803615928,
1102
  "score_name": "sacrebleu",
1103
- "score_ci_low": 0.1204863094487622,
1104
- "score_ci_high": 0.1765822854477207,
1105
- "sacrebleu_ci_low": 0.1204863094487622,
1106
- "sacrebleu_ci_high": 0.1765822854477207
1107
  },
1108
  "mt_flores_101_fra_eng": {
1109
  "num_of_instances": 66,
1110
  "counts": [
1111
- 1254,
1112
- 757,
1113
- 500,
1114
- 349
1115
  ],
1116
  "totals": [
1117
- 3006,
1118
- 2940,
1119
- 2874,
1120
- 2808
1121
  ],
1122
  "precisions": [
1123
- 0.4171656686626746,
1124
- 0.2574829931972789,
1125
- 0.17397355601948505,
1126
- 0.1242877492877493
1127
  ],
1128
  "bp": 1.0,
1129
- "sys_len": 3006,
1130
  "ref_len": 1734,
1131
- "sacrebleu": 0.21952913609680724,
1132
- "score": 0.21952913609680724,
1133
  "score_name": "sacrebleu",
1134
- "score_ci_low": 0.1789920824532995,
1135
- "score_ci_high": 0.2645460018949846,
1136
- "sacrebleu_ci_low": 0.1789920824532995,
1137
- "sacrebleu_ci_high": 0.2645460018949846
1138
  },
1139
  "mt_flores_101_jpn_eng": {
1140
  "num_of_instances": 66,
1141
  "counts": [
1142
- 1041,
1143
- 455,
1144
- 240,
1145
- 140
1146
  ],
1147
  "totals": [
1148
- 3373,
1149
- 3307,
1150
- 3241,
1151
- 3175
1152
  ],
1153
  "precisions": [
1154
- 0.30862733471686926,
1155
- 0.13758693680072573,
1156
- 0.07405121875964209,
1157
- 0.04409448818897638
1158
  ],
1159
  "bp": 1.0,
1160
- "sys_len": 3373,
1161
  "ref_len": 1734,
1162
- "sacrebleu": 0.10851306405113036,
1163
- "score": 0.10851306405113036,
1164
  "score_name": "sacrebleu",
1165
- "score_ci_low": 0.07946974341775867,
1166
- "score_ci_high": 0.13751739390259268,
1167
- "sacrebleu_ci_low": 0.07946974341775867,
1168
- "sacrebleu_ci_high": 0.13751739390259268
1169
  },
1170
  "mt_flores_101_kor_eng": {
1171
  "num_of_instances": 66,
1172
  "counts": [
1173
- 989,
1174
- 446,
1175
- 231,
1176
  127
1177
  ],
1178
  "totals": [
1179
- 3117,
1180
- 3051,
1181
- 2985,
1182
- 2919
1183
  ],
1184
  "precisions": [
1185
- 0.3172922682066089,
1186
- 0.14618157980989838,
1187
- 0.07738693467336684,
1188
- 0.04350805070229531
1189
  ],
1190
  "bp": 1.0,
1191
- "sys_len": 3117,
1192
  "ref_len": 1734,
1193
- "sacrebleu": 0.11178855764603894,
1194
- "score": 0.11178855764603894,
1195
  "score_name": "sacrebleu",
1196
- "score_ci_low": 0.08704919524478445,
1197
- "score_ci_high": 0.14019227407461762,
1198
- "sacrebleu_ci_low": 0.08704919524478445,
1199
- "sacrebleu_ci_high": 0.14019227407461762
1200
  },
1201
  "mt_flores_101_por_eng": {
1202
  "num_of_instances": 66,
1203
  "counts": [
1204
- 1279,
1205
- 803,
1206
- 560,
1207
- 401
1208
  ],
1209
  "totals": [
1210
- 4379,
1211
- 4313,
1212
- 4247,
1213
- 4181
1214
  ],
1215
  "precisions": [
1216
- 0.29207581639643754,
1217
- 0.18618131231161605,
1218
- 0.13185778196373912,
1219
- 0.0959100693613968
1220
  ],
1221
  "bp": 1.0,
1222
- "sys_len": 4379,
1223
  "ref_len": 1734,
1224
- "sacrebleu": 0.16193861267968562,
1225
- "score": 0.16193861267968562,
1226
  "score_name": "sacrebleu",
1227
- "score_ci_low": 0.13000309557072665,
1228
- "score_ci_high": 0.19897125526163875,
1229
- "sacrebleu_ci_low": 0.13000309557072665,
1230
- "sacrebleu_ci_high": 0.19897125526163875
1231
  },
1232
  "mt_flores_101_ron_eng": {
1233
  "num_of_instances": 66,
1234
  "counts": [
1235
- 1220,
1236
- 701,
1237
- 443,
1238
- 287
1239
  ],
1240
  "totals": [
1241
- 3361,
1242
- 3295,
1243
- 3229,
1244
- 3163
1245
  ],
1246
  "precisions": [
1247
- 0.36298720618863434,
1248
- 0.21274658573596358,
1249
- 0.13719417776401363,
1250
- 0.09073664242807462
1251
  ],
1252
  "bp": 1.0,
1253
- "sys_len": 3361,
1254
  "ref_len": 1734,
1255
- "sacrebleu": 0.1760832623038549,
1256
- "score": 0.1760832623038549,
1257
  "score_name": "sacrebleu",
1258
- "score_ci_low": 0.13158026543556073,
1259
- "score_ci_high": 0.2197696196203422,
1260
- "sacrebleu_ci_low": 0.13158026543556073,
1261
- "sacrebleu_ci_high": 0.2197696196203422
1262
  },
1263
  "mt_flores_101_spa_eng": {
1264
  "num_of_instances": 66,
1265
  "counts": [
1266
- 1169,
1267
- 609,
1268
- 355,
1269
  202
1270
  ],
1271
  "totals": [
1272
- 2961,
1273
- 2895,
1274
- 2829,
1275
- 2763
1276
  ],
1277
  "precisions": [
1278
- 0.3947990543735224,
1279
- 0.21036269430051813,
1280
- 0.12548603746907033,
1281
- 0.07310893955845095
1282
  ],
1283
  "bp": 1.0,
1284
- "sys_len": 2961,
1285
  "ref_len": 1734,
1286
- "sacrebleu": 0.16614132879343932,
1287
- "score": 0.16614132879343932,
1288
  "score_name": "sacrebleu",
1289
- "score_ci_low": 0.12631240742927416,
1290
- "score_ci_high": 0.2061148130220288,
1291
- "sacrebleu_ci_low": 0.12631240742927416,
1292
- "sacrebleu_ci_high": 0.2061148130220288
1293
  },
1294
- "score": 0.1586418044095206,
1295
  "score_name": "subsets_mean",
1296
  "num_of_instances": 990
1297
  },
1298
- "score": 0.4549791248292893,
1299
  "score_name": "subsets_mean",
1300
  "num_of_instances": 12472
1301
  }
 
1
  {
2
  "environment_info": {
3
+ "timestamp_utc": "2025-06-19T15:21:49.633185Z",
4
  "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
  "--tasks",
7
  "benchmarks.bluebench",
8
  "--model",
9
  "cross_provider",
10
  "--model_args",
11
+ "model_name=watsonx/ibm/granite-3-3-8b-instruct,max_tokens=256",
12
  "--output_path",
13
  "./results/bluebench",
14
  "--log_samples",
 
42
  "cache_dir": null
43
  },
44
  "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
+ "python_version": "3.10.18",
47
  "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
  "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
  "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
  "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
  "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
  "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
  "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
  "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
  "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
  "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
  "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
  "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
  "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
  "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
  "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
  "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
  "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
  "jaraco.text": "3.12.1",
170
  "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
  }
175
  },
176
  "results": {
177
  "bias": {
178
  "safety_bbq_age": {
179
+ "accuracy": 0.5555555555555556,
180
+ "accuracy_ci_low": 0.45555555555555555,
181
+ "accuracy_ci_high": 0.6555555555555556,
182
  "score_name": "accuracy",
183
+ "score": 0.5555555555555556,
184
+ "score_ci_high": 0.6555555555555556,
185
+ "score_ci_low": 0.45555555555555555,
186
  "num_of_instances": 90
187
  },
188
  "safety_bbq_disability_status": {
189
+ "accuracy": 0.6222222222222222,
190
+ "accuracy_ci_low": 0.5222222222222223,
191
+ "accuracy_ci_high": 0.7222222222222222,
192
  "score_name": "accuracy",
193
+ "score": 0.6222222222222222,
194
+ "score_ci_high": 0.7222222222222222,
195
+ "score_ci_low": 0.5222222222222223,
196
  "num_of_instances": 90
197
  },
198
  "safety_bbq_gender_identity": {
 
206
  "num_of_instances": 90
207
  },
208
  "safety_bbq_nationality": {
209
+ "accuracy": 0.6333333333333333,
210
+ "accuracy_ci_low": 0.5333333333333333,
211
+ "accuracy_ci_high": 0.7333333333333333,
212
  "score_name": "accuracy",
213
+ "score": 0.6333333333333333,
214
+ "score_ci_high": 0.7333333333333333,
215
+ "score_ci_low": 0.5333333333333333,
216
  "num_of_instances": 90
217
  },
218
  "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.6555555555555556,
220
+ "accuracy_ci_low": 0.5555555555555556,
221
+ "accuracy_ci_high": 0.7539633744548231,
222
  "score_name": "accuracy",
223
+ "score": 0.6555555555555556,
224
+ "score_ci_high": 0.7539633744548231,
225
+ "score_ci_low": 0.5555555555555556,
226
  "num_of_instances": 90
227
  },
228
  "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.9333333333333333,
230
+ "accuracy_ci_low": 0.8666666666666667,
231
+ "accuracy_ci_high": 0.9777777777777777,
232
  "score_name": "accuracy",
233
+ "score": 0.9333333333333333,
234
+ "score_ci_high": 0.9777777777777777,
235
+ "score_ci_low": 0.8666666666666667,
236
  "num_of_instances": 90
237
  },
238
  "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.8888888888888888,
240
+ "accuracy_ci_low": 0.8222222222222222,
241
  "accuracy_ci_high": 0.9444444444444444,
242
  "score_name": "accuracy",
243
+ "score": 0.8888888888888888,
244
  "score_ci_high": 0.9444444444444444,
245
+ "score_ci_low": 0.8222222222222222,
246
  "num_of_instances": 90
247
  },
248
  "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.9333333333333333,
250
+ "accuracy_ci_low": 0.8666666666666667,
251
+ "accuracy_ci_high": 0.9777777777777777,
252
  "score_name": "accuracy",
253
+ "score": 0.9333333333333333,
254
+ "score_ci_high": 0.9777777777777777,
255
+ "score_ci_low": 0.8666666666666667,
256
  "num_of_instances": 90
257
  },
258
  "safety_bbq_religion": {
259
+ "accuracy": 0.7666666666666667,
260
+ "accuracy_ci_low": 0.6720698151047421,
261
  "accuracy_ci_high": 0.8444444444444444,
262
  "score_name": "accuracy",
263
+ "score": 0.7666666666666667,
264
  "score_ci_high": 0.8444444444444444,
265
+ "score_ci_low": 0.6720698151047421,
266
  "num_of_instances": 90
267
  },
268
  "safety_bbq_ses": {
269
+ "accuracy": 0.6333333333333333,
270
+ "accuracy_ci_low": 0.5333333333333333,
271
+ "accuracy_ci_high": 0.7283280971833935,
272
  "score_name": "accuracy",
273
+ "score": 0.6333333333333333,
274
+ "score_ci_high": 0.7283280971833935,
275
+ "score_ci_low": 0.5333333333333333,
276
  "num_of_instances": 90
277
  },
278
  "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.7666666666666667,
280
+ "accuracy_ci_low": 0.6666666666666666,
281
+ "accuracy_ci_high": 0.8444444444444444,
282
  "score_name": "accuracy",
283
+ "score": 0.7666666666666667,
284
+ "score_ci_high": 0.8444444444444444,
285
+ "score_ci_low": 0.6666666666666666,
286
  "num_of_instances": 90
287
  },
288
+ "score": 0.7515151515151515,
289
  "score_name": "subsets_mean",
290
  "num_of_instances": 990
291
  },
 
303
  "entity_extraction": {
304
  "universal_ner_en_ewt": {
305
  "num_of_instances": 1000,
306
+ "f1_Person": 0.5102639296187683,
307
+ "f1_Organization": 0.3381294964028777,
308
+ "f1_Location": 0.35652173913043483,
309
+ "f1_macro": 0.40163838838402693,
310
+ "recall_macro": 0.3240210323686792,
311
+ "precision_macro": 0.530656067251462,
312
+ "in_classes_support": 0.5625,
313
+ "f1_micro": 0.31789282470481384,
314
+ "recall_micro": 0.3333333333333333,
315
+ "precision_micro": 0.3038194444444444,
316
+ "score": 0.31789282470481384,
317
  "score_name": "f1_micro",
318
+ "score_ci_low": 0.26482961534023236,
319
+ "score_ci_high": 0.37029988780714157,
320
+ "f1_micro_ci_low": 0.26482961534023236,
321
+ "f1_micro_ci_high": 0.37029988780714157
322
  },
323
+ "score": 0.31789282470481384,
324
  "score_name": "subsets_mean",
325
  "num_of_instances": 1000
326
  },
327
  "knowledge": {
328
  "mmlu_pro_biology": {
329
+ "accuracy": 0.5211267605633803,
330
+ "accuracy_ci_low": 0.4084507042253521,
331
+ "accuracy_ci_high": 0.6338028169014085,
332
  "score_name": "accuracy",
333
+ "score": 0.5211267605633803,
334
+ "score_ci_high": 0.6338028169014085,
335
+ "score_ci_low": 0.4084507042253521,
336
  "num_of_instances": 71
337
  },
338
  "mmlu_pro_business": {
339
  "accuracy": 0.19718309859154928,
340
  "accuracy_ci_low": 0.11267605633802817,
341
+ "accuracy_ci_high": 0.29577464788732394,
342
  "score_name": "accuracy",
343
  "score": 0.19718309859154928,
344
+ "score_ci_high": 0.29577464788732394,
345
  "score_ci_low": 0.11267605633802817,
346
  "num_of_instances": 71
347
  },
348
  "mmlu_pro_chemistry": {
349
+ "accuracy": 0.23943661971830985,
350
+ "accuracy_ci_low": 0.15492957746478872,
351
+ "accuracy_ci_high": 0.3380281690140845,
352
  "score_name": "accuracy",
353
+ "score": 0.23943661971830985,
354
+ "score_ci_high": 0.3380281690140845,
355
+ "score_ci_low": 0.15492957746478872,
356
  "num_of_instances": 71
357
  },
358
  "mmlu_pro_computer_science": {
359
+ "accuracy": 0.43661971830985913,
360
+ "accuracy_ci_low": 0.323943661971831,
361
+ "accuracy_ci_high": 0.5492957746478874,
362
+ "score_name": "accuracy",
363
+ "score": 0.43661971830985913,
364
+ "score_ci_high": 0.5492957746478874,
365
+ "score_ci_low": 0.323943661971831,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
  "accuracy": 0.38028169014084506,
370
  "accuracy_ci_low": 0.2676056338028169,
371
  "accuracy_ci_high": 0.49295774647887325,
 
375
  "score_ci_low": 0.2676056338028169,
376
  "num_of_instances": 71
377
  },
378
  "mmlu_pro_engineering": {
379
+ "accuracy": 0.2535211267605634,
380
+ "accuracy_ci_low": 0.16901408450704225,
381
+ "accuracy_ci_high": 0.36048330202820134,
382
  "score_name": "accuracy",
383
+ "score": 0.2535211267605634,
384
+ "score_ci_high": 0.36048330202820134,
385
+ "score_ci_low": 0.16901408450704225,
386
  "num_of_instances": 71
387
  },
388
  "mmlu_pro_health": {
389
+ "accuracy": 0.36619718309859156,
390
+ "accuracy_ci_low": 0.2535211267605634,
391
+ "accuracy_ci_high": 0.4788732394366197,
392
  "score_name": "accuracy",
393
+ "score": 0.36619718309859156,
394
+ "score_ci_high": 0.4788732394366197,
395
+ "score_ci_low": 0.2535211267605634,
396
  "num_of_instances": 71
397
  },
398
  "mmlu_pro_history": {
399
+ "accuracy": 0.36619718309859156,
400
+ "accuracy_ci_low": 0.2535211267605634,
401
  "accuracy_ci_high": 0.4788732394366197,
402
  "score_name": "accuracy",
403
+ "score": 0.36619718309859156,
404
  "score_ci_high": 0.4788732394366197,
405
+ "score_ci_low": 0.2535211267605634,
406
  "num_of_instances": 71
407
  },
408
  "mmlu_pro_law": {
409
+ "accuracy": 0.36619718309859156,
410
+ "accuracy_ci_low": 0.2535211267605634,
411
+ "accuracy_ci_high": 0.4788732394366197,
412
  "score_name": "accuracy",
413
+ "score": 0.36619718309859156,
414
+ "score_ci_high": 0.4788732394366197,
415
+ "score_ci_low": 0.2535211267605634,
416
  "num_of_instances": 71
417
  },
418
  "mmlu_pro_math": {
419
+ "accuracy": 0.1267605633802817,
420
+ "accuracy_ci_low": 0.056338028169014086,
421
+ "accuracy_ci_high": 0.22535211267605634,
422
  "score_name": "accuracy",
423
+ "score": 0.1267605633802817,
424
+ "score_ci_high": 0.22535211267605634,
425
+ "score_ci_low": 0.056338028169014086,
426
  "num_of_instances": 71
427
  },
428
  "mmlu_pro_other": {
429
+ "accuracy": 0.22535211267605634,
430
+ "accuracy_ci_low": 0.14084507042253522,
431
+ "accuracy_ci_high": 0.323943661971831,
432
  "score_name": "accuracy",
433
+ "score": 0.22535211267605634,
434
+ "score_ci_high": 0.323943661971831,
435
+ "score_ci_low": 0.14084507042253522,
436
  "num_of_instances": 71
437
  },
438
  "mmlu_pro_philosophy": {
439
+ "accuracy": 0.4084507042253521,
440
+ "accuracy_ci_low": 0.30985915492957744,
441
+ "accuracy_ci_high": 0.5352112676056338,
442
  "score_name": "accuracy",
443
+ "score": 0.4084507042253521,
444
+ "score_ci_high": 0.5352112676056338,
445
+ "score_ci_low": 0.30985915492957744,
446
  "num_of_instances": 71
447
  },
448
  "mmlu_pro_physics": {
449
+ "accuracy": 0.29577464788732394,
450
+ "accuracy_ci_low": 0.19718309859154928,
451
+ "accuracy_ci_high": 0.4084507042253521,
452
  "score_name": "accuracy",
453
+ "score": 0.29577464788732394,
454
+ "score_ci_high": 0.4084507042253521,
455
+ "score_ci_low": 0.19718309859154928,
456
  "num_of_instances": 71
457
  },
458
  "mmlu_pro_psychology": {
459
+ "accuracy": 0.5352112676056338,
460
+ "accuracy_ci_low": 0.4084507042253521,
461
+ "accuracy_ci_high": 0.647887323943662,
462
  "score_name": "accuracy",
463
+ "score": 0.5352112676056338,
464
+ "score_ci_high": 0.647887323943662,
465
+ "score_ci_low": 0.4084507042253521,
466
  "num_of_instances": 71
467
  },
468
+ "score": 0.33702213279678067,
469
  "score_name": "subsets_mean",
470
  "num_of_instances": 994
471
  },
472
  "legal": {
473
  "legalbench_abercrombie": {
474
+ "f1_macro": 0.2696554985630616,
475
+ "f1_suggestive": 0.2727272727272727,
476
+ "f1_arbitrary": 0.43137254901960786,
477
+ "f1_generic": 0.11764705882352941,
478
+ "f1_fanciful": 0.2,
479
+ "f1_descriptive": 0.32653061224489793,
480
+ "f1_macro_ci_low": 0.18689773936584586,
481
+ "f1_macro_ci_high": 0.37923074712363225,
482
  "score_name": "f1_micro",
483
+ "score": 0.31446540880503143,
484
+ "score_ci_high": 0.42038216560509556,
485
+ "score_ci_low": 0.21656050955414013,
486
  "num_of_instances": 85,
487
+ "accuracy": 0.29411764705882354,
488
+ "accuracy_ci_low": 0.2,
489
+ "accuracy_ci_high": 0.4,
490
+ "f1_micro": 0.31446540880503143,
491
+ "f1_micro_ci_low": 0.21656050955414013,
492
+ "f1_micro_ci_high": 0.42038216560509556
493
  },
494
  "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5388253241800153,
496
+ "f1_no": 0.7298245614035088,
497
+ "f1_yes": 0.34782608695652173,
498
+ "f1_macro_ci_low": 0.47191290375757455,
499
+ "f1_macro_ci_high": 0.6216206779092042,
500
  "score_name": "f1_micro",
501
+ "score": 0.636604774535809,
502
+ "score_ci_high": 0.6985040092826637,
503
+ "score_ci_low": 0.5691144311757004,
504
  "num_of_instances": 200,
505
+ "accuracy": 0.6,
506
+ "accuracy_ci_low": 0.53,
507
+ "accuracy_ci_high": 0.665,
508
+ "f1_micro": 0.636604774535809,
509
+ "f1_micro_ci_low": 0.5691144311757004,
510
+ "f1_micro_ci_high": 0.6985040092826637
511
  },
512
  "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.2947177227927682,
514
+ "f1_conclusion": 0.2127659574468085,
515
+ "f1_decree": 0.23529411764705882,
516
+ "f1_issue": 0.2711864406779661,
517
+ "f1_rule": 0.42857142857142855,
518
+ "f1_analysis": 0.4444444444444444,
519
  "f1_facts": 0.21621621621621623,
520
+ "f1_procedural history": 0.2545454545454545,
521
+ "f1_macro_ci_low": 0.23794703715833648,
522
+ "f1_macro_ci_high": 0.36665623309642204,
523
  "score_name": "f1_micro",
524
+ "score": 0.30409356725146197,
525
+ "score_ci_high": 0.3711587285161421,
526
+ "score_ci_low": 0.23855266549315363,
527
  "num_of_instances": 200,
528
+ "accuracy": 0.26,
529
+ "accuracy_ci_low": 0.2,
530
+ "accuracy_ci_high": 0.32,
531
+ "f1_micro": 0.30409356725146197,
532
+ "f1_micro_ci_low": 0.23855266549315363,
533
+ "f1_micro_ci_high": 0.3711587285161421
534
  },
535
  "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.49092908191313905,
537
+ "f1_yes": 0.5700934579439252,
538
+ "f1_no": 0.4117647058823529,
539
+ "f1_macro_ci_low": 0.4178065856787266,
540
+ "f1_macro_ci_high": 0.5601203681213927,
541
  "score_name": "f1_micro",
542
+ "score": 0.5,
543
+ "score_ci_high": 0.566970455032283,
544
+ "score_ci_low": 0.42555336134062,
545
  "num_of_instances": 200,
546
+ "accuracy": 0.48,
547
+ "accuracy_ci_low": 0.405,
548
+ "accuracy_ci_high": 0.545,
549
+ "f1_micro": 0.5,
550
+ "f1_micro_ci_low": 0.42555336134062,
551
+ "f1_micro_ci_high": 0.566970455032283
552
  },
553
  "legalbench_proa": {
554
+ "f1_macro": 0.8315276273022751,
555
+ "f1_yes": 0.8169014084507042,
556
  "f1_no": 0.8461538461538461,
557
+ "f1_macro_ci_low": 0.7549023325928579,
558
+ "f1_macro_ci_high": 0.890440353074843,
559
  "score_name": "f1_micro",
560
+ "score": 0.8322147651006712,
561
+ "score_ci_high": 0.8903225806451613,
562
+ "score_ci_low": 0.7554946760306516,
563
  "num_of_instances": 85,
564
+ "accuracy": 0.7294117647058823,
565
+ "accuracy_ci_low": 0.6352941176470588,
566
+ "accuracy_ci_high": 0.8117647058823529,
567
+ "f1_micro": 0.8322147651006712,
568
+ "f1_micro_ci_low": 0.7554946760306516,
569
+ "f1_micro_ci_high": 0.8903225806451613
570
  },
571
+ "score": 0.5174757031385947,
572
  "score_name": "subsets_mean",
573
  "num_of_instances": 770
574
  },
575
  "news_classification": {
576
  "20_newsgroups_short": {
577
+ "f1_macro": 0.42272407811143237,
578
+ "f1_cars": 0.6078431372549019,
579
+ "f1_pc hardware": 0.34080717488789236,
580
+ "f1_windows x": 0.029850746268656716,
581
+ "f1_computer graphics": 0.4367816091954023,
582
+ "f1_atheism": 0.21739130434782608,
583
+ "f1_religion": 0.23300970873786409,
584
+ "f1_medicine": 0.8641975308641975,
585
+ "f1_christianity": 0.1694915254237288,
586
  "f1_microsoft windows": 0.39436619718309857,
587
+ "f1_middle east": 0.43037974683544306,
588
+ "f1_politics": 0.291970802919708,
589
+ "f1_motorcycles": 0.43902439024390244,
590
+ "f1_mac hardware": 0.09090909090909091,
591
+ "f1_for sale": 0.625,
592
+ "f1_guns": 0.18181818181818182,
593
  "f1_space": 0.5569620253164557,
594
+ "f1_cryptography": 0.4482758620689655,
595
+ "f1_baseball": 0.8545454545454545,
596
  "f1_hockey": 0.859504132231405,
597
+ "f1_electronics": 0.38235294117647056,
598
+ "f1_macro_ci_low": 0.3988534736802405,
599
+ "f1_macro_ci_high": 0.4557473948035634,
600
  "score_name": "f1_micro",
601
+ "score": 0.44368600682593856,
602
+ "score_ci_high": 0.47444463958776134,
603
+ "score_ci_low": 0.4135801299006492,
604
  "num_of_instances": 1000,
605
+ "accuracy": 0.39,
606
+ "accuracy_ci_low": 0.36,
607
+ "accuracy_ci_high": 0.418,
608
+ "f1_micro": 0.44368600682593856,
609
+ "f1_micro_ci_low": 0.4135801299006492,
610
+ "f1_micro_ci_high": 0.47444463958776134
611
  },
612
+ "score": 0.44368600682593856,
613
  "score_name": "subsets_mean",
614
  "num_of_instances": 1000
615
  },
616
  "product_help": {
617
  "cfpb_product_2023": {
618
+ "f1_macro": 0.6409217061975553,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9205673758865248,
620
+ "f1_credit card or prepaid card": 0.6363636363636364,
621
+ "f1_checking or savings account": 0.7766990291262136,
622
+ "f1_mortgage": 0.7777777777777778,
623
+ "f1_debt collection": 0.6222222222222222,
624
+ "f1_student loan": 0.88,
625
+ "f1_payday loan or title loan or personal loan": 0.35294117647058826,
626
+ "f1_vehicle loan or lease": 0.5517241379310345,
627
+ "f1_money transfer or virtual currency or money service": 0.25,
628
+ "f1_macro_ci_low": 0.5901810957914123,
629
+ "f1_macro_ci_high": 0.7054871287846897,
630
  "score_name": "f1_micro",
631
+ "score": 0.8491446345256609,
632
+ "score_ci_high": 0.8701030927835052,
633
+ "score_ci_low": 0.8291666666666667,
634
  "num_of_instances": 1000,
635
+ "accuracy": 0.819,
636
+ "accuracy_ci_low": 0.796,
637
+ "accuracy_ci_high": 0.843,
638
+ "f1_micro": 0.8491446345256609,
639
+ "f1_micro_ci_low": 0.8291666666666667,
640
+ "f1_micro_ci_high": 0.8701030927835052
641
  },
642
  "cfpb_product_watsonx": {
643
+ "f1_macro": 0.7132677588870594,
644
+ "f1_mortgages and loans": 0.7771428571428571,
645
+ "f1_credit card": 0.7023809523809523,
646
+ "f1_debt collection": 0.6854460093896714,
647
+ "f1_credit reporting": 0.7601476014760148,
648
+ "f1_retail banking": 0.6412213740458015,
649
+ "f1_macro_ci_low": 0.672279823384184,
650
+ "f1_macro_ci_high": 0.7539657340394554,
651
  "score_name": "f1_micro",
652
+ "score": 0.7202505219206681,
653
+ "score_ci_high": 0.7576596149340853,
654
+ "score_ci_low": 0.6805865270375967,
655
  "num_of_instances": 500,
656
+ "accuracy": 0.69,
657
+ "accuracy_ci_low": 0.65,
658
+ "accuracy_ci_high": 0.73,
659
+ "f1_micro": 0.7202505219206681,
660
+ "f1_micro_ci_low": 0.6805865270375967,
661
+ "f1_micro_ci_high": 0.7576596149340853
662
  },
663
+ "score": 0.7846975782231644,
664
  "score_name": "subsets_mean",
665
  "num_of_instances": 1500
666
  },
667
  "qa_finance": {
668
  "fin_qa": {
669
  "num_of_instances": 1000,
670
+ "execution_accuracy": 0.074,
671
+ "program_accuracy": 0.085,
672
+ "score": 0.085,
673
  "score_name": "program_accuracy",
674
+ "execution_accuracy_ci_low": 0.058,
675
+ "execution_accuracy_ci_high": 0.091,
676
+ "program_accuracy_ci_low": 0.068,
677
+ "program_accuracy_ci_high": 0.102,
678
+ "score_ci_low": 0.068,
679
+ "score_ci_high": 0.102
680
  },
681
+ "score": 0.085,
682
  "score_name": "subsets_mean",
683
  "num_of_instances": 1000
684
  },
685
  "rag_general": {
686
  "rag_response_generation_clapnq": {
687
+ "precision": 0.30022844870852566,
688
+ "recall": 0.5840193774846996,
689
+ "f1": 0.3357215148632638,
690
+ "precision_ci_low": 0.28030967471726836,
691
+ "precision_ci_high": 0.32121747414474766,
692
+ "recall_ci_low": 0.565861900260428,
693
+ "recall_ci_high": 0.59971992711831,
694
+ "f1_ci_low": 0.3175124739653954,
695
+ "f1_ci_high": 0.35218969004250933,
696
  "score_name": "f1",
697
+ "score": 0.3357215148632638,
698
+ "score_ci_high": 0.35218969004250933,
699
+ "score_ci_low": 0.3175124739653954,
700
  "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6000729685028394,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6848867724835873,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5528717580189307,
704
+ "faithfullness_f1_token_overlap": 0.33597285355913525,
705
+ "faithfullness_recall_token_overlap": 0.27114762054953845,
706
+ "faithfullness_precision_token_overlap": 0.5569731492695155,
707
+ "correctness_f1_token_overlap": 0.3357215148632638,
708
+ "correctness_recall_token_overlap": 0.5840193774846996,
709
+ "correctness_precision_token_overlap": 0.30022844870852566
710
  },
711
+ "score": 0.3357215148632638,
712
  "score_name": "subsets_mean",
713
  "num_of_instances": 600
714
  },
715
  "reasoning": {
716
  "hellaswag": {
717
+ "accuracy": 0.455,
718
+ "accuracy_ci_low": 0.423,
719
+ "accuracy_ci_high": 0.485,
720
  "score_name": "accuracy",
721
+ "score": 0.455,
722
+ "score_ci_high": 0.485,
723
+ "score_ci_low": 0.423,
724
  "num_of_instances": 1000
725
  },
726
  "openbook_qa": {
727
+ "accuracy": 0.782,
728
+ "accuracy_ci_low": 0.744,
729
+ "accuracy_ci_high": 0.818,
730
  "score_name": "accuracy",
731
+ "score": 0.782,
732
+ "score_ci_high": 0.818,
733
+ "score_ci_low": 0.744,
734
  "num_of_instances": 500
735
  },
736
+ "score": 0.6185,
737
  "score_name": "subsets_mean",
738
  "num_of_instances": 1500
739
  },
 
743
  "severity_high": 0.0,
744
  "severity_medium": 0.0,
745
  "severity_low": 100.0,
746
+ "category_harmful_info": 0.8662501443103547,
747
+ "category_deception": 0.8769584958149929,
748
+ "category_explicit_content": 0.8376609406085929,
749
+ "category_violence": 0.8456707175466277,
750
+ "category_discrimination": 0.8991804392564864,
751
+ "category_substance_abuse": 0.8935730718904071,
752
+ "category_pii": 0.8467056638112774,
753
+ "safety": 0.860165383811626,
754
+ "safety_ci_low": 0.8426579493684276,
755
+ "safety_ci_high": 0.8778207056405374,
756
  "score_name": "safety",
757
+ "score": 0.860165383811626,
758
+ "score_ci_high": 0.8778207056405374,
759
+ "score_ci_low": 0.8426579493684276,
760
  "num_of_instances": 100
761
  },
762
+ "score": 0.860165383811626,
763
  "score_name": "subsets_mean",
764
  "num_of_instances": 100
765
  },
766
  "summarization": {
767
  "billsum_document_filtered_to_6000_chars": {
768
  "num_of_instances": 528,
769
+ "rougeL": 0.28749112328021914,
770
+ "score": 0.28749112328021914,
771
  "score_name": "rougeL",
772
+ "rougeLsum": 0.3503820014906059,
773
+ "rouge2": 0.20109007723824623,
774
+ "rouge1": 0.4200585739584912,
775
+ "rougeL_ci_low": 0.2804794753326623,
776
+ "rougeL_ci_high": 0.29447838537921134,
777
+ "score_ci_low": 0.2804794753326623,
778
+ "score_ci_high": 0.29447838537921134,
779
+ "rougeLsum_ci_low": 0.341921573094731,
780
+ "rougeLsum_ci_high": 0.35863585426859207,
781
+ "rouge2_ci_low": 0.19416899053732958,
782
+ "rouge2_ci_high": 0.20872476773642967,
783
+ "rouge1_ci_low": 0.41035793857223635,
784
+ "rouge1_ci_high": 0.4281932704537228
785
  },
786
  "tldr_document_filtered_to_6000_chars": {
787
  "num_of_instances": 1000,
788
+ "rougeL": 0.07979202357473647,
789
+ "score": 0.07979202357473647,
790
  "score_name": "rougeL",
791
+ "rougeLsum": 0.0922932399263996,
792
+ "rouge2": 0.015117853576507847,
793
+ "rouge1": 0.11247814548815566,
794
+ "rougeL_ci_low": 0.0764789144644062,
795
+ "rougeL_ci_high": 0.08304032568245756,
796
+ "score_ci_low": 0.0764789144644062,
797
+ "score_ci_high": 0.08304032568245756,
798
+ "rougeLsum_ci_low": 0.0880597944044916,
799
+ "rougeLsum_ci_high": 0.09606464509440052,
800
+ "rouge2_ci_low": 0.01362250797390663,
801
+ "rouge2_ci_high": 0.0168799885499115,
802
+ "rouge1_ci_low": 0.10733708561154955,
803
+ "rouge1_ci_high": 0.11723898467910755
804
  },
805
+ "score": 0.1836415734274778,
806
  "score_name": "subsets_mean",
807
  "num_of_instances": 1528
808
  },
 
810
  "mt_flores_101_ara_eng": {
811
  "num_of_instances": 66,
812
  "counts": [
813
+ 1154,
814
+ 637,
815
+ 382,
816
+ 237
817
  ],
818
  "totals": [
819
+ 3013,
820
+ 2947,
821
+ 2881,
822
+ 2815
823
  ],
824
  "precisions": [
825
+ 0.383006969797544,
826
+ 0.2161520190023753,
827
+ 0.13259284970496357,
828
+ 0.08419182948490231
829
  ],
830
  "bp": 1.0,
831
+ "sys_len": 3013,
832
  "ref_len": 1734,
833
+ "sacrebleu": 0.17435684678472682,
834
+ "score": 0.17435684678472682,
835
  "score_name": "sacrebleu",
836
+ "score_ci_low": 0.12709535962365245,
837
+ "score_ci_high": 0.21064271607309265,
838
+ "sacrebleu_ci_low": 0.12709535962365245,
839
+ "sacrebleu_ci_high": 0.21064271607309265
840
  },
841
  "mt_flores_101_deu_eng": {
842
  "num_of_instances": 66,
843
  "counts": [
844
+ 1215,
845
+ 695,
846
+ 422,
847
+ 256
848
  ],
849
  "totals": [
850
+ 3433,
851
+ 3367,
852
+ 3301,
853
+ 3235
854
  ],
855
  "precisions": [
856
+ 0.35391785610253423,
857
+ 0.20641520641520641,
858
+ 0.12784004847016056,
859
+ 0.07913446676970634
860
  ],
861
  "bp": 1.0,
862
+ "sys_len": 3433,
863
  "ref_len": 1734,
864
+ "sacrebleu": 0.16488046075977367,
865
+ "score": 0.16488046075977367,
866
  "score_name": "sacrebleu",
867
+ "score_ci_low": 0.12825986690370522,
868
+ "score_ci_high": 0.20812836267228596,
869
+ "sacrebleu_ci_low": 0.12825986690370522,
870
+ "sacrebleu_ci_high": 0.20812836267228596
871
  },
872
  "mt_flores_101_eng_ara": {
873
  "num_of_instances": 66,
874
  "counts": [
875
+ 726,
876
+ 321,
877
+ 159,
878
+ 82
879
  ],
880
  "totals": [
881
+ 2297,
882
+ 2231,
883
+ 2165,
884
+ 2099
885
  ],
886
  "precisions": [
887
+ 0.3160644318676535,
888
+ 0.14388166741371583,
889
+ 0.07344110854503465,
890
+ 0.03906622201048118
891
  ],
892
  "bp": 1.0,
893
+ "sys_len": 2297,
894
  "ref_len": 1589,
895
+ "sacrebleu": 0.10687605905530678,
896
+ "score": 0.10687605905530678,
897
  "score_name": "sacrebleu",
898
+ "score_ci_low": 0.08639846348006232,
899
+ "score_ci_high": 0.13425269082562755,
900
+ "sacrebleu_ci_low": 0.08639846348006232,
901
+ "sacrebleu_ci_high": 0.13425269082562755
902
  },
903
  "mt_flores_101_eng_deu": {
904
  "num_of_instances": 66,
905
  "counts": [
906
+ 1066,
907
+ 564,
908
+ 332,
909
+ 194
910
  ],
911
  "totals": [
912
+ 2300,
913
+ 2234,
914
+ 2168,
915
+ 2102
916
  ],
917
  "precisions": [
918
+ 0.46347826086956523,
919
+ 0.252461951656222,
920
+ 0.15313653136531366,
921
+ 0.0922930542340628
922
  ],
923
  "bp": 1.0,
924
+ "sys_len": 2300,
925
  "ref_len": 1835,
926
+ "sacrebleu": 0.2016593123773307,
927
+ "score": 0.2016593123773307,
928
  "score_name": "sacrebleu",
929
+ "score_ci_low": 0.177292145733578,
930
+ "score_ci_high": 0.24439707428713803,
931
+ "sacrebleu_ci_low": 0.177292145733578,
932
+ "sacrebleu_ci_high": 0.24439707428713803
933
  },
934
  "mt_flores_101_eng_fra": {
935
  "num_of_instances": 66,
936
  "counts": [
937
+ 1409,
938
+ 950,
939
+ 692,
940
+ 517
941
  ],
942
  "totals": [
943
+ 3275,
944
+ 3209,
945
+ 3143,
946
+ 3077
947
  ],
948
  "precisions": [
949
+ 0.4302290076335878,
950
+ 0.2960423808039888,
951
+ 0.2201718103722558,
952
+ 0.168020799480013
953
  ],
954
  "bp": 1.0,
955
+ "sys_len": 3275,
956
  "ref_len": 2068,
957
+ "sacrebleu": 0.2619959538476516,
958
+ "score": 0.2619959538476516,
959
  "score_name": "sacrebleu",
960
+ "score_ci_low": 0.21071110880640612,
961
+ "score_ci_high": 0.30599931494111227,
962
+ "sacrebleu_ci_low": 0.21071110880640612,
963
+ "sacrebleu_ci_high": 0.30599931494111227
964
  },
965
  "mt_flores_101_eng_kor": {
966
  "num_of_instances": 66,
967
  "counts": [
968
+ 1096,
969
+ 465,
970
+ 233,
971
+ 132
972
  ],
973
  "totals": [
974
+ 3883,
975
+ 3817,
976
+ 3751,
977
+ 3685
978
  ],
979
  "precisions": [
980
+ 0.28225598763842386,
981
+ 0.12182342153523709,
982
+ 0.0621167688616369,
983
+ 0.03582089552238806
984
  ],
985
  "bp": 1.0,
986
+ "sys_len": 3883,
987
  "ref_len": 2235,
988
+ "sacrebleu": 0.09352545142421302,
989
+ "score": 0.09352545142421302,
990
  "score_name": "sacrebleu",
991
+ "score_ci_low": 0.0763987126727994,
992
+ "score_ci_high": 0.11617390981932266,
993
+ "sacrebleu_ci_low": 0.0763987126727994,
994
+ "sacrebleu_ci_high": 0.11617390981932266
995
  },
996
  "mt_flores_101_eng_por": {
997
  "num_of_instances": 66,
998
  "counts": [
999
+ 1328,
1000
+ 850,
1001
+ 588,
1002
+ 412
1003
  ],
1004
  "totals": [
1005
+ 3030,
1006
+ 2964,
1007
+ 2898,
1008
+ 2832
1009
  ],
1010
  "precisions": [
1011
+ 0.4382838283828383,
1012
+ 0.286774628879892,
1013
+ 0.2028985507246377,
1014
+ 0.14548022598870058
1015
  ],
1016
  "bp": 1.0,
1017
+ "sys_len": 3030,
1018
  "ref_len": 1916,
1019
+ "sacrebleu": 0.2467997817029595,
1020
+ "score": 0.2467997817029595,
1021
  "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.193392163449652,
1023
+ "score_ci_high": 0.2974642241791255,
1024
+ "sacrebleu_ci_low": 0.193392163449652,
1025
+ "sacrebleu_ci_high": 0.2974642241791255
1026
  },
1027
  "mt_flores_101_eng_ron": {
1028
  "num_of_instances": 66,
1029
  "counts": [
1030
+ 930,
1031
+ 400,
1032
+ 214,
1033
+ 123
1034
  ],
1035
  "totals": [
1036
+ 2961,
1037
+ 2895,
1038
+ 2829,
1039
+ 2763
1040
  ],
1041
  "precisions": [
1042
+ 0.3140830800405269,
1043
+ 0.1381692573402418,
1044
+ 0.07564510427712973,
1045
+ 0.04451682953311618
1046
  ],
1047
  "bp": 1.0,
1048
+ "sys_len": 2961,
1049
  "ref_len": 1949,
1050
+ "sacrebleu": 0.1099487393546487,
1051
+ "score": 0.1099487393546487,
1052
  "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.08284384518142485,
1054
+ "score_ci_high": 0.13880651312628609,
1055
+ "sacrebleu_ci_low": 0.08284384518142485,
1056
+ "sacrebleu_ci_high": 0.13880651312628609
1057
  },
1058
  "mt_flores_101_eng_spa": {
1059
  "num_of_instances": 66,
1060
  "counts": [
1061
+ 1217,
1062
+ 624,
1063
+ 347,
1064
+ 198
1065
  ],
1066
  "totals": [
1067
+ 3045,
1068
+ 2979,
1069
+ 2913,
1070
+ 2847
1071
  ],
1072
  "precisions": [
1073
+ 0.399671592775041,
1074
+ 0.20946626384692849,
1075
+ 0.11912118091314795,
1076
+ 0.06954689146469968
1077
  ],
1078
  "bp": 1.0,
1079
+ "sys_len": 3045,
1080
  "ref_len": 2098,
1081
+ "sacrebleu": 0.1622822499255264,
1082
+ "score": 0.1622822499255264,
1083
  "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.13321857221475644,
1085
+ "score_ci_high": 0.19390301665624113,
1086
+ "sacrebleu_ci_low": 0.13321857221475644,
1087
+ "sacrebleu_ci_high": 0.19390301665624113
1088
  },
1089
  "mt_flores_101_fra_eng": {
1090
  "num_of_instances": 66,
1091
  "counts": [
1092
+ 1236,
1093
+ 735,
1094
+ 470,
1095
+ 308
1096
  ],
1097
  "totals": [
1098
+ 2952,
1099
+ 2886,
1100
+ 2820,
1101
+ 2754
1102
  ],
1103
  "precisions": [
1104
+ 0.4186991869918699,
1105
+ 0.25467775467775466,
1106
+ 0.16666666666666669,
1107
+ 0.11183732752360204
1108
  ],
1109
  "bp": 1.0,
1110
+ "sys_len": 2952,
1111
  "ref_len": 1734,
1112
+ "sacrebleu": 0.2111456628673961,
1113
+ "score": 0.2111456628673961,
1114
  "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.1728340034401921,
1116
+ "score_ci_high": 0.26908287892628974,
1117
+ "sacrebleu_ci_low": 0.1728340034401921,
1118
+ "sacrebleu_ci_high": 0.26908287892628974
1119
  },
1120
  "mt_flores_101_jpn_eng": {
1121
  "num_of_instances": 66,
1122
  "counts": [
1123
+ 1018,
1124
+ 437,
1125
+ 232,
1126
+ 128
1127
  ],
1128
  "totals": [
1129
+ 3130,
1130
+ 3064,
1131
+ 2998,
1132
+ 2932
1133
  ],
1134
  "precisions": [
1135
+ 0.3252396166134185,
1136
+ 0.14262402088772846,
1137
+ 0.07738492328218813,
1138
+ 0.04365620736698499
1139
  ],
1140
  "bp": 1.0,
1141
+ "sys_len": 3130,
1142
  "ref_len": 1734,
1143
+ "sacrebleu": 0.11188570922324435,
1144
+ "score": 0.11188570922324435,
1145
  "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.09154049326122426,
1147
+ "score_ci_high": 0.13827539969992217,
1148
+ "sacrebleu_ci_low": 0.09154049326122426,
1149
+ "sacrebleu_ci_high": 0.13827539969992217
1150
  },
1151
  "mt_flores_101_kor_eng": {
1152
  "num_of_instances": 66,
1153
  "counts": [
1154
+ 986,
1155
+ 447,
1156
+ 233,
1157
  127
1158
  ],
1159
  "totals": [
1160
+ 3637,
1161
+ 3571,
1162
+ 3505,
1163
+ 3439
1164
  ],
1165
  "precisions": [
1166
+ 0.27110255705251585,
1167
+ 0.12517502100252031,
1168
+ 0.06647646219686162,
1169
+ 0.03692933992439663
1170
  ],
1171
  "bp": 1.0,
1172
+ "sys_len": 3637,
1173
  "ref_len": 1734,
1174
+ "sacrebleu": 0.09553723823741646,
1175
+ "score": 0.09553723823741646,
1176
  "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.06933902828362079,
1178
+ "score_ci_high": 0.1273472328564688,
1179
+ "sacrebleu_ci_low": 0.06933902828362079,
1180
+ "sacrebleu_ci_high": 0.1273472328564688
1181
  },
1182
  "mt_flores_101_por_eng": {
1183
  "num_of_instances": 66,
1184
  "counts": [
1185
+ 1286,
1186
+ 834,
1187
+ 587,
1188
+ 419
1189
  ],
1190
  "totals": [
1191
+ 3404,
1192
+ 3338,
1193
+ 3272,
1194
+ 3206
1195
  ],
1196
  "precisions": [
1197
+ 0.37779083431257343,
1198
+ 0.24985020970641103,
1199
+ 0.17940097799511,
1200
+ 0.13069245165315035
1201
  ],
1202
  "bp": 1.0,
1203
+ "sys_len": 3404,
1204
  "ref_len": 1734,
1205
+ "sacrebleu": 0.21689603438287544,
1206
+ "score": 0.21689603438287544,
1207
  "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.18174547190909165,
1209
+ "score_ci_high": 0.2734022486576191,
1210
+ "sacrebleu_ci_low": 0.18174547190909165,
1211
+ "sacrebleu_ci_high": 0.2734022486576191
1212
  },
1213
  "mt_flores_101_ron_eng": {
1214
  "num_of_instances": 66,
1215
  "counts": [
1216
+ 1208,
1217
+ 675,
1218
+ 430,
1219
+ 279
1220
  ],
1221
  "totals": [
1222
+ 3677,
1223
+ 3611,
1224
+ 3545,
1225
+ 3479
1226
  ],
1227
  "precisions": [
1228
+ 0.32852869186837097,
1229
+ 0.1869288285793409,
1230
+ 0.12129760225669958,
1231
+ 0.08019545846507617
1232
  ],
1233
  "bp": 1.0,
1234
+ "sys_len": 3677,
1235
  "ref_len": 1734,
1236
+ "sacrebleu": 0.15633740352446387,
1237
+ "score": 0.15633740352446387,
1238
  "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.12255450743419968,
1240
+ "score_ci_high": 0.17971859902386644,
1241
+ "sacrebleu_ci_low": 0.12255450743419968,
1242
+ "sacrebleu_ci_high": 0.17971859902386644
1243
  },
1244
  "mt_flores_101_spa_eng": {
1245
  "num_of_instances": 66,
1246
  "counts": [
1247
+ 1135,
1248
+ 581,
1249
+ 336,
1250
  202
1251
  ],
1252
  "totals": [
1253
+ 3533,
1254
+ 3467,
1255
+ 3401,
1256
+ 3335
1257
  ],
1258
  "precisions": [
1259
+ 0.3212567223322955,
1260
+ 0.16758004038073263,
1261
+ 0.09879447221405468,
1262
+ 0.06056971514242879
1263
  ],
1264
  "bp": 1.0,
1265
+ "sys_len": 3533,
1266
  "ref_len": 1734,
1267
+ "sacrebleu": 0.133972503470666,
1268
+ "score": 0.133972503470666,
1269
  "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.10251876459928583,
1271
+ "score_ci_high": 0.17481307519673603,
1272
+ "sacrebleu_ci_low": 0.10251876459928583,
1273
+ "sacrebleu_ci_high": 0.17481307519673603
1274
  },
1275
+ "score": 0.1632066271292133,
1276
  "score_name": "subsets_mean",
1277
  "num_of_instances": 990
1278
  },
1279
+ "score": 0.4537326535720019,
1280
  "score_name": "subsets_mean",
1281
  "num_of_instances": 12472
1282
  }
results/bluebench/2025-06-19T15-57-45_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-19T19:57:39.981261Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/ibm/granite-3-2-8b-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/ibm/granite-3-2-8b-instruct",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.6444444444444445,
180
+ "accuracy_ci_low": 0.5444444444444444,
181
+ "accuracy_ci_high": 0.7444444444444445,
182
+ "score_name": "accuracy",
183
+ "score": 0.6444444444444445,
184
+ "score_ci_high": 0.7444444444444445,
185
+ "score_ci_low": 0.5444444444444444,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.7222222222222222,
190
+ "accuracy_ci_low": 0.6111111111111112,
191
+ "accuracy_ci_high": 0.8,
192
+ "score_name": "accuracy",
193
+ "score": 0.7222222222222222,
194
+ "score_ci_high": 0.8,
195
+ "score_ci_low": 0.6111111111111112,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.9111111111111111,
200
+ "accuracy_ci_low": 0.8333333333333334,
201
+ "accuracy_ci_high": 0.9555555555555556,
202
+ "score_name": "accuracy",
203
+ "score": 0.9111111111111111,
204
+ "score_ci_high": 0.9555555555555556,
205
+ "score_ci_low": 0.8333333333333334,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 0.7111111111111111,
210
+ "accuracy_ci_low": 0.6111111111111112,
211
+ "accuracy_ci_high": 0.8,
212
+ "score_name": "accuracy",
213
+ "score": 0.7111111111111111,
214
+ "score_ci_high": 0.8,
215
+ "score_ci_low": 0.6111111111111112,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.8333333333333334,
220
+ "accuracy_ci_low": 0.7444444444444445,
221
+ "accuracy_ci_high": 0.9,
222
+ "score_name": "accuracy",
223
+ "score": 0.8333333333333334,
224
+ "score_ci_high": 0.9,
225
+ "score_ci_low": 0.7444444444444445,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.9777777777777777,
230
+ "accuracy_ci_low": 0.9333333333333333,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 0.9777777777777777,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 0.9333333333333333,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.9333333333333333,
240
+ "accuracy_ci_low": 0.8666666666666667,
241
+ "accuracy_ci_high": 0.9777777777777777,
242
+ "score_name": "accuracy",
243
+ "score": 0.9333333333333333,
244
+ "score_ci_high": 0.9777777777777777,
245
+ "score_ci_low": 0.8666666666666667,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.9444444444444444,
250
+ "accuracy_ci_low": 0.8777777777777778,
251
+ "accuracy_ci_high": 0.9777777777777777,
252
+ "score_name": "accuracy",
253
+ "score": 0.9444444444444444,
254
+ "score_ci_high": 0.9777777777777777,
255
+ "score_ci_low": 0.8777777777777778,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.8111111111111111,
260
+ "accuracy_ci_low": 0.7116197011994875,
261
+ "accuracy_ci_high": 0.8888888888888888,
262
+ "score_name": "accuracy",
263
+ "score": 0.8111111111111111,
264
+ "score_ci_high": 0.8888888888888888,
265
+ "score_ci_low": 0.7116197011994875,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.6888888888888889,
270
+ "accuracy_ci_low": 0.5777777777777777,
271
+ "accuracy_ci_high": 0.7777777777777778,
272
+ "score_name": "accuracy",
273
+ "score": 0.6888888888888889,
274
+ "score_ci_high": 0.7777777777777778,
275
+ "score_ci_low": 0.5777777777777777,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8,
280
+ "accuracy_ci_low": 0.7111111111111111,
281
+ "accuracy_ci_high": 0.8666666666666667,
282
+ "score_name": "accuracy",
283
+ "score": 0.8,
284
+ "score_ci_high": 0.8666666666666667,
285
+ "score_ci_low": 0.7111111111111111,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.8161616161616162,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.5,
296
+ "score": 0.5,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.5,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.592375366568915,
307
+ "f1_Location": 0.3668122270742358,
308
+ "f1_Organization": 0.45367412140575075,
309
+ "f1_macro": 0.4709539050163005,
310
+ "recall_macro": 0.3969630056026483,
311
+ "precision_macro": 0.5946970285442043,
312
+ "in_classes_support": 0.7649572649572649,
313
+ "f1_micro": 0.4310171198388721,
314
+ "recall_micro": 0.4076190476190476,
315
+ "precision_micro": 0.45726495726495725,
316
+ "score": 0.4310171198388721,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.36016345918404075,
319
+ "score_ci_high": 0.48021577272630167,
320
+ "f1_micro_ci_low": 0.36016345918404075,
321
+ "f1_micro_ci_high": 0.48021577272630167
322
+ },
323
+ "score": 0.4310171198388721,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.5352112676056338,
330
+ "accuracy_ci_low": 0.43661971830985913,
331
+ "accuracy_ci_high": 0.647887323943662,
332
+ "score_name": "accuracy",
333
+ "score": 0.5352112676056338,
334
+ "score_ci_high": 0.647887323943662,
335
+ "score_ci_low": 0.43661971830985913,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.22535211267605634,
340
+ "accuracy_ci_low": 0.14084507042253522,
341
+ "accuracy_ci_high": 0.323943661971831,
342
+ "score_name": "accuracy",
343
+ "score": 0.22535211267605634,
344
+ "score_ci_high": 0.323943661971831,
345
+ "score_ci_low": 0.14084507042253522,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.18309859154929578,
350
+ "accuracy_ci_low": 0.10639771966263252,
351
+ "accuracy_ci_high": 0.29577464788732394,
352
+ "score_name": "accuracy",
353
+ "score": 0.18309859154929578,
354
+ "score_ci_high": 0.29577464788732394,
355
+ "score_ci_low": 0.10639771966263252,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.323943661971831,
360
+ "accuracy_ci_low": 0.2112676056338028,
361
+ "accuracy_ci_high": 0.43661971830985913,
362
+ "score_name": "accuracy",
363
+ "score": 0.323943661971831,
364
+ "score_ci_high": 0.43661971830985913,
365
+ "score_ci_low": 0.2112676056338028,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.4507042253521127,
370
+ "accuracy_ci_low": 0.323943661971831,
371
+ "accuracy_ci_high": 0.5664724235461314,
372
+ "score_name": "accuracy",
373
+ "score": 0.4507042253521127,
374
+ "score_ci_high": 0.5664724235461314,
375
+ "score_ci_low": 0.323943661971831,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.36619718309859156,
380
+ "accuracy_ci_low": 0.2535211267605634,
381
+ "accuracy_ci_high": 0.4788732394366197,
382
+ "score_name": "accuracy",
383
+ "score": 0.36619718309859156,
384
+ "score_ci_high": 0.4788732394366197,
385
+ "score_ci_low": 0.2535211267605634,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.3380281690140845,
390
+ "accuracy_ci_low": 0.22535211267605634,
391
+ "accuracy_ci_high": 0.4647887323943662,
392
+ "score_name": "accuracy",
393
+ "score": 0.3380281690140845,
394
+ "score_ci_high": 0.4647887323943662,
395
+ "score_ci_low": 0.22535211267605634,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.4084507042253521,
400
+ "accuracy_ci_low": 0.29577464788732394,
401
+ "accuracy_ci_high": 0.5211267605633803,
402
+ "score_name": "accuracy",
403
+ "score": 0.4084507042253521,
404
+ "score_ci_high": 0.5211267605633803,
405
+ "score_ci_low": 0.29577464788732394,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.3380281690140845,
410
+ "accuracy_ci_low": 0.23943661971830985,
411
+ "accuracy_ci_high": 0.4647887323943662,
412
+ "score_name": "accuracy",
413
+ "score": 0.3380281690140845,
414
+ "score_ci_high": 0.4647887323943662,
415
+ "score_ci_low": 0.23943661971830985,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.11267605633802817,
420
+ "accuracy_ci_low": 0.056338028169014086,
421
+ "accuracy_ci_high": 0.19757759490217996,
422
+ "score_name": "accuracy",
423
+ "score": 0.11267605633802817,
424
+ "score_ci_high": 0.19757759490217996,
425
+ "score_ci_low": 0.056338028169014086,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.22535211267605634,
430
+ "accuracy_ci_low": 0.14084507042253522,
431
+ "accuracy_ci_high": 0.323943661971831,
432
+ "score_name": "accuracy",
433
+ "score": 0.22535211267605634,
434
+ "score_ci_high": 0.323943661971831,
435
+ "score_ci_low": 0.14084507042253522,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.4225352112676056,
440
+ "accuracy_ci_low": 0.30985915492957744,
441
+ "accuracy_ci_high": 0.5352112676056338,
442
+ "score_name": "accuracy",
443
+ "score": 0.4225352112676056,
444
+ "score_ci_high": 0.5352112676056338,
445
+ "score_ci_low": 0.30985915492957744,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.22535211267605634,
450
+ "accuracy_ci_low": 0.14084507042253522,
451
+ "accuracy_ci_high": 0.3380281690140845,
452
+ "score_name": "accuracy",
453
+ "score": 0.22535211267605634,
454
+ "score_ci_high": 0.3380281690140845,
455
+ "score_ci_low": 0.14084507042253522,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.5211267605633803,
460
+ "accuracy_ci_low": 0.4084507042253521,
461
+ "accuracy_ci_high": 0.6338028169014085,
462
+ "score_name": "accuracy",
463
+ "score": 0.5211267605633803,
464
+ "score_ci_high": 0.6338028169014085,
465
+ "score_ci_low": 0.4084507042253521,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.33400402414486924,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.2827795486010496,
475
+ "f1_suggestive": 0.08333333333333333,
476
+ "f1_descriptive": 0.4444444444444444,
477
+ "f1_generic": 0.11764705882352941,
478
+ "f1_fanciful": 0.4827586206896552,
479
+ "f1_arbitrary": 0.2857142857142857,
480
+ "f1_macro_ci_low": 0.20381678012471904,
481
+ "f1_macro_ci_high": 0.38601597944875415,
482
+ "score_name": "f1_micro",
483
+ "score": 0.3253012048192771,
484
+ "score_ci_high": 0.42168674698795183,
485
+ "score_ci_low": 0.21686746987951808,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.3176470588235294,
488
+ "accuracy_ci_low": 0.21176470588235294,
489
+ "accuracy_ci_high": 0.4117647058823529,
490
+ "f1_micro": 0.3253012048192771,
491
+ "f1_micro_ci_low": 0.21686746987951808,
492
+ "f1_micro_ci_high": 0.42168674698795183
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5842293906810035,
496
+ "f1_no": 0.8129032258064516,
497
+ "f1_yes": 0.35555555555555557,
498
+ "f1_macro_ci_low": 0.5123650296064088,
499
+ "f1_macro_ci_high": 0.6612083568605307,
500
+ "score_name": "f1_micro",
501
+ "score": 0.71,
502
+ "score_ci_high": 0.765,
503
+ "score_ci_low": 0.64,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.71,
506
+ "accuracy_ci_low": 0.64,
507
+ "accuracy_ci_high": 0.765,
508
+ "f1_micro": 0.71,
509
+ "f1_micro_ci_low": 0.64,
510
+ "f1_micro_ci_high": 0.765
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.23684055980437102,
514
+ "f1_conclusion": 0.12,
515
+ "f1_issue": 0.2682926829268293,
516
+ "f1_decree": 0.17647058823529413,
517
+ "f1_rule": 0.4155844155844156,
518
+ "f1_analysis": 0.2608695652173913,
519
+ "f1_facts": 0.16666666666666666,
520
+ "f1_procedural history": 0.25,
521
+ "f1_macro_ci_low": 0.18399933651413464,
522
+ "f1_macro_ci_high": 0.3012128675188143,
523
+ "score_name": "f1_micro",
524
+ "score": 0.2570694087403599,
525
+ "score_ci_high": 0.31910866448170155,
526
+ "score_ci_low": 0.19563743957580057,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.25,
529
+ "accuracy_ci_low": 0.19,
530
+ "accuracy_ci_high": 0.31,
531
+ "f1_micro": 0.2570694087403599,
532
+ "f1_micro_ci_low": 0.19563743957580057,
533
+ "f1_micro_ci_high": 0.31910866448170155
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.45179063360881544,
537
+ "f1_yes": 0.5702479338842975,
538
+ "f1_no": 0.3333333333333333,
539
+ "f1_macro_ci_low": 0.3881370275424478,
540
+ "f1_macro_ci_high": 0.5208583506164292,
541
+ "score_name": "f1_micro",
542
+ "score": 0.47738693467336685,
543
+ "score_ci_high": 0.5454545454545454,
544
+ "score_ci_low": 0.41102756892230574,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.475,
547
+ "accuracy_ci_low": 0.4062357598667403,
548
+ "accuracy_ci_high": 0.54,
549
+ "f1_micro": 0.47738693467336685,
550
+ "f1_micro_ci_low": 0.41102756892230574,
551
+ "f1_micro_ci_high": 0.5454545454545454
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.7797888386123679,
555
+ "f1_yes": 0.7647058823529411,
556
+ "f1_no": 0.7948717948717948,
557
+ "f1_macro_ci_low": 0.686770027516329,
558
+ "f1_macro_ci_high": 0.847201812396528,
559
+ "score_name": "f1_micro",
560
+ "score": 0.7808219178082192,
561
+ "score_ci_high": 0.847682119205298,
562
+ "score_ci_low": 0.6846573729523644,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.6705882352941176,
565
+ "accuracy_ci_low": 0.5647058823529412,
566
+ "accuracy_ci_high": 0.7529411764705882,
567
+ "f1_micro": 0.7808219178082192,
568
+ "f1_micro_ci_low": 0.6846573729523644,
569
+ "f1_micro_ci_high": 0.847682119205298
570
+ },
571
+ "score": 0.5101158932082446,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.4855115011669257,
578
+ "f1_cars": 0.8,
579
+ "f1_windows x": 0.05555555555555555,
580
+ "f1_atheism": 0.17777777777777778,
581
+ "f1_cryptography": 0.4444444444444444,
582
+ "f1_religion": 0.23404255319148937,
583
+ "f1_medicine": 0.8,
584
+ "f1_christianity": 0.36619718309859156,
585
+ "f1_computer graphics": 0.3652173913043478,
586
+ "f1_microsoft windows": 0.19047619047619047,
587
+ "f1_middle east": 0.4675324675324675,
588
+ "f1_motorcycles": 0.693069306930693,
589
+ "f1_politics": 0.313953488372093,
590
+ "f1_pc hardware": 0.4292682926829268,
591
+ "f1_mac hardware": 0.2972972972972973,
592
+ "f1_for sale": 0.7058823529411765,
593
+ "f1_guns": 0.34375,
594
+ "f1_space": 0.6888888888888889,
595
+ "f1_baseball": 0.8909090909090909,
596
+ "f1_hockey": 0.8709677419354839,
597
+ "f1_electronics": 0.575,
598
+ "f1_macro_ci_low": 0.4606519053067645,
599
+ "f1_macro_ci_high": 0.5114904866418184,
600
+ "score_name": "f1_micro",
601
+ "score": 0.5034666666666666,
602
+ "score_ci_high": 0.5288163691152058,
603
+ "score_ci_low": 0.4713054725252697,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.472,
606
+ "accuracy_ci_low": 0.44038730175462776,
607
+ "accuracy_ci_high": 0.497,
608
+ "f1_micro": 0.5034666666666666,
609
+ "f1_micro_ci_low": 0.4713054725252697,
610
+ "f1_micro_ci_high": 0.5288163691152058
611
+ },
612
+ "score": 0.5034666666666666,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.607364388794758,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9122807017543859,
620
+ "f1_credit card or prepaid card": 0.6666666666666666,
621
+ "f1_debt collection": 0.6075949367088608,
622
+ "f1_checking or savings account": 0.673469387755102,
623
+ "f1_money transfer or virtual currency or money service": 0.5777777777777777,
624
+ "f1_vehicle loan or lease": 0.37037037037037035,
625
+ "f1_mortgage": 0.6666666666666666,
626
+ "f1_payday loan or title loan or personal loan": 0.2222222222222222,
627
+ "f1_student loan": 0.7692307692307693,
628
+ "f1_macro_ci_low": 0.5603416421881502,
629
+ "f1_macro_ci_high": 0.6682100489708924,
630
+ "score_name": "f1_micro",
631
+ "score": 0.8273716951788491,
632
+ "score_ci_high": 0.8505803933787175,
633
+ "score_ci_low": 0.8031586690475525,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.798,
636
+ "accuracy_ci_low": 0.7700141366334644,
637
+ "accuracy_ci_high": 0.8228416338853977,
638
+ "f1_micro": 0.8273716951788491,
639
+ "f1_micro_ci_low": 0.8031586690475525,
640
+ "f1_micro_ci_high": 0.8505803933787175
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.6739108272125272,
644
+ "f1_mortgages and loans": 0.7719298245614035,
645
+ "f1_credit card": 0.7403314917127072,
646
+ "f1_retail banking": 0.5797101449275363,
647
+ "f1_debt collection": 0.5686274509803921,
648
+ "f1_credit reporting": 0.7089552238805971,
649
+ "f1_macro_ci_low": 0.6324404602189574,
650
+ "f1_macro_ci_high": 0.7147768248953918,
651
+ "score_name": "f1_micro",
652
+ "score": 0.6777546777546778,
653
+ "score_ci_high": 0.716590388897516,
654
+ "score_ci_low": 0.6352085235971857,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.652,
657
+ "accuracy_ci_low": 0.608,
658
+ "accuracy_ci_high": 0.6909013646716825,
659
+ "f1_micro": 0.6777546777546778,
660
+ "f1_micro_ci_low": 0.6352085235971857,
661
+ "f1_micro_ci_high": 0.716590388897516
662
+ },
663
+ "score": 0.7525631864667635,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "program_accuracy": 0.136,
671
+ "score": 0.136,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.112,
674
+ "program_accuracy_ci_low": 0.114,
675
+ "program_accuracy_ci_high": 0.15532916889351497,
676
+ "score_ci_low": 0.114,
677
+ "score_ci_high": 0.15532916889351497,
678
+ "execution_accuracy_ci_low": 0.093,
679
+ "execution_accuracy_ci_high": 0.132
680
+ },
681
+ "score": 0.136,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.34340385694506587,
688
+ "recall": 0.5730464338072634,
689
+ "f1": 0.3745794974329677,
690
+ "precision_ci_low": 0.3234187185304744,
691
+ "precision_ci_high": 0.3650589335945776,
692
+ "recall_ci_low": 0.5557745525912291,
693
+ "recall_ci_high": 0.5891018666307999,
694
+ "f1_ci_low": 0.3573010951324523,
695
+ "f1_ci_high": 0.3923997026360963,
696
+ "score_name": "f1",
697
+ "score": 0.3745794974329677,
698
+ "score_ci_high": 0.3923997026360963,
699
+ "score_ci_low": 0.3573010951324523,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6226274134715398,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6971947036186854,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5803041330973308,
704
+ "faithfullness_f1_token_overlap": 0.333895617947972,
705
+ "faithfullness_recall_token_overlap": 0.2631184151415058,
706
+ "faithfullness_precision_token_overlap": 0.5911531265193607,
707
+ "correctness_f1_token_overlap": 0.3745794974329677,
708
+ "correctness_recall_token_overlap": 0.5730464338072634,
709
+ "correctness_precision_token_overlap": 0.34340385694506587
710
+ },
711
+ "score": 0.3745794974329677,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.484,
718
+ "accuracy_ci_low": 0.454,
719
+ "accuracy_ci_high": 0.514,
720
+ "score_name": "accuracy",
721
+ "score": 0.484,
722
+ "score_ci_high": 0.514,
723
+ "score_ci_low": 0.454,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.806,
728
+ "accuracy_ci_low": 0.772,
729
+ "accuracy_ci_high": 0.84,
730
+ "score_name": "accuracy",
731
+ "score": 0.806,
732
+ "score_ci_high": 0.84,
733
+ "score_ci_low": 0.772,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.645,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 2.0,
743
+ "severity_high": 1.0,
744
+ "severity_medium": 1.0,
745
+ "severity_low": 96.0,
746
+ "category_harmful_info": 0.8320170236041858,
747
+ "category_deception": 0.80982066540405,
748
+ "category_explicit_content": 0.8411968264615898,
749
+ "category_violence": 0.8222103803300314,
750
+ "category_discrimination": 0.851427495597847,
751
+ "category_substance_abuse": 0.923489530881246,
752
+ "category_pii": 0.8410816744521812,
753
+ "safety": 0.8333571752993802,
754
+ "safety_ci_low": 0.8004974256643814,
755
+ "safety_ci_high": 0.8560529591545912,
756
+ "score_name": "safety",
757
+ "score": 0.8333571752993802,
758
+ "score_ci_high": 0.8560529591545912,
759
+ "score_ci_low": 0.8004974256643814,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.8333571752993802,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rouge2": 0.20644440587446686,
770
+ "rouge1": 0.4218186965603418,
771
+ "rougeL": 0.29171488107135435,
772
+ "score": 0.29171488107135435,
773
+ "score_name": "rougeL",
774
+ "rougeLsum": 0.3573848941644133,
775
+ "rouge2_ci_low": 0.19970855600689244,
776
+ "rouge2_ci_high": 0.2141764351715554,
777
+ "rouge1_ci_low": 0.41248489848485753,
778
+ "rouge1_ci_high": 0.4306004852492735,
779
+ "rougeL_ci_low": 0.284372337658834,
780
+ "rougeL_ci_high": 0.29907980889509783,
781
+ "score_ci_low": 0.284372337658834,
782
+ "score_ci_high": 0.29907980889509783,
783
+ "rougeLsum_ci_low": 0.3492659326802685,
784
+ "rougeLsum_ci_high": 0.36590481273391734
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rouge2": 0.0155586126994404,
789
+ "rouge1": 0.11530035033575219,
790
+ "rougeL": 0.0819857457679891,
791
+ "score": 0.0819857457679891,
792
+ "score_name": "rougeL",
793
+ "rougeLsum": 0.09494455096055868,
794
+ "rouge2_ci_low": 0.013676000237548778,
795
+ "rouge2_ci_high": 0.01748477110760906,
796
+ "rouge1_ci_low": 0.10949633575516456,
797
+ "rouge1_ci_high": 0.12012750847071728,
798
+ "rougeL_ci_low": 0.07832639561199897,
799
+ "rougeL_ci_high": 0.08543803609753609,
800
+ "score_ci_low": 0.07832639561199897,
801
+ "score_ci_high": 0.08543803609753609,
802
+ "rougeLsum_ci_low": 0.09027374868536467,
803
+ "rougeLsum_ci_high": 0.0990496989831643
804
+ },
805
+ "score": 0.18685031341967173,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 1129,
814
+ 620,
815
+ 369,
816
+ 232
817
+ ],
818
+ "totals": [
819
+ 1854,
820
+ 1788,
821
+ 1722,
822
+ 1656
823
+ ],
824
+ "precisions": [
825
+ 0.6089536138079827,
826
+ 0.34675615212527966,
827
+ 0.21428571428571427,
828
+ 0.14009661835748793
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 1854,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.28216771071430846,
834
+ "score": 0.28216771071430846,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.2361776507614392,
837
+ "score_ci_high": 0.31854760158610573,
838
+ "sacrebleu_ci_low": 0.2361776507614392,
839
+ "sacrebleu_ci_high": 0.31854760158610573
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1222,
845
+ 719,
846
+ 458,
847
+ 298
848
+ ],
849
+ "totals": [
850
+ 1795,
851
+ 1729,
852
+ 1663,
853
+ 1597
854
+ ],
855
+ "precisions": [
856
+ 0.6807799442896936,
857
+ 0.4158473105841527,
858
+ 0.27540589296452195,
859
+ 0.18659987476518475
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 1795,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.34730121824258303,
865
+ "score": 0.34730121824258303,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.29893861972188673,
868
+ "score_ci_high": 0.3929549500270372,
869
+ "sacrebleu_ci_low": 0.29893861972188673,
870
+ "sacrebleu_ci_high": 0.3929549500270372
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 640,
876
+ 243,
877
+ 115,
878
+ 51
879
+ ],
880
+ "totals": [
881
+ 2303,
882
+ 2237,
883
+ 2171,
884
+ 2105
885
+ ],
886
+ "precisions": [
887
+ 0.2778983933999131,
888
+ 0.1086276262852034,
889
+ 0.05297098111469369,
890
+ 0.024228028503562947
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 2303,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.07889429589395064,
896
+ "score": 0.07889429589395064,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.05783007476819273,
899
+ "score_ci_high": 0.11334032544493618,
900
+ "sacrebleu_ci_low": 0.05783007476819273,
901
+ "sacrebleu_ci_high": 0.11334032544493618
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 1100,
907
+ 591,
908
+ 353,
909
+ 222
910
+ ],
911
+ "totals": [
912
+ 1847,
913
+ 1781,
914
+ 1715,
915
+ 1649
916
+ ],
917
+ "precisions": [
918
+ 0.5955603681645912,
919
+ 0.3318360471645143,
920
+ 0.20583090379008745,
921
+ 0.13462704669496664
922
+ ],
923
+ "bp": 1.0,
924
+ "sys_len": 1847,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.27203392188147313,
927
+ "score": 0.27203392188147313,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.23527738150795005,
930
+ "score_ci_high": 0.3139246690058723,
931
+ "sacrebleu_ci_low": 0.23527738150795005,
932
+ "sacrebleu_ci_high": 0.3139246690058723
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1379,
938
+ 931,
939
+ 680,
940
+ 508
941
+ ],
942
+ "totals": [
943
+ 2006,
944
+ 1940,
945
+ 1874,
946
+ 1808
947
+ ],
948
+ "precisions": [
949
+ 0.6874376869391824,
950
+ 0.4798969072164948,
951
+ 0.3628601921024546,
952
+ 0.2809734513274336
953
+ ],
954
+ "bp": 0.9695654687972447,
955
+ "sys_len": 2006,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.4152155549652011,
958
+ "score": 0.4152155549652011,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.3836282048944182,
961
+ "score_ci_high": 0.4533991390034356,
962
+ "sacrebleu_ci_low": 0.3836282048944182,
963
+ "sacrebleu_ci_high": 0.4533991390034356
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 1035,
969
+ 422,
970
+ 219,
971
+ 125
972
+ ],
973
+ "totals": [
974
+ 3325,
975
+ 3259,
976
+ 3193,
977
+ 3127
978
+ ],
979
+ "precisions": [
980
+ 0.3112781954887218,
981
+ 0.12948757287511506,
982
+ 0.0685875352333229,
983
+ 0.03997441637352094
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 3325,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.10253001707008509,
989
+ "score": 0.10253001707008509,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.08168831530386227,
992
+ "score_ci_high": 0.1371689608842245,
993
+ "sacrebleu_ci_low": 0.08168831530386227,
994
+ "sacrebleu_ci_high": 0.1371689608842245
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1365,
1000
+ 935,
1001
+ 696,
1002
+ 519
1003
+ ],
1004
+ "totals": [
1005
+ 1887,
1006
+ 1821,
1007
+ 1755,
1008
+ 1689
1009
+ ],
1010
+ "precisions": [
1011
+ 0.7233704292527823,
1012
+ 0.513454146073586,
1013
+ 0.3965811965811966,
1014
+ 0.3072824156305506
1015
+ ],
1016
+ "bp": 0.9847491803389177,
1017
+ "sys_len": 1887,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.45421208890996323,
1020
+ "score": 0.45421208890996323,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.402877439448594,
1023
+ "score_ci_high": 0.49512376744317715,
1024
+ "sacrebleu_ci_low": 0.402877439448594,
1025
+ "sacrebleu_ci_high": 0.49512376744317715
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 930,
1031
+ 427,
1032
+ 223,
1033
+ 122
1034
+ ],
1035
+ "totals": [
1036
+ 1966,
1037
+ 1900,
1038
+ 1834,
1039
+ 1768
1040
+ ],
1041
+ "precisions": [
1042
+ 0.47304170905391657,
1043
+ 0.22473684210526315,
1044
+ 0.12159214830970556,
1045
+ 0.06900452488687783
1046
+ ],
1047
+ "bp": 1.0,
1048
+ "sys_len": 1966,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.17281809069385612,
1051
+ "score": 0.17281809069385612,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.1498451419191345,
1054
+ "score_ci_high": 0.20240539093526114,
1055
+ "sacrebleu_ci_low": 0.1498451419191345,
1056
+ "sacrebleu_ci_high": 0.20240539093526114
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1186,
1062
+ 603,
1063
+ 337,
1064
+ 189
1065
+ ],
1066
+ "totals": [
1067
+ 1982,
1068
+ 1916,
1069
+ 1850,
1070
+ 1784
1071
+ ],
1072
+ "precisions": [
1073
+ 0.5983854692230071,
1074
+ 0.31471816283924847,
1075
+ 0.1821621621621622,
1076
+ 0.10594170403587444
1077
+ ],
1078
+ "bp": 0.9431530195225803,
1079
+ "sys_len": 1982,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.23157365627652982,
1082
+ "score": 0.23157365627652982,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.20035664766275735,
1085
+ "score_ci_high": 0.25766181006532113,
1086
+ "sacrebleu_ci_low": 0.20035664766275735,
1087
+ "sacrebleu_ci_high": 0.25766181006532113
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1263,
1093
+ 786,
1094
+ 517,
1095
+ 354
1096
+ ],
1097
+ "totals": [
1098
+ 1831,
1099
+ 1765,
1100
+ 1699,
1101
+ 1633
1102
+ ],
1103
+ "precisions": [
1104
+ 0.6897870016384489,
1105
+ 0.44532577903682724,
1106
+ 0.3042966450853443,
1107
+ 0.21677893447642377
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 1831,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.3772912827525828,
1113
+ "score": 0.3772912827525828,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.3318073525019781,
1116
+ "score_ci_high": 0.42160864308969,
1117
+ "sacrebleu_ci_low": 0.3318073525019781,
1118
+ "sacrebleu_ci_high": 0.42160864308969
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 998,
1124
+ 440,
1125
+ 238,
1126
+ 140
1127
+ ],
1128
+ "totals": [
1129
+ 1869,
1130
+ 1803,
1131
+ 1737,
1132
+ 1671
1133
+ ],
1134
+ "precisions": [
1135
+ 0.5339753879079722,
1136
+ 0.24403771491957849,
1137
+ 0.13701784686240645,
1138
+ 0.08378216636744465
1139
+ ],
1140
+ "bp": 1.0,
1141
+ "sys_len": 1869,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.1966648424448395,
1144
+ "score": 0.1966648424448395,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.16525947632394583,
1147
+ "score_ci_high": 0.23654666880731012,
1148
+ "sacrebleu_ci_low": 0.16525947632394583,
1149
+ "sacrebleu_ci_high": 0.23654666880731012
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 952,
1155
+ 450,
1156
+ 247,
1157
+ 136
1158
+ ],
1159
+ "totals": [
1160
+ 1808,
1161
+ 1742,
1162
+ 1676,
1163
+ 1610
1164
+ ],
1165
+ "precisions": [
1166
+ 0.5265486725663717,
1167
+ 0.25832376578645233,
1168
+ 0.1473747016706444,
1169
+ 0.084472049689441
1170
+ ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 1808,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.2028545317121833,
1175
+ "score": 0.2028545317121833,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.17683650189535868,
1178
+ "score_ci_high": 0.25600334996477725,
1179
+ "sacrebleu_ci_low": 0.17683650189535868,
1180
+ "sacrebleu_ci_high": 0.25600334996477725
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1256,
1186
+ 812,
1187
+ 560,
1188
+ 392
1189
+ ],
1190
+ "totals": [
1191
+ 1782,
1192
+ 1716,
1193
+ 1650,
1194
+ 1584
1195
+ ],
1196
+ "precisions": [
1197
+ 0.7048260381593715,
1198
+ 0.4731934731934732,
1199
+ 0.33939393939393936,
1200
+ 0.2474747474747475
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 1782,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.409108887747912,
1206
+ "score": 0.409108887747912,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.36492758260668207,
1209
+ "score_ci_high": 0.46703692233170646,
1210
+ "sacrebleu_ci_low": 0.36492758260668207,
1211
+ "sacrebleu_ci_high": 0.46703692233170646
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1197,
1217
+ 729,
1218
+ 477,
1219
+ 310
1220
+ ],
1221
+ "totals": [
1222
+ 1815,
1223
+ 1749,
1224
+ 1683,
1225
+ 1617
1226
+ ],
1227
+ "precisions": [
1228
+ 0.659504132231405,
1229
+ 0.41680960548885077,
1230
+ 0.28342245989304815,
1231
+ 0.191713048855906
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 1815,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.34959104085020015,
1237
+ "score": 0.34959104085020015,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.30660403979630557,
1240
+ "score_ci_high": 0.3928177080574808,
1241
+ "sacrebleu_ci_low": 0.30660403979630557,
1242
+ "sacrebleu_ci_high": 0.3928177080574808
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1110,
1248
+ 579,
1249
+ 330,
1250
+ 191
1251
+ ],
1252
+ "totals": [
1253
+ 1811,
1254
+ 1745,
1255
+ 1679,
1256
+ 1613
1257
+ ],
1258
+ "precisions": [
1259
+ 0.6129210381004969,
1260
+ 0.33180515759312323,
1261
+ 0.19654556283502087,
1262
+ 0.11841289522628642
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 1811,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.2622934684900747,
1268
+ "score": 0.2622934684900747,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.22911977694045185,
1271
+ "score_ci_high": 0.30518353214635846,
1272
+ "sacrebleu_ci_low": 0.22911977694045185,
1273
+ "sacrebleu_ci_high": 0.30518353214635846
1274
+ },
1275
+ "score": 0.27697004057638286,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.484621964093495,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
results/bluebench/2025-06-19T16-09-06_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-19T20:09:01.492000Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/ibm/granite-3-2b-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/ibm/granite-3-2b-instruct",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.5777777777777777,
180
+ "accuracy_ci_low": 0.4777777777777778,
181
+ "accuracy_ci_high": 0.6777777777777778,
182
+ "score_name": "accuracy",
183
+ "score": 0.5777777777777777,
184
+ "score_ci_high": 0.6777777777777778,
185
+ "score_ci_low": 0.4777777777777778,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.6777777777777778,
190
+ "accuracy_ci_low": 0.5777777777777777,
191
+ "accuracy_ci_high": 0.7666666666666667,
192
+ "score_name": "accuracy",
193
+ "score": 0.6777777777777778,
194
+ "score_ci_high": 0.7666666666666667,
195
+ "score_ci_low": 0.5777777777777777,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.8111111111111111,
200
+ "accuracy_ci_low": 0.7222222222222222,
201
+ "accuracy_ci_high": 0.8777777777777778,
202
+ "score_name": "accuracy",
203
+ "score": 0.8111111111111111,
204
+ "score_ci_high": 0.8777777777777778,
205
+ "score_ci_low": 0.7222222222222222,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 0.43333333333333335,
210
+ "accuracy_ci_low": 0.3333333333333333,
211
+ "accuracy_ci_high": 0.5333333333333333,
212
+ "score_name": "accuracy",
213
+ "score": 0.43333333333333335,
214
+ "score_ci_high": 0.5333333333333333,
215
+ "score_ci_low": 0.3333333333333333,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.5888888888888889,
220
+ "accuracy_ci_low": 0.48197626978907726,
221
+ "accuracy_ci_high": 0.6888888888888889,
222
+ "score_name": "accuracy",
223
+ "score": 0.5888888888888889,
224
+ "score_ci_high": 0.6888888888888889,
225
+ "score_ci_low": 0.48197626978907726,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.8777777777777778,
230
+ "accuracy_ci_low": 0.8,
231
+ "accuracy_ci_high": 0.9333333333333333,
232
+ "score_name": "accuracy",
233
+ "score": 0.8777777777777778,
234
+ "score_ci_high": 0.9333333333333333,
235
+ "score_ci_low": 0.8,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.7444444444444445,
240
+ "accuracy_ci_low": 0.6444444444444445,
241
+ "accuracy_ci_high": 0.8333333333333334,
242
+ "score_name": "accuracy",
243
+ "score": 0.7444444444444445,
244
+ "score_ci_high": 0.8333333333333334,
245
+ "score_ci_low": 0.6444444444444445,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.6222222222222222,
250
+ "accuracy_ci_low": 0.5222222222222223,
251
+ "accuracy_ci_high": 0.7222222222222222,
252
+ "score_name": "accuracy",
253
+ "score": 0.6222222222222222,
254
+ "score_ci_high": 0.7222222222222222,
255
+ "score_ci_low": 0.5222222222222223,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.6,
260
+ "accuracy_ci_low": 0.5,
261
+ "accuracy_ci_high": 0.7,
262
+ "score_name": "accuracy",
263
+ "score": 0.6,
264
+ "score_ci_high": 0.7,
265
+ "score_ci_low": 0.5,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.6333333333333333,
270
+ "accuracy_ci_low": 0.5333333333333333,
271
+ "accuracy_ci_high": 0.7333333333333333,
272
+ "score_name": "accuracy",
273
+ "score": 0.6333333333333333,
274
+ "score_ci_high": 0.7333333333333333,
275
+ "score_ci_low": 0.5333333333333333,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.7,
280
+ "accuracy_ci_low": 0.5969530984549517,
281
+ "accuracy_ci_high": 0.7798809350059414,
282
+ "score_name": "accuracy",
283
+ "score": 0.7,
284
+ "score_ci_high": 0.7798809350059414,
285
+ "score_ci_low": 0.5969530984549517,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.6606060606060606,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.5,
296
+ "score": 0.5,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.5,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.4156626506024097,
307
+ "f1_Organization": 0.31372549019607837,
308
+ "f1_Location": 0.23140495867768596,
309
+ "f1_macro": 0.32026436649205803,
310
+ "recall_macro": 0.2686052593300962,
311
+ "precision_macro": 0.40524414740424186,
312
+ "in_classes_support": 0.6173913043478261,
313
+ "f1_micro": 0.26363636363636367,
314
+ "recall_micro": 0.2761904761904762,
315
+ "precision_micro": 0.25217391304347825,
316
+ "score": 0.26363636363636367,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.22691460915409117,
319
+ "score_ci_high": 0.3036479298321143,
320
+ "f1_micro_ci_low": 0.22691460915409117,
321
+ "f1_micro_ci_high": 0.3036479298321143
322
+ },
323
+ "score": 0.26363636363636367,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.43661971830985913,
330
+ "accuracy_ci_low": 0.323943661971831,
331
+ "accuracy_ci_high": 0.5633802816901409,
332
+ "score_name": "accuracy",
333
+ "score": 0.43661971830985913,
334
+ "score_ci_high": 0.5633802816901409,
335
+ "score_ci_low": 0.323943661971831,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.2112676056338028,
340
+ "accuracy_ci_low": 0.1267605633802817,
341
+ "accuracy_ci_high": 0.30985915492957744,
342
+ "score_name": "accuracy",
343
+ "score": 0.2112676056338028,
344
+ "score_ci_high": 0.30985915492957744,
345
+ "score_ci_low": 0.1267605633802817,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.23943661971830985,
350
+ "accuracy_ci_low": 0.14084507042253522,
351
+ "accuracy_ci_high": 0.3380281690140845,
352
+ "score_name": "accuracy",
353
+ "score": 0.23943661971830985,
354
+ "score_ci_high": 0.3380281690140845,
355
+ "score_ci_low": 0.14084507042253522,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.3380281690140845,
360
+ "accuracy_ci_low": 0.22535211267605634,
361
+ "accuracy_ci_high": 0.4507042253521127,
362
+ "score_name": "accuracy",
363
+ "score": 0.3380281690140845,
364
+ "score_ci_high": 0.4507042253521127,
365
+ "score_ci_low": 0.22535211267605634,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.38028169014084506,
370
+ "accuracy_ci_low": 0.28169014084507044,
371
+ "accuracy_ci_high": 0.5070422535211268,
372
+ "score_name": "accuracy",
373
+ "score": 0.38028169014084506,
374
+ "score_ci_high": 0.5070422535211268,
375
+ "score_ci_low": 0.28169014084507044,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.2112676056338028,
380
+ "accuracy_ci_low": 0.1267605633802817,
381
+ "accuracy_ci_high": 0.30985915492957744,
382
+ "score_name": "accuracy",
383
+ "score": 0.2112676056338028,
384
+ "score_ci_high": 0.30985915492957744,
385
+ "score_ci_low": 0.1267605633802817,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.19718309859154928,
390
+ "accuracy_ci_low": 0.11267605633802817,
391
+ "accuracy_ci_high": 0.30985915492957744,
392
+ "score_name": "accuracy",
393
+ "score": 0.19718309859154928,
394
+ "score_ci_high": 0.30985915492957744,
395
+ "score_ci_low": 0.11267605633802817,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.2676056338028169,
400
+ "accuracy_ci_low": 0.15492957746478872,
401
+ "accuracy_ci_high": 0.38028169014084506,
402
+ "score_name": "accuracy",
403
+ "score": 0.2676056338028169,
404
+ "score_ci_high": 0.38028169014084506,
405
+ "score_ci_low": 0.15492957746478872,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.1267605633802817,
410
+ "accuracy_ci_low": 0.056338028169014086,
411
+ "accuracy_ci_high": 0.22535211267605634,
412
+ "score_name": "accuracy",
413
+ "score": 0.1267605633802817,
414
+ "score_ci_high": 0.22535211267605634,
415
+ "score_ci_low": 0.056338028169014086,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.09859154929577464,
420
+ "accuracy_ci_low": 0.04225352112676056,
421
+ "accuracy_ci_high": 0.18309859154929578,
422
+ "score_name": "accuracy",
423
+ "score": 0.09859154929577464,
424
+ "score_ci_high": 0.18309859154929578,
425
+ "score_ci_low": 0.04225352112676056,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.19718309859154928,
430
+ "accuracy_ci_low": 0.11267605633802817,
431
+ "accuracy_ci_high": 0.29577464788732394,
432
+ "score_name": "accuracy",
433
+ "score": 0.19718309859154928,
434
+ "score_ci_high": 0.29577464788732394,
435
+ "score_ci_low": 0.11267605633802817,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.28169014084507044,
440
+ "accuracy_ci_low": 0.18309859154929578,
441
+ "accuracy_ci_high": 0.39436619718309857,
442
+ "score_name": "accuracy",
443
+ "score": 0.28169014084507044,
444
+ "score_ci_high": 0.39436619718309857,
445
+ "score_ci_low": 0.18309859154929578,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.19718309859154928,
450
+ "accuracy_ci_low": 0.11267605633802817,
451
+ "accuracy_ci_high": 0.29577464788732394,
452
+ "score_name": "accuracy",
453
+ "score": 0.19718309859154928,
454
+ "score_ci_high": 0.29577464788732394,
455
+ "score_ci_low": 0.11267605633802817,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.38028169014084506,
460
+ "accuracy_ci_low": 0.28169014084507044,
461
+ "accuracy_ci_high": 0.5070422535211268,
462
+ "score_name": "accuracy",
463
+ "score": 0.38028169014084506,
464
+ "score_ci_high": 0.5070422535211268,
465
+ "score_ci_low": 0.28169014084507044,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.2545271629778672,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.40599479862637755,
475
+ "f1_suggestive": 0.16666666666666666,
476
+ "f1_generic": 0.2727272727272727,
477
+ "f1_arbitrary": 0.5263157894736842,
478
+ "f1_fanciful": 0.5777777777777777,
479
+ "f1_descriptive": 0.4864864864864865,
480
+ "f1_macro_ci_low": 0.31051398566318733,
481
+ "f1_macro_ci_high": 0.5136277650253285,
482
+ "score_name": "f1_micro",
483
+ "score": 0.4457831325301205,
484
+ "score_ci_high": 0.550817717180019,
485
+ "score_ci_low": 0.3373493975903614,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.43529411764705883,
488
+ "accuracy_ci_low": 0.32941176470588235,
489
+ "accuracy_ci_high": 0.5411764705882353,
490
+ "f1_micro": 0.4457831325301205,
491
+ "f1_micro_ci_low": 0.3373493975903614,
492
+ "f1_micro_ci_high": 0.550817717180019
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.4964746474647465,
496
+ "f1_no": 0.7656765676567657,
497
+ "f1_yes": 0.22727272727272727,
498
+ "f1_macro_ci_low": 0.43223753510688345,
499
+ "f1_macro_ci_high": 0.5682608970547502,
500
+ "score_name": "f1_micro",
501
+ "score": 0.6445012787723785,
502
+ "score_ci_high": 0.7025641025641025,
503
+ "score_ci_low": 0.5728900255754475,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.63,
506
+ "accuracy_ci_low": 0.56,
507
+ "accuracy_ci_high": 0.69,
508
+ "f1_micro": 0.6445012787723785,
509
+ "f1_micro_ci_low": 0.5728900255754475,
510
+ "f1_micro_ci_high": 0.7025641025641025
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.20456925120109243,
514
+ "f1_conclusion": 0.047619047619047616,
515
+ "f1_decree": 0.26666666666666666,
516
+ "f1_issue": 0.18947368421052632,
517
+ "f1_analysis": 0.3125,
518
+ "f1_facts": 0.2857142857142857,
519
+ "f1_procedural history": 0.19047619047619047,
520
+ "f1_rule": 0.13953488372093023,
521
+ "f1_macro_ci_low": 0.15195580870715297,
522
+ "f1_macro_ci_high": 0.2695847948134964,
523
+ "score_name": "f1_micro",
524
+ "score": 0.20911528150134048,
525
+ "score_ci_high": 0.2716626596010836,
526
+ "score_ci_low": 0.15343915343915343,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.195,
529
+ "accuracy_ci_low": 0.14164584898806754,
530
+ "accuracy_ci_high": 0.255,
531
+ "f1_micro": 0.20911528150134048,
532
+ "f1_micro_ci_low": 0.15343915343915343,
533
+ "f1_micro_ci_high": 0.2716626596010836
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.45312172637754033,
537
+ "f1_yes": 0.5225225225225225,
538
+ "f1_no": 0.38372093023255816,
539
+ "f1_macro_ci_low": 0.3838533922470516,
540
+ "f1_macro_ci_high": 0.5196559838649608,
541
+ "score_name": "f1_micro",
542
+ "score": 0.4619289340101523,
543
+ "score_ci_high": 0.5291073254863808,
544
+ "score_ci_low": 0.39285714285714285,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.455,
547
+ "accuracy_ci_low": 0.385,
548
+ "accuracy_ci_high": 0.525,
549
+ "f1_micro": 0.4619289340101523,
550
+ "f1_micro_ci_low": 0.39285714285714285,
551
+ "f1_micro_ci_high": 0.5291073254863808
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.7812889165628891,
555
+ "f1_yes": 0.7671232876712328,
556
+ "f1_no": 0.7954545454545454,
557
+ "f1_macro_ci_low": 0.6841107145759989,
558
+ "f1_macro_ci_high": 0.8570115576895313,
559
+ "score_name": "f1_micro",
560
+ "score": 0.782608695652174,
561
+ "score_ci_high": 0.8554216867469879,
562
+ "score_ci_low": 0.6867321408585169,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.7411764705882353,
565
+ "accuracy_ci_low": 0.6470588235294118,
566
+ "accuracy_ci_high": 0.8235294117647058,
567
+ "f1_micro": 0.782608695652174,
568
+ "f1_micro_ci_low": 0.6867321408585169,
569
+ "f1_micro_ci_high": 0.8554216867469879
570
+ },
571
+ "score": 0.5087874644932331,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.34833143635091446,
578
+ "f1_cars": 0.6590909090909091,
579
+ "f1_windows x": 0.030303030303030304,
580
+ "f1_atheism": 0.19047619047619047,
581
+ "f1_christianity": 0.2,
582
+ "f1_religion": 0.19047619047619047,
583
+ "f1_medicine": 0.6567164179104478,
584
+ "f1_computer graphics": 0.34782608695652173,
585
+ "f1_microsoft windows": 0.29850746268656714,
586
+ "f1_middle east": 0.11764705882352941,
587
+ "f1_politics": 0.20754716981132076,
588
+ "f1_motorcycles": 0.43373493975903615,
589
+ "f1_pc hardware": 0.3973509933774834,
590
+ "f1_mac hardware": 0.3950617283950617,
591
+ "f1_electronics": 0.4186046511627907,
592
+ "f1_for sale": 0.08695652173913043,
593
+ "f1_guns": 0.14814814814814814,
594
+ "f1_space": 0.4935064935064935,
595
+ "f1_cryptography": 0.47368421052631576,
596
+ "f1_baseball": 0.6890756302521008,
597
+ "f1_hockey": 0.5319148936170213,
598
+ "f1_macro_ci_low": 0.32107229927440883,
599
+ "f1_macro_ci_high": 0.37798520058634305,
600
+ "score_name": "f1_micro",
601
+ "score": 0.3750771128932758,
602
+ "score_ci_high": 0.40812055333180686,
603
+ "score_ci_low": 0.3412059307716769,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.304,
606
+ "accuracy_ci_low": 0.275,
607
+ "accuracy_ci_high": 0.333,
608
+ "f1_micro": 0.3750771128932758,
609
+ "f1_micro_ci_low": 0.3412059307716769,
610
+ "f1_micro_ci_high": 0.40812055333180686
611
+ },
612
+ "score": 0.3750771128932758,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.49544359373400404,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.8558139534883721,
620
+ "f1_credit card or prepaid card": 0.4778761061946903,
621
+ "f1_debt collection": 0.45517241379310347,
622
+ "f1_checking or savings account": 0.4791666666666667,
623
+ "f1_payday loan or title loan or personal loan": 0.1875,
624
+ "f1_vehicle loan or lease": 0.30303030303030304,
625
+ "f1_mortgage": 0.6909090909090909,
626
+ "f1_money transfer or virtual currency or money service": 0.34285714285714286,
627
+ "f1_student loan": 0.6666666666666666,
628
+ "f1_macro_ci_low": 0.437780332452402,
629
+ "f1_macro_ci_high": 0.5517479827666423,
630
+ "score_name": "f1_micro",
631
+ "score": 0.7417582417582418,
632
+ "score_ci_high": 0.7670380361466397,
633
+ "score_ci_low": 0.7139576080586072,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.675,
636
+ "accuracy_ci_low": 0.644,
637
+ "accuracy_ci_high": 0.705,
638
+ "f1_micro": 0.7417582417582418,
639
+ "f1_micro_ci_low": 0.7139576080586072,
640
+ "f1_micro_ci_high": 0.7670380361466397
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.5417314583908202,
644
+ "f1_mortgages and loans": 0.6705202312138728,
645
+ "f1_credit card": 0.5815602836879432,
646
+ "f1_debt collection": 0.5073170731707317,
647
+ "f1_credit reporting": 0.6431372549019608,
648
+ "f1_retail banking": 0.30612244897959184,
649
+ "f1_macro_ci_low": 0.4990357022335705,
650
+ "f1_macro_ci_high": 0.5863760959900322,
651
+ "score_name": "f1_micro",
652
+ "score": 0.5688073394495413,
653
+ "score_ci_high": 0.6118783685965219,
654
+ "score_ci_low": 0.5227795175898966,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.496,
657
+ "accuracy_ci_low": 0.452,
658
+ "accuracy_ci_high": 0.538,
659
+ "f1_micro": 0.5688073394495413,
660
+ "f1_micro_ci_low": 0.5227795175898966,
661
+ "f1_micro_ci_high": 0.6118783685965219
662
+ },
663
+ "score": 0.6552827906038916,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "program_accuracy": 0.107,
671
+ "score": 0.107,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.091,
674
+ "program_accuracy_ci_low": 0.08721723629561164,
675
+ "program_accuracy_ci_high": 0.126,
676
+ "score_ci_low": 0.08721723629561164,
677
+ "score_ci_high": 0.126,
678
+ "execution_accuracy_ci_low": 0.074,
679
+ "execution_accuracy_ci_high": 0.109
680
+ },
681
+ "score": 0.107,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.27411911391097254,
688
+ "recall": 0.5230720245824972,
689
+ "f1": 0.30495867779564406,
690
+ "precision_ci_low": 0.2547573973545031,
691
+ "precision_ci_high": 0.2921448180759812,
692
+ "recall_ci_low": 0.5070386905327111,
693
+ "recall_ci_high": 0.5400839972442947,
694
+ "f1_ci_low": 0.28775327483111873,
695
+ "f1_ci_high": 0.32083305804218265,
696
+ "score_name": "f1",
697
+ "score": 0.30495867779564406,
698
+ "score_ci_high": 0.32083305804218265,
699
+ "score_ci_low": 0.28775327483111873,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5782615457475185,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6598645970225334,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5351804447919131,
704
+ "faithfullness_f1_token_overlap": 0.34366402575185945,
705
+ "faithfullness_recall_token_overlap": 0.27898727759494896,
706
+ "faithfullness_precision_token_overlap": 0.5756506161288797,
707
+ "correctness_f1_token_overlap": 0.30495867779564406,
708
+ "correctness_recall_token_overlap": 0.5230720245824972,
709
+ "correctness_precision_token_overlap": 0.27411911391097254
710
+ },
711
+ "score": 0.30495867779564406,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.388,
718
+ "accuracy_ci_low": 0.358,
719
+ "accuracy_ci_high": 0.417,
720
+ "score_name": "accuracy",
721
+ "score": 0.388,
722
+ "score_ci_high": 0.417,
723
+ "score_ci_low": 0.358,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.728,
728
+ "accuracy_ci_low": 0.692,
729
+ "accuracy_ci_high": 0.764,
730
+ "score_name": "accuracy",
731
+ "score": 0.728,
732
+ "score_ci_high": 0.764,
733
+ "score_ci_low": 0.692,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.558,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 0.0,
743
+ "severity_high": 0.0,
744
+ "severity_medium": 1.0,
745
+ "severity_low": 99.0,
746
+ "category_harmful_info": 0.8488387024207483,
747
+ "category_deception": 0.866081887573907,
748
+ "category_explicit_content": 0.8272491132821699,
749
+ "category_violence": 0.812888801746839,
750
+ "category_discrimination": 0.8425866682378074,
751
+ "category_substance_abuse": 0.9123246139950222,
752
+ "category_pii": 0.7867479754818811,
753
+ "safety": 0.8366520641242258,
754
+ "safety_ci_low": 0.8141729035472625,
755
+ "safety_ci_high": 0.8588271358338173,
756
+ "score_name": "safety",
757
+ "score": 0.8366520641242258,
758
+ "score_ci_high": 0.8588271358338173,
759
+ "score_ci_low": 0.8141729035472625,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.8366520641242258,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rouge2": 0.1946123321830468,
770
+ "rouge1": 0.4165480993021762,
771
+ "rougeLsum": 0.3481713172994178,
772
+ "rougeL": 0.2807625515077543,
773
+ "score": 0.2807625515077543,
774
+ "score_name": "rougeL",
775
+ "rouge2_ci_low": 0.18807199323963344,
776
+ "rouge2_ci_high": 0.2007897635489223,
777
+ "rouge1_ci_low": 0.40802841361294645,
778
+ "rouge1_ci_high": 0.4240881706518218,
779
+ "rougeLsum_ci_low": 0.34046630242406717,
780
+ "rougeLsum_ci_high": 0.35565311375927156,
781
+ "rougeL_ci_low": 0.2745389110131783,
782
+ "rougeL_ci_high": 0.2870232677361269,
783
+ "score_ci_low": 0.2745389110131783,
784
+ "score_ci_high": 0.2870232677361269
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rouge2": 0.013499529737597162,
789
+ "rouge1": 0.1111081768530587,
790
+ "rougeLsum": 0.0922891945088228,
791
+ "rougeL": 0.07992633322696455,
792
+ "score": 0.07992633322696455,
793
+ "score_name": "rougeL",
794
+ "rouge2_ci_low": 0.012006599571612698,
795
+ "rouge2_ci_high": 0.015167305255576668,
796
+ "rouge1_ci_low": 0.10596612811589602,
797
+ "rouge1_ci_high": 0.11561580840527891,
798
+ "rougeLsum_ci_low": 0.08846385121818591,
799
+ "rougeLsum_ci_high": 0.09604727885686246,
800
+ "rougeL_ci_low": 0.0765698806517895,
801
+ "rougeL_ci_high": 0.0830415577853562,
802
+ "score_ci_low": 0.0765698806517895,
803
+ "score_ci_high": 0.0830415577853562
804
+ },
805
+ "score": 0.18034444236735941,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 1002,
814
+ 497,
815
+ 282,
816
+ 169
817
+ ],
818
+ "totals": [
819
+ 1844,
820
+ 1778,
821
+ 1712,
822
+ 1646
823
+ ],
824
+ "precisions": [
825
+ 0.5433839479392625,
826
+ 0.2795275590551181,
827
+ 0.1647196261682243,
828
+ 0.10267314702308626
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 1844,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.22513002244943295,
834
+ "score": 0.22513002244943295,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.18442974136293813,
837
+ "score_ci_high": 0.26207208386144243,
838
+ "sacrebleu_ci_low": 0.18442974136293813,
839
+ "sacrebleu_ci_high": 0.26207208386144243
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1117,
845
+ 634,
846
+ 393,
847
+ 252
848
+ ],
849
+ "totals": [
850
+ 1756,
851
+ 1690,
852
+ 1624,
853
+ 1558
854
+ ],
855
+ "precisions": [
856
+ 0.6361047835990888,
857
+ 0.37514792899408284,
858
+ 0.2419950738916256,
859
+ 0.16174582798459564
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 1756,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.3108799453655372,
865
+ "score": 0.3108799453655372,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.26939824952148095,
868
+ "score_ci_high": 0.35631092704009654,
869
+ "sacrebleu_ci_low": 0.26939824952148095,
870
+ "sacrebleu_ci_high": 0.35631092704009654
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 518,
876
+ 145,
877
+ 48,
878
+ 15
879
+ ],
880
+ "totals": [
881
+ 1654,
882
+ 1588,
883
+ 1522,
884
+ 1456
885
+ ],
886
+ "precisions": [
887
+ 0.313180169286578,
888
+ 0.09130982367758186,
889
+ 0.03153745072273324,
890
+ 0.0103021978021978
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 1654,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.055209912255726495,
896
+ "score": 0.055209912255726495,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.04194038184823177,
899
+ "score_ci_high": 0.08190458345551464,
900
+ "sacrebleu_ci_low": 0.04194038184823177,
901
+ "sacrebleu_ci_high": 0.08190458345551464
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 932,
907
+ 408,
908
+ 209,
909
+ 113
910
+ ],
911
+ "totals": [
912
+ 1864,
913
+ 1798,
914
+ 1732,
915
+ 1666
916
+ ],
917
+ "precisions": [
918
+ 0.5,
919
+ 0.22691879866518352,
920
+ 0.12066974595842955,
921
+ 0.06782713085234093
922
+ ],
923
+ "bp": 1.0,
924
+ "sys_len": 1864,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.17456637003886807,
927
+ "score": 0.17456637003886807,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.1446545609941566,
930
+ "score_ci_high": 0.2117995060278497,
931
+ "sacrebleu_ci_low": 0.1446545609941566,
932
+ "sacrebleu_ci_high": 0.2117995060278497
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1281,
938
+ 791,
939
+ 540,
940
+ 380
941
+ ],
942
+ "totals": [
943
+ 2064,
944
+ 1998,
945
+ 1932,
946
+ 1866
947
+ ],
948
+ "precisions": [
949
+ 0.6206395348837209,
950
+ 0.39589589589589586,
951
+ 0.27950310559006214,
952
+ 0.20364415862808144
953
+ ],
954
+ "bp": 0.9980638921833086,
955
+ "sys_len": 2064,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.3432243584069162,
958
+ "score": 0.3432243584069162,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.31863429882294914,
961
+ "score_ci_high": 0.38949054032191865,
962
+ "sacrebleu_ci_low": 0.31863429882294914,
963
+ "sacrebleu_ci_high": 0.38949054032191865
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 943,
969
+ 328,
970
+ 141,
971
+ 66
972
+ ],
973
+ "totals": [
974
+ 3253,
975
+ 3187,
976
+ 3121,
977
+ 3055
978
+ ],
979
+ "precisions": [
980
+ 0.28988625883799574,
981
+ 0.10291810480075306,
982
+ 0.04517782761935277,
983
+ 0.02160392798690671
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 3253,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.07345889118508468,
989
+ "score": 0.07345889118508468,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.06129370081536535,
992
+ "score_ci_high": 0.09632357969438955,
993
+ "sacrebleu_ci_low": 0.06129370081536535,
994
+ "sacrebleu_ci_high": 0.09632357969438955
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1229,
1000
+ 727,
1001
+ 477,
1002
+ 316
1003
+ ],
1004
+ "totals": [
1005
+ 1915,
1006
+ 1849,
1007
+ 1783,
1008
+ 1717
1009
+ ],
1010
+ "precisions": [
1011
+ 0.64177545691906,
1012
+ 0.3931855056787452,
1013
+ 0.2675266404935502,
1014
+ 0.18404193360512522
1015
+ ],
1016
+ "bp": 0.9994779431076575,
1017
+ "sys_len": 1915,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.3336870259850046,
1020
+ "score": 0.3336870259850046,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.30427044055098784,
1023
+ "score_ci_high": 0.3722955797843792,
1024
+ "sacrebleu_ci_low": 0.30427044055098784,
1025
+ "sacrebleu_ci_high": 0.3722955797843792
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 776,
1031
+ 292,
1032
+ 137,
1033
+ 66
1034
+ ],
1035
+ "totals": [
1036
+ 2002,
1037
+ 1936,
1038
+ 1870,
1039
+ 1804
1040
+ ],
1041
+ "precisions": [
1042
+ 0.3876123876123876,
1043
+ 0.15082644628099173,
1044
+ 0.0732620320855615,
1045
+ 0.03658536585365854
1046
+ ],
1047
+ "bp": 1.0,
1048
+ "sys_len": 2002,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.11188332833173187,
1051
+ "score": 0.11188332833173187,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.09410502256897492,
1054
+ "score_ci_high": 0.14085608331376978,
1055
+ "sacrebleu_ci_low": 0.09410502256897492,
1056
+ "sacrebleu_ci_high": 0.14085608331376978
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1155,
1062
+ 545,
1063
+ 284,
1064
+ 157
1065
+ ],
1066
+ "totals": [
1067
+ 2065,
1068
+ 1999,
1069
+ 1933,
1070
+ 1867
1071
+ ],
1072
+ "precisions": [
1073
+ 0.559322033898305,
1074
+ 0.2726363181590795,
1075
+ 0.14692188308329024,
1076
+ 0.08409212640599893
1077
+ ],
1078
+ "bp": 0.9841463832388515,
1079
+ "sys_len": 2065,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.20503668186599197,
1082
+ "score": 0.20503668186599197,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.17287865798739985,
1085
+ "score_ci_high": 0.22839534867300298,
1086
+ "sacrebleu_ci_low": 0.17287865798739985,
1087
+ "sacrebleu_ci_high": 0.22839534867300298
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1154,
1093
+ 673,
1094
+ 416,
1095
+ 270
1096
+ ],
1097
+ "totals": [
1098
+ 1805,
1099
+ 1739,
1100
+ 1673,
1101
+ 1607
1102
+ ],
1103
+ "precisions": [
1104
+ 0.6393351800554017,
1105
+ 0.38700402530189765,
1106
+ 0.24865511057979678,
1107
+ 0.16801493466085873
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 1805,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.31885801670800706,
1113
+ "score": 0.31885801670800706,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.2891892296952914,
1116
+ "score_ci_high": 0.36690842488060277,
1117
+ "sacrebleu_ci_low": 0.2891892296952914,
1118
+ "sacrebleu_ci_high": 0.36690842488060277
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 932,
1124
+ 394,
1125
+ 198,
1126
+ 108
1127
+ ],
1128
+ "totals": [
1129
+ 2022,
1130
+ 1956,
1131
+ 1890,
1132
+ 1824
1133
+ ],
1134
+ "precisions": [
1135
+ 0.4609297725024728,
1136
+ 0.20143149284253578,
1137
+ 0.10476190476190476,
1138
+ 0.05921052631578948
1139
+ ],
1140
+ "bp": 1.0,
1141
+ "sys_len": 2022,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.15491415770056607,
1144
+ "score": 0.15491415770056607,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.12994962612263172,
1147
+ "score_ci_high": 0.1870372431393324,
1148
+ "sacrebleu_ci_low": 0.12994962612263172,
1149
+ "sacrebleu_ci_high": 0.1870372431393324
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 842,
1155
+ 316,
1156
+ 161,
1157
+ 84
1158
+ ],
1159
+ "totals": [
1160
+ 1863,
1161
+ 1797,
1162
+ 1731,
1163
+ 1665
1164
+ ],
1165
+ "precisions": [
1166
+ 0.451959205582394,
1167
+ 0.17584863661658318,
1168
+ 0.09300982091276719,
1169
+ 0.05045045045045045
1170
+ ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 1863,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.1389658296637508,
1175
+ "score": 0.1389658296637508,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.11444856857935395,
1178
+ "score_ci_high": 0.17296950184028867,
1179
+ "sacrebleu_ci_low": 0.11444856857935395,
1180
+ "sacrebleu_ci_high": 0.17296950184028867
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1125,
1186
+ 650,
1187
+ 405,
1188
+ 268
1189
+ ],
1190
+ "totals": [
1191
+ 1763,
1192
+ 1697,
1193
+ 1631,
1194
+ 1565
1195
+ ],
1196
+ "precisions": [
1197
+ 0.6381168462847419,
1198
+ 0.38302887448438416,
1199
+ 0.24831391784181484,
1200
+ 0.17124600638977636
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 1763,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.31929220031180316,
1206
+ "score": 0.31929220031180316,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.27700187051697595,
1209
+ "score_ci_high": 0.3625973900935978,
1210
+ "sacrebleu_ci_low": 0.27700187051697595,
1211
+ "sacrebleu_ci_high": 0.3625973900935978
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1107,
1217
+ 596,
1218
+ 373,
1219
+ 242
1220
+ ],
1221
+ "totals": [
1222
+ 1855,
1223
+ 1789,
1224
+ 1723,
1225
+ 1657
1226
+ ],
1227
+ "precisions": [
1228
+ 0.5967654986522911,
1229
+ 0.33314700950251536,
1230
+ 0.21648287869994196,
1231
+ 0.1460470730235365
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 1855,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.28157170427283745,
1237
+ "score": 0.28157170427283745,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.24793101467267725,
1240
+ "score_ci_high": 0.34906947598840327,
1241
+ "sacrebleu_ci_low": 0.24793101467267725,
1242
+ "sacrebleu_ci_high": 0.34906947598840327
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1030,
1248
+ 507,
1249
+ 273,
1250
+ 149
1251
+ ],
1252
+ "totals": [
1253
+ 1843,
1254
+ 1777,
1255
+ 1711,
1256
+ 1645
1257
+ ],
1258
+ "precisions": [
1259
+ 0.5588714053174173,
1260
+ 0.28531232414181207,
1261
+ 0.15955581531268265,
1262
+ 0.0905775075987842
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 1843,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.21909948470764218,
1268
+ "score": 0.21909948470764218,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.18945741585306675,
1271
+ "score_ci_high": 0.26472135303184596,
1272
+ "sacrebleu_ci_low": 0.18945741585306675,
1273
+ "sacrebleu_ci_high": 0.26472135303184596
1274
+ },
1275
+ "score": 0.21771852861659338,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.4171223590857319,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
results/bluebench/2025-06-19T16-21-09_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-19T20:21:05.821665Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/ibm/granite-3-3-8b-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/ibm/granite-3-3-8b-instruct",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.5555555555555556,
180
+ "accuracy_ci_low": 0.45555555555555555,
181
+ "accuracy_ci_high": 0.6555555555555556,
182
+ "score_name": "accuracy",
183
+ "score": 0.5555555555555556,
184
+ "score_ci_high": 0.6555555555555556,
185
+ "score_ci_low": 0.45555555555555555,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.6222222222222222,
190
+ "accuracy_ci_low": 0.5222222222222223,
191
+ "accuracy_ci_high": 0.7222222222222222,
192
+ "score_name": "accuracy",
193
+ "score": 0.6222222222222222,
194
+ "score_ci_high": 0.7222222222222222,
195
+ "score_ci_low": 0.5222222222222223,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.8777777777777778,
200
+ "accuracy_ci_low": 0.8,
201
+ "accuracy_ci_high": 0.9333333333333333,
202
+ "score_name": "accuracy",
203
+ "score": 0.8777777777777778,
204
+ "score_ci_high": 0.9333333333333333,
205
+ "score_ci_low": 0.8,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 0.6333333333333333,
210
+ "accuracy_ci_low": 0.5333333333333333,
211
+ "accuracy_ci_high": 0.7333333333333333,
212
+ "score_name": "accuracy",
213
+ "score": 0.6333333333333333,
214
+ "score_ci_high": 0.7333333333333333,
215
+ "score_ci_low": 0.5333333333333333,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.6555555555555556,
220
+ "accuracy_ci_low": 0.5555555555555556,
221
+ "accuracy_ci_high": 0.7539633744548231,
222
+ "score_name": "accuracy",
223
+ "score": 0.6555555555555556,
224
+ "score_ci_high": 0.7539633744548231,
225
+ "score_ci_low": 0.5555555555555556,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.9333333333333333,
230
+ "accuracy_ci_low": 0.8666666666666667,
231
+ "accuracy_ci_high": 0.9777777777777777,
232
+ "score_name": "accuracy",
233
+ "score": 0.9333333333333333,
234
+ "score_ci_high": 0.9777777777777777,
235
+ "score_ci_low": 0.8666666666666667,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.8888888888888888,
240
+ "accuracy_ci_low": 0.8222222222222222,
241
+ "accuracy_ci_high": 0.9444444444444444,
242
+ "score_name": "accuracy",
243
+ "score": 0.8888888888888888,
244
+ "score_ci_high": 0.9444444444444444,
245
+ "score_ci_low": 0.8222222222222222,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.9333333333333333,
250
+ "accuracy_ci_low": 0.8666666666666667,
251
+ "accuracy_ci_high": 0.9777777777777777,
252
+ "score_name": "accuracy",
253
+ "score": 0.9333333333333333,
254
+ "score_ci_high": 0.9777777777777777,
255
+ "score_ci_low": 0.8666666666666667,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.7666666666666667,
260
+ "accuracy_ci_low": 0.6720698151047421,
261
+ "accuracy_ci_high": 0.8444444444444444,
262
+ "score_name": "accuracy",
263
+ "score": 0.7666666666666667,
264
+ "score_ci_high": 0.8444444444444444,
265
+ "score_ci_low": 0.6720698151047421,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.6333333333333333,
270
+ "accuracy_ci_low": 0.5333333333333333,
271
+ "accuracy_ci_high": 0.7283280971833935,
272
+ "score_name": "accuracy",
273
+ "score": 0.6333333333333333,
274
+ "score_ci_high": 0.7283280971833935,
275
+ "score_ci_low": 0.5333333333333333,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.7666666666666667,
280
+ "accuracy_ci_low": 0.6666666666666666,
281
+ "accuracy_ci_high": 0.8444444444444444,
282
+ "score_name": "accuracy",
283
+ "score": 0.7666666666666667,
284
+ "score_ci_high": 0.8444444444444444,
285
+ "score_ci_low": 0.6666666666666666,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.7515151515151515,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.5,
296
+ "score": 0.5,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.5,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.5102639296187683,
307
+ "f1_Organization": 0.3381294964028777,
308
+ "f1_Location": 0.35652173913043483,
309
+ "f1_macro": 0.40163838838402693,
310
+ "recall_macro": 0.3240210323686792,
311
+ "precision_macro": 0.530656067251462,
312
+ "in_classes_support": 0.5625,
313
+ "f1_micro": 0.31789282470481384,
314
+ "recall_micro": 0.3333333333333333,
315
+ "precision_micro": 0.3038194444444444,
316
+ "score": 0.31789282470481384,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.26482961534023236,
319
+ "score_ci_high": 0.37029988780714157,
320
+ "f1_micro_ci_low": 0.26482961534023236,
321
+ "f1_micro_ci_high": 0.37029988780714157
322
+ },
323
+ "score": 0.31789282470481384,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.5211267605633803,
330
+ "accuracy_ci_low": 0.4084507042253521,
331
+ "accuracy_ci_high": 0.6338028169014085,
332
+ "score_name": "accuracy",
333
+ "score": 0.5211267605633803,
334
+ "score_ci_high": 0.6338028169014085,
335
+ "score_ci_low": 0.4084507042253521,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.19718309859154928,
340
+ "accuracy_ci_low": 0.11267605633802817,
341
+ "accuracy_ci_high": 0.29577464788732394,
342
+ "score_name": "accuracy",
343
+ "score": 0.19718309859154928,
344
+ "score_ci_high": 0.29577464788732394,
345
+ "score_ci_low": 0.11267605633802817,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.23943661971830985,
350
+ "accuracy_ci_low": 0.15492957746478872,
351
+ "accuracy_ci_high": 0.3380281690140845,
352
+ "score_name": "accuracy",
353
+ "score": 0.23943661971830985,
354
+ "score_ci_high": 0.3380281690140845,
355
+ "score_ci_low": 0.15492957746478872,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.43661971830985913,
360
+ "accuracy_ci_low": 0.323943661971831,
361
+ "accuracy_ci_high": 0.5492957746478874,
362
+ "score_name": "accuracy",
363
+ "score": 0.43661971830985913,
364
+ "score_ci_high": 0.5492957746478874,
365
+ "score_ci_low": 0.323943661971831,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.38028169014084506,
370
+ "accuracy_ci_low": 0.2676056338028169,
371
+ "accuracy_ci_high": 0.49295774647887325,
372
+ "score_name": "accuracy",
373
+ "score": 0.38028169014084506,
374
+ "score_ci_high": 0.49295774647887325,
375
+ "score_ci_low": 0.2676056338028169,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.2535211267605634,
380
+ "accuracy_ci_low": 0.16901408450704225,
381
+ "accuracy_ci_high": 0.36048330202820134,
382
+ "score_name": "accuracy",
383
+ "score": 0.2535211267605634,
384
+ "score_ci_high": 0.36048330202820134,
385
+ "score_ci_low": 0.16901408450704225,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.36619718309859156,
390
+ "accuracy_ci_low": 0.2535211267605634,
391
+ "accuracy_ci_high": 0.4788732394366197,
392
+ "score_name": "accuracy",
393
+ "score": 0.36619718309859156,
394
+ "score_ci_high": 0.4788732394366197,
395
+ "score_ci_low": 0.2535211267605634,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.36619718309859156,
400
+ "accuracy_ci_low": 0.2535211267605634,
401
+ "accuracy_ci_high": 0.4788732394366197,
402
+ "score_name": "accuracy",
403
+ "score": 0.36619718309859156,
404
+ "score_ci_high": 0.4788732394366197,
405
+ "score_ci_low": 0.2535211267605634,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.36619718309859156,
410
+ "accuracy_ci_low": 0.2535211267605634,
411
+ "accuracy_ci_high": 0.4788732394366197,
412
+ "score_name": "accuracy",
413
+ "score": 0.36619718309859156,
414
+ "score_ci_high": 0.4788732394366197,
415
+ "score_ci_low": 0.2535211267605634,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.1267605633802817,
420
+ "accuracy_ci_low": 0.056338028169014086,
421
+ "accuracy_ci_high": 0.22535211267605634,
422
+ "score_name": "accuracy",
423
+ "score": 0.1267605633802817,
424
+ "score_ci_high": 0.22535211267605634,
425
+ "score_ci_low": 0.056338028169014086,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.22535211267605634,
430
+ "accuracy_ci_low": 0.14084507042253522,
431
+ "accuracy_ci_high": 0.323943661971831,
432
+ "score_name": "accuracy",
433
+ "score": 0.22535211267605634,
434
+ "score_ci_high": 0.323943661971831,
435
+ "score_ci_low": 0.14084507042253522,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.4084507042253521,
440
+ "accuracy_ci_low": 0.30985915492957744,
441
+ "accuracy_ci_high": 0.5352112676056338,
442
+ "score_name": "accuracy",
443
+ "score": 0.4084507042253521,
444
+ "score_ci_high": 0.5352112676056338,
445
+ "score_ci_low": 0.30985915492957744,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.29577464788732394,
450
+ "accuracy_ci_low": 0.19718309859154928,
451
+ "accuracy_ci_high": 0.4084507042253521,
452
+ "score_name": "accuracy",
453
+ "score": 0.29577464788732394,
454
+ "score_ci_high": 0.4084507042253521,
455
+ "score_ci_low": 0.19718309859154928,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.5352112676056338,
460
+ "accuracy_ci_low": 0.4084507042253521,
461
+ "accuracy_ci_high": 0.647887323943662,
462
+ "score_name": "accuracy",
463
+ "score": 0.5352112676056338,
464
+ "score_ci_high": 0.647887323943662,
465
+ "score_ci_low": 0.4084507042253521,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.33702213279678067,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.2696554985630616,
475
+ "f1_suggestive": 0.2727272727272727,
476
+ "f1_arbitrary": 0.43137254901960786,
477
+ "f1_generic": 0.11764705882352941,
478
+ "f1_fanciful": 0.2,
479
+ "f1_descriptive": 0.32653061224489793,
480
+ "f1_macro_ci_low": 0.18689773936584586,
481
+ "f1_macro_ci_high": 0.37923074712363225,
482
+ "score_name": "f1_micro",
483
+ "score": 0.31446540880503143,
484
+ "score_ci_high": 0.42038216560509556,
485
+ "score_ci_low": 0.21656050955414013,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.29411764705882354,
488
+ "accuracy_ci_low": 0.2,
489
+ "accuracy_ci_high": 0.4,
490
+ "f1_micro": 0.31446540880503143,
491
+ "f1_micro_ci_low": 0.21656050955414013,
492
+ "f1_micro_ci_high": 0.42038216560509556
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.5388253241800153,
496
+ "f1_no": 0.7298245614035088,
497
+ "f1_yes": 0.34782608695652173,
498
+ "f1_macro_ci_low": 0.47191290375757455,
499
+ "f1_macro_ci_high": 0.6216206779092042,
500
+ "score_name": "f1_micro",
501
+ "score": 0.636604774535809,
502
+ "score_ci_high": 0.6985040092826637,
503
+ "score_ci_low": 0.5691144311757004,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.6,
506
+ "accuracy_ci_low": 0.53,
507
+ "accuracy_ci_high": 0.665,
508
+ "f1_micro": 0.636604774535809,
509
+ "f1_micro_ci_low": 0.5691144311757004,
510
+ "f1_micro_ci_high": 0.6985040092826637
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.2947177227927682,
514
+ "f1_conclusion": 0.2127659574468085,
515
+ "f1_decree": 0.23529411764705882,
516
+ "f1_issue": 0.2711864406779661,
517
+ "f1_rule": 0.42857142857142855,
518
+ "f1_analysis": 0.4444444444444444,
519
+ "f1_facts": 0.21621621621621623,
520
+ "f1_procedural history": 0.2545454545454545,
521
+ "f1_macro_ci_low": 0.23794703715833648,
522
+ "f1_macro_ci_high": 0.36665623309642204,
523
+ "score_name": "f1_micro",
524
+ "score": 0.30409356725146197,
525
+ "score_ci_high": 0.3711587285161421,
526
+ "score_ci_low": 0.23855266549315363,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.26,
529
+ "accuracy_ci_low": 0.2,
530
+ "accuracy_ci_high": 0.32,
531
+ "f1_micro": 0.30409356725146197,
532
+ "f1_micro_ci_low": 0.23855266549315363,
533
+ "f1_micro_ci_high": 0.3711587285161421
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.49092908191313905,
537
+ "f1_yes": 0.5700934579439252,
538
+ "f1_no": 0.4117647058823529,
539
+ "f1_macro_ci_low": 0.4178065856787266,
540
+ "f1_macro_ci_high": 0.5601203681213927,
541
+ "score_name": "f1_micro",
542
+ "score": 0.5,
543
+ "score_ci_high": 0.566970455032283,
544
+ "score_ci_low": 0.42555336134062,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.48,
547
+ "accuracy_ci_low": 0.405,
548
+ "accuracy_ci_high": 0.545,
549
+ "f1_micro": 0.5,
550
+ "f1_micro_ci_low": 0.42555336134062,
551
+ "f1_micro_ci_high": 0.566970455032283
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.8315276273022751,
555
+ "f1_yes": 0.8169014084507042,
556
+ "f1_no": 0.8461538461538461,
557
+ "f1_macro_ci_low": 0.7549023325928579,
558
+ "f1_macro_ci_high": 0.890440353074843,
559
+ "score_name": "f1_micro",
560
+ "score": 0.8322147651006712,
561
+ "score_ci_high": 0.8903225806451613,
562
+ "score_ci_low": 0.7554946760306516,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.7294117647058823,
565
+ "accuracy_ci_low": 0.6352941176470588,
566
+ "accuracy_ci_high": 0.8117647058823529,
567
+ "f1_micro": 0.8322147651006712,
568
+ "f1_micro_ci_low": 0.7554946760306516,
569
+ "f1_micro_ci_high": 0.8903225806451613
570
+ },
571
+ "score": 0.5174757031385947,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.42272407811143237,
578
+ "f1_cars": 0.6078431372549019,
579
+ "f1_pc hardware": 0.34080717488789236,
580
+ "f1_windows x": 0.029850746268656716,
581
+ "f1_computer graphics": 0.4367816091954023,
582
+ "f1_atheism": 0.21739130434782608,
583
+ "f1_religion": 0.23300970873786409,
584
+ "f1_medicine": 0.8641975308641975,
585
+ "f1_christianity": 0.1694915254237288,
586
+ "f1_microsoft windows": 0.39436619718309857,
587
+ "f1_middle east": 0.43037974683544306,
588
+ "f1_politics": 0.291970802919708,
589
+ "f1_motorcycles": 0.43902439024390244,
590
+ "f1_mac hardware": 0.09090909090909091,
591
+ "f1_for sale": 0.625,
592
+ "f1_guns": 0.18181818181818182,
593
+ "f1_space": 0.5569620253164557,
594
+ "f1_cryptography": 0.4482758620689655,
595
+ "f1_baseball": 0.8545454545454545,
596
+ "f1_hockey": 0.859504132231405,
597
+ "f1_electronics": 0.38235294117647056,
598
+ "f1_macro_ci_low": 0.3988534736802405,
599
+ "f1_macro_ci_high": 0.4557473948035634,
600
+ "score_name": "f1_micro",
601
+ "score": 0.44368600682593856,
602
+ "score_ci_high": 0.47444463958776134,
603
+ "score_ci_low": 0.4135801299006492,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.39,
606
+ "accuracy_ci_low": 0.36,
607
+ "accuracy_ci_high": 0.418,
608
+ "f1_micro": 0.44368600682593856,
609
+ "f1_micro_ci_low": 0.4135801299006492,
610
+ "f1_micro_ci_high": 0.47444463958776134
611
+ },
612
+ "score": 0.44368600682593856,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.6105828707367139,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9035153328347045,
620
+ "f1_credit card or prepaid card": 0.5873015873015873,
621
+ "f1_debt collection": 0.6375,
622
+ "f1_checking or savings account": 0.75,
623
+ "f1_money transfer or virtual currency or money service": 0.5777777777777777,
624
+ "f1_vehicle loan or lease": 0.4666666666666667,
625
+ "f1_mortgage": 0.6785714285714286,
626
+ "f1_payday loan or title loan or personal loan": 0.17391304347826086,
627
+ "f1_student loan": 0.72,
628
+ "f1_macro_ci_low": 0.5575796516691159,
629
+ "f1_macro_ci_high": 0.6705972502098242,
630
+ "score_name": "f1_micro",
631
+ "score": 0.8195173137460651,
632
+ "score_ci_high": 0.842436974789916,
633
+ "score_ci_low": 0.7946166113913405,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.781,
636
+ "accuracy_ci_low": 0.752,
637
+ "accuracy_ci_high": 0.806,
638
+ "f1_micro": 0.8195173137460651,
639
+ "f1_micro_ci_low": 0.7946166113913405,
640
+ "f1_micro_ci_high": 0.842436974789916
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.7132677588870594,
644
+ "f1_mortgages and loans": 0.7771428571428571,
645
+ "f1_credit card": 0.7023809523809523,
646
+ "f1_debt collection": 0.6854460093896714,
647
+ "f1_credit reporting": 0.7601476014760148,
648
+ "f1_retail banking": 0.6412213740458015,
649
+ "f1_macro_ci_low": 0.672279823384184,
650
+ "f1_macro_ci_high": 0.7539657340394554,
651
+ "score_name": "f1_micro",
652
+ "score": 0.7202505219206681,
653
+ "score_ci_high": 0.7576596149340853,
654
+ "score_ci_low": 0.6805865270375967,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.69,
657
+ "accuracy_ci_low": 0.65,
658
+ "accuracy_ci_high": 0.73,
659
+ "f1_micro": 0.7202505219206681,
660
+ "f1_micro_ci_low": 0.6805865270375967,
661
+ "f1_micro_ci_high": 0.7576596149340853
662
+ },
663
+ "score": 0.7698839178333665,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "execution_accuracy": 0.074,
671
+ "program_accuracy": 0.085,
672
+ "score": 0.085,
673
+ "score_name": "program_accuracy",
674
+ "execution_accuracy_ci_low": 0.058,
675
+ "execution_accuracy_ci_high": 0.091,
676
+ "program_accuracy_ci_low": 0.068,
677
+ "program_accuracy_ci_high": 0.102,
678
+ "score_ci_low": 0.068,
679
+ "score_ci_high": 0.102
680
+ },
681
+ "score": 0.085,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.30022844870852566,
688
+ "recall": 0.5840193774846996,
689
+ "f1": 0.3357215148632638,
690
+ "precision_ci_low": 0.28030967471726836,
691
+ "precision_ci_high": 0.32121747414474766,
692
+ "recall_ci_low": 0.565861900260428,
693
+ "recall_ci_high": 0.59971992711831,
694
+ "f1_ci_low": 0.3175124739653954,
695
+ "f1_ci_high": 0.35218969004250933,
696
+ "score_name": "f1",
697
+ "score": 0.3357215148632638,
698
+ "score_ci_high": 0.35218969004250933,
699
+ "score_ci_low": 0.3175124739653954,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6000729690988859,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6848867723842462,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5528717579195896,
704
+ "faithfullness_f1_token_overlap": 0.33597285355913525,
705
+ "faithfullness_recall_token_overlap": 0.27114762054953845,
706
+ "faithfullness_precision_token_overlap": 0.5569731492695155,
707
+ "correctness_f1_token_overlap": 0.3357215148632638,
708
+ "correctness_recall_token_overlap": 0.5840193774846996,
709
+ "correctness_precision_token_overlap": 0.30022844870852566
710
+ },
711
+ "score": 0.3357215148632638,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.455,
718
+ "accuracy_ci_low": 0.423,
719
+ "accuracy_ci_high": 0.485,
720
+ "score_name": "accuracy",
721
+ "score": 0.455,
722
+ "score_ci_high": 0.485,
723
+ "score_ci_low": 0.423,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.782,
728
+ "accuracy_ci_low": 0.744,
729
+ "accuracy_ci_high": 0.818,
730
+ "score_name": "accuracy",
731
+ "score": 0.782,
732
+ "score_ci_high": 0.818,
733
+ "score_ci_low": 0.744,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.6185,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 0.0,
743
+ "severity_high": 0.0,
744
+ "severity_medium": 0.0,
745
+ "severity_low": 100.0,
746
+ "category_harmful_info": 0.8662501443103547,
747
+ "category_deception": 0.8769584958149929,
748
+ "category_explicit_content": 0.8376609406085929,
749
+ "category_violence": 0.8456707175466277,
750
+ "category_discrimination": 0.8991804392564864,
751
+ "category_substance_abuse": 0.8935730718904071,
752
+ "category_pii": 0.8467056638112774,
753
+ "safety": 0.860165383811626,
754
+ "safety_ci_low": 0.8426579493684276,
755
+ "safety_ci_high": 0.8778207056405374,
756
+ "score_name": "safety",
757
+ "score": 0.860165383811626,
758
+ "score_ci_high": 0.8778207056405374,
759
+ "score_ci_low": 0.8426579493684276,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.860165383811626,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rougeLsum": 0.3503820014906059,
770
+ "rouge1": 0.4200585739584912,
771
+ "rouge2": 0.20109007723824623,
772
+ "rougeL": 0.28749112328021914,
773
+ "score": 0.28749112328021914,
774
+ "score_name": "rougeL",
775
+ "rougeLsum_ci_low": 0.341921573094731,
776
+ "rougeLsum_ci_high": 0.35863585426859207,
777
+ "rouge1_ci_low": 0.41035793857223635,
778
+ "rouge1_ci_high": 0.4281932704537228,
779
+ "rouge2_ci_low": 0.19416899053732958,
780
+ "rouge2_ci_high": 0.20872476773642967,
781
+ "rougeL_ci_low": 0.2804794753326623,
782
+ "rougeL_ci_high": 0.29447838537921134,
783
+ "score_ci_low": 0.2804794753326623,
784
+ "score_ci_high": 0.29447838537921134
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rougeLsum": 0.0922932399263996,
789
+ "rouge1": 0.11247814548815566,
790
+ "rouge2": 0.015117853576507847,
791
+ "rougeL": 0.07979202357473647,
792
+ "score": 0.07979202357473647,
793
+ "score_name": "rougeL",
794
+ "rougeLsum_ci_low": 0.0880597944044916,
795
+ "rougeLsum_ci_high": 0.09606464509440052,
796
+ "rouge1_ci_low": 0.10733708561154955,
797
+ "rouge1_ci_high": 0.11723898467910755,
798
+ "rouge2_ci_low": 0.01362250797390663,
799
+ "rouge2_ci_high": 0.0168799885499115,
800
+ "rougeL_ci_low": 0.0764789144644062,
801
+ "rougeL_ci_high": 0.08304032568245756,
802
+ "score_ci_low": 0.0764789144644062,
803
+ "score_ci_high": 0.08304032568245756
804
+ },
805
+ "score": 0.1836415734274778,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 1154,
814
+ 637,
815
+ 382,
816
+ 237
817
+ ],
818
+ "totals": [
819
+ 3013,
820
+ 2947,
821
+ 2881,
822
+ 2815
823
+ ],
824
+ "precisions": [
825
+ 0.383006969797544,
826
+ 0.2161520190023753,
827
+ 0.13259284970496357,
828
+ 0.08419182948490231
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 3013,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.17435684678472682,
834
+ "score": 0.17435684678472682,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.12709535962365245,
837
+ "score_ci_high": 0.21064271607309265,
838
+ "sacrebleu_ci_low": 0.12709535962365245,
839
+ "sacrebleu_ci_high": 0.21064271607309265
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1215,
845
+ 695,
846
+ 422,
847
+ 256
848
+ ],
849
+ "totals": [
850
+ 3433,
851
+ 3367,
852
+ 3301,
853
+ 3235
854
+ ],
855
+ "precisions": [
856
+ 0.35391785610253423,
857
+ 0.20641520641520641,
858
+ 0.12784004847016056,
859
+ 0.07913446676970634
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 3433,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.16488046075977367,
865
+ "score": 0.16488046075977367,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.12825986690370522,
868
+ "score_ci_high": 0.20812836267228596,
869
+ "sacrebleu_ci_low": 0.12825986690370522,
870
+ "sacrebleu_ci_high": 0.20812836267228596
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 726,
876
+ 321,
877
+ 159,
878
+ 82
879
+ ],
880
+ "totals": [
881
+ 2297,
882
+ 2231,
883
+ 2165,
884
+ 2099
885
+ ],
886
+ "precisions": [
887
+ 0.3160644318676535,
888
+ 0.14388166741371583,
889
+ 0.07344110854503465,
890
+ 0.03906622201048118
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 2297,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.10687605905530678,
896
+ "score": 0.10687605905530678,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.08639846348006232,
899
+ "score_ci_high": 0.13425269082562755,
900
+ "sacrebleu_ci_low": 0.08639846348006232,
901
+ "sacrebleu_ci_high": 0.13425269082562755
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 1066,
907
+ 564,
908
+ 332,
909
+ 194
910
+ ],
911
+ "totals": [
912
+ 2300,
913
+ 2234,
914
+ 2168,
915
+ 2102
916
+ ],
917
+ "precisions": [
918
+ 0.46347826086956523,
919
+ 0.252461951656222,
920
+ 0.15313653136531366,
921
+ 0.0922930542340628
922
+ ],
923
+ "bp": 1.0,
924
+ "sys_len": 2300,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.2016593123773307,
927
+ "score": 0.2016593123773307,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.177292145733578,
930
+ "score_ci_high": 0.24439707428713803,
931
+ "sacrebleu_ci_low": 0.177292145733578,
932
+ "sacrebleu_ci_high": 0.24439707428713803
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1409,
938
+ 950,
939
+ 692,
940
+ 517
941
+ ],
942
+ "totals": [
943
+ 3275,
944
+ 3209,
945
+ 3143,
946
+ 3077
947
+ ],
948
+ "precisions": [
949
+ 0.4302290076335878,
950
+ 0.2960423808039888,
951
+ 0.2201718103722558,
952
+ 0.168020799480013
953
+ ],
954
+ "bp": 1.0,
955
+ "sys_len": 3275,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.2619959538476516,
958
+ "score": 0.2619959538476516,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.21071110880640612,
961
+ "score_ci_high": 0.30599931494111227,
962
+ "sacrebleu_ci_low": 0.21071110880640612,
963
+ "sacrebleu_ci_high": 0.30599931494111227
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 1096,
969
+ 465,
970
+ 233,
971
+ 132
972
+ ],
973
+ "totals": [
974
+ 3883,
975
+ 3817,
976
+ 3751,
977
+ 3685
978
+ ],
979
+ "precisions": [
980
+ 0.28225598763842386,
981
+ 0.12182342153523709,
982
+ 0.0621167688616369,
983
+ 0.03582089552238806
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 3883,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.09352545142421302,
989
+ "score": 0.09352545142421302,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.0763987126727994,
992
+ "score_ci_high": 0.11617390981932266,
993
+ "sacrebleu_ci_low": 0.0763987126727994,
994
+ "sacrebleu_ci_high": 0.11617390981932266
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1328,
1000
+ 850,
1001
+ 588,
1002
+ 412
1003
+ ],
1004
+ "totals": [
1005
+ 3030,
1006
+ 2964,
1007
+ 2898,
1008
+ 2832
1009
+ ],
1010
+ "precisions": [
1011
+ 0.4382838283828383,
1012
+ 0.286774628879892,
1013
+ 0.2028985507246377,
1014
+ 0.14548022598870058
1015
+ ],
1016
+ "bp": 1.0,
1017
+ "sys_len": 3030,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.2467997817029595,
1020
+ "score": 0.2467997817029595,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.193392163449652,
1023
+ "score_ci_high": 0.2974642241791255,
1024
+ "sacrebleu_ci_low": 0.193392163449652,
1025
+ "sacrebleu_ci_high": 0.2974642241791255
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 930,
1031
+ 400,
1032
+ 214,
1033
+ 123
1034
+ ],
1035
+ "totals": [
1036
+ 2961,
1037
+ 2895,
1038
+ 2829,
1039
+ 2763
1040
+ ],
1041
+ "precisions": [
1042
+ 0.3140830800405269,
1043
+ 0.1381692573402418,
1044
+ 0.07564510427712973,
1045
+ 0.04451682953311618
1046
+ ],
1047
+ "bp": 1.0,
1048
+ "sys_len": 2961,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.1099487393546487,
1051
+ "score": 0.1099487393546487,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.08284384518142485,
1054
+ "score_ci_high": 0.13880651312628609,
1055
+ "sacrebleu_ci_low": 0.08284384518142485,
1056
+ "sacrebleu_ci_high": 0.13880651312628609
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1217,
1062
+ 624,
1063
+ 347,
1064
+ 198
1065
+ ],
1066
+ "totals": [
1067
+ 3045,
1068
+ 2979,
1069
+ 2913,
1070
+ 2847
1071
+ ],
1072
+ "precisions": [
1073
+ 0.399671592775041,
1074
+ 0.20946626384692849,
1075
+ 0.11912118091314795,
1076
+ 0.06954689146469968
1077
+ ],
1078
+ "bp": 1.0,
1079
+ "sys_len": 3045,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.1622822499255264,
1082
+ "score": 0.1622822499255264,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.13321857221475644,
1085
+ "score_ci_high": 0.19390301665624113,
1086
+ "sacrebleu_ci_low": 0.13321857221475644,
1087
+ "sacrebleu_ci_high": 0.19390301665624113
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1236,
1093
+ 735,
1094
+ 470,
1095
+ 308
1096
+ ],
1097
+ "totals": [
1098
+ 2952,
1099
+ 2886,
1100
+ 2820,
1101
+ 2754
1102
+ ],
1103
+ "precisions": [
1104
+ 0.4186991869918699,
1105
+ 0.25467775467775466,
1106
+ 0.16666666666666669,
1107
+ 0.11183732752360204
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 2952,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.2111456628673961,
1113
+ "score": 0.2111456628673961,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.1728340034401921,
1116
+ "score_ci_high": 0.26908287892628974,
1117
+ "sacrebleu_ci_low": 0.1728340034401921,
1118
+ "sacrebleu_ci_high": 0.26908287892628974
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 1018,
1124
+ 437,
1125
+ 232,
1126
+ 128
1127
+ ],
1128
+ "totals": [
1129
+ 3130,
1130
+ 3064,
1131
+ 2998,
1132
+ 2932
1133
+ ],
1134
+ "precisions": [
1135
+ 0.3252396166134185,
1136
+ 0.14262402088772846,
1137
+ 0.07738492328218813,
1138
+ 0.04365620736698499
1139
+ ],
1140
+ "bp": 1.0,
1141
+ "sys_len": 3130,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.11188570922324435,
1144
+ "score": 0.11188570922324435,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.09154049326122426,
1147
+ "score_ci_high": 0.13827539969992217,
1148
+ "sacrebleu_ci_low": 0.09154049326122426,
1149
+ "sacrebleu_ci_high": 0.13827539969992217
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 986,
1155
+ 447,
1156
+ 233,
1157
+ 127
1158
+ ],
1159
+ "totals": [
1160
+ 3637,
1161
+ 3571,
1162
+ 3505,
1163
+ 3439
1164
+ ],
1165
+ "precisions": [
1166
+ 0.27110255705251585,
1167
+ 0.12517502100252031,
1168
+ 0.06647646219686162,
1169
+ 0.03692933992439663
1170
+ ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 3637,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.09553723823741646,
1175
+ "score": 0.09553723823741646,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.06933902828362079,
1178
+ "score_ci_high": 0.1273472328564688,
1179
+ "sacrebleu_ci_low": 0.06933902828362079,
1180
+ "sacrebleu_ci_high": 0.1273472328564688
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1286,
1186
+ 834,
1187
+ 587,
1188
+ 419
1189
+ ],
1190
+ "totals": [
1191
+ 3404,
1192
+ 3338,
1193
+ 3272,
1194
+ 3206
1195
+ ],
1196
+ "precisions": [
1197
+ 0.37779083431257343,
1198
+ 0.24985020970641103,
1199
+ 0.17940097799511,
1200
+ 0.13069245165315035
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 3404,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.21689603438287544,
1206
+ "score": 0.21689603438287544,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.18174547190909165,
1209
+ "score_ci_high": 0.2734022486576191,
1210
+ "sacrebleu_ci_low": 0.18174547190909165,
1211
+ "sacrebleu_ci_high": 0.2734022486576191
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1208,
1217
+ 675,
1218
+ 430,
1219
+ 279
1220
+ ],
1221
+ "totals": [
1222
+ 3677,
1223
+ 3611,
1224
+ 3545,
1225
+ 3479
1226
+ ],
1227
+ "precisions": [
1228
+ 0.32852869186837097,
1229
+ 0.1869288285793409,
1230
+ 0.12129760225669958,
1231
+ 0.08019545846507617
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 3677,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.15633740352446387,
1237
+ "score": 0.15633740352446387,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.12255450743419968,
1240
+ "score_ci_high": 0.17971859902386644,
1241
+ "sacrebleu_ci_low": 0.12255450743419968,
1242
+ "sacrebleu_ci_high": 0.17971859902386644
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1135,
1248
+ 581,
1249
+ 336,
1250
+ 202
1251
+ ],
1252
+ "totals": [
1253
+ 3533,
1254
+ 3467,
1255
+ 3401,
1256
+ 3335
1257
+ ],
1258
+ "precisions": [
1259
+ 0.3212567223322955,
1260
+ 0.16758004038073263,
1261
+ 0.09879447221405468,
1262
+ 0.06056971514242879
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 3533,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.133972503470666,
1268
+ "score": 0.133972503470666,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.10251876459928583,
1271
+ "score_ci_high": 0.17481307519673603,
1272
+ "sacrebleu_ci_low": 0.10251876459928583,
1273
+ "sacrebleu_ci_high": 0.17481307519673603
1274
+ },
1275
+ "score": 0.1632066271292133,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.45259314123432515,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
results/bluebench/2025-06-19T17-18-35_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-19T21:18:30.246956Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-2-1b-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/meta-llama/llama-3-2-1b-instruct",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.34444444444444444,
180
+ "accuracy_ci_low": 0.24444444444444444,
181
+ "accuracy_ci_high": 0.4444444444444444,
182
+ "score_name": "accuracy",
183
+ "score": 0.34444444444444444,
184
+ "score_ci_high": 0.4444444444444444,
185
+ "score_ci_low": 0.24444444444444444,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.37777777777777777,
190
+ "accuracy_ci_low": 0.2777777777777778,
191
+ "accuracy_ci_high": 0.4817573779444034,
192
+ "score_name": "accuracy",
193
+ "score": 0.37777777777777777,
194
+ "score_ci_high": 0.4817573779444034,
195
+ "score_ci_low": 0.2777777777777778,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.4222222222222222,
200
+ "accuracy_ci_low": 0.32222222222222224,
201
+ "accuracy_ci_high": 0.5222222222222223,
202
+ "score_name": "accuracy",
203
+ "score": 0.4222222222222222,
204
+ "score_ci_high": 0.5222222222222223,
205
+ "score_ci_low": 0.32222222222222224,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 0.3888888888888889,
210
+ "accuracy_ci_low": 0.28888888888888886,
211
+ "accuracy_ci_high": 0.4888888888888889,
212
+ "score_name": "accuracy",
213
+ "score": 0.3888888888888889,
214
+ "score_ci_high": 0.4888888888888889,
215
+ "score_ci_low": 0.28888888888888886,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.3333333333333333,
220
+ "accuracy_ci_low": 0.24444444444444444,
221
+ "accuracy_ci_high": 0.43333333333333335,
222
+ "score_name": "accuracy",
223
+ "score": 0.3333333333333333,
224
+ "score_ci_high": 0.43333333333333335,
225
+ "score_ci_low": 0.24444444444444444,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.45555555555555555,
230
+ "accuracy_ci_low": 0.35555555555555557,
231
+ "accuracy_ci_high": 0.5555555555555556,
232
+ "score_name": "accuracy",
233
+ "score": 0.45555555555555555,
234
+ "score_ci_high": 0.5555555555555556,
235
+ "score_ci_low": 0.35555555555555557,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.4777777777777778,
240
+ "accuracy_ci_low": 0.37777777777777777,
241
+ "accuracy_ci_high": 0.5888888888888889,
242
+ "score_name": "accuracy",
243
+ "score": 0.4777777777777778,
244
+ "score_ci_high": 0.5888888888888889,
245
+ "score_ci_low": 0.37777777777777777,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.4222222222222222,
250
+ "accuracy_ci_low": 0.3333333333333333,
251
+ "accuracy_ci_high": 0.5333333333333333,
252
+ "score_name": "accuracy",
253
+ "score": 0.4222222222222222,
254
+ "score_ci_high": 0.5333333333333333,
255
+ "score_ci_low": 0.3333333333333333,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.4888888888888889,
260
+ "accuracy_ci_low": 0.37777777777777777,
261
+ "accuracy_ci_high": 0.5888888888888889,
262
+ "score_name": "accuracy",
263
+ "score": 0.4888888888888889,
264
+ "score_ci_high": 0.5888888888888889,
265
+ "score_ci_low": 0.37777777777777777,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.5111111111111111,
270
+ "accuracy_ci_low": 0.4111111111111111,
271
+ "accuracy_ci_high": 0.6111111111111112,
272
+ "score_name": "accuracy",
273
+ "score": 0.5111111111111111,
274
+ "score_ci_high": 0.6111111111111112,
275
+ "score_ci_low": 0.4111111111111111,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.4,
280
+ "accuracy_ci_low": 0.3,
281
+ "accuracy_ci_high": 0.5111111111111111,
282
+ "score_name": "accuracy",
283
+ "score": 0.4,
284
+ "score_ci_high": 0.5111111111111111,
285
+ "score_ci_low": 0.3,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.4202020202020202,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.5,
296
+ "score": 0.5,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.5,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.4545454545454546,
307
+ "f1_Organization": 0.2292490118577075,
308
+ "f1_Location": 0.227027027027027,
309
+ "f1_macro": 0.3036071644767297,
310
+ "recall_macro": 0.22361127874697093,
311
+ "precision_macro": 0.5114786350741407,
312
+ "in_classes_support": 0.7476923076923077,
313
+ "f1_micro": 0.2941176470588235,
314
+ "recall_micro": 0.23809523809523808,
315
+ "precision_micro": 0.38461538461538464,
316
+ "score": 0.2941176470588235,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.258452733561647,
319
+ "score_ci_high": 0.33538361058823213,
320
+ "f1_micro_ci_low": 0.258452733561647,
321
+ "f1_micro_ci_high": 0.33538361058823213
322
+ },
323
+ "score": 0.2941176470588235,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.30985915492957744,
330
+ "accuracy_ci_low": 0.19718309859154928,
331
+ "accuracy_ci_high": 0.428782341390215,
332
+ "score_name": "accuracy",
333
+ "score": 0.30985915492957744,
334
+ "score_ci_high": 0.428782341390215,
335
+ "score_ci_low": 0.19718309859154928,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.15492957746478872,
340
+ "accuracy_ci_low": 0.08450704225352113,
341
+ "accuracy_ci_high": 0.2535211267605634,
342
+ "score_name": "accuracy",
343
+ "score": 0.15492957746478872,
344
+ "score_ci_high": 0.2535211267605634,
345
+ "score_ci_low": 0.08450704225352113,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.18309859154929578,
350
+ "accuracy_ci_low": 0.11267605633802817,
351
+ "accuracy_ci_high": 0.28169014084507044,
352
+ "score_name": "accuracy",
353
+ "score": 0.18309859154929578,
354
+ "score_ci_high": 0.28169014084507044,
355
+ "score_ci_low": 0.11267605633802817,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.19718309859154928,
360
+ "accuracy_ci_low": 0.1267605633802817,
361
+ "accuracy_ci_high": 0.30985915492957744,
362
+ "score_name": "accuracy",
363
+ "score": 0.19718309859154928,
364
+ "score_ci_high": 0.30985915492957744,
365
+ "score_ci_low": 0.1267605633802817,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.4084507042253521,
370
+ "accuracy_ci_low": 0.29577464788732394,
371
+ "accuracy_ci_high": 0.5211267605633803,
372
+ "score_name": "accuracy",
373
+ "score": 0.4084507042253521,
374
+ "score_ci_high": 0.5211267605633803,
375
+ "score_ci_low": 0.29577464788732394,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.15492957746478872,
380
+ "accuracy_ci_low": 0.08450704225352113,
381
+ "accuracy_ci_high": 0.2535211267605634,
382
+ "score_name": "accuracy",
383
+ "score": 0.15492957746478872,
384
+ "score_ci_high": 0.2535211267605634,
385
+ "score_ci_low": 0.08450704225352113,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.16901408450704225,
390
+ "accuracy_ci_low": 0.09859154929577464,
391
+ "accuracy_ci_high": 0.2535211267605634,
392
+ "score_name": "accuracy",
393
+ "score": 0.16901408450704225,
394
+ "score_ci_high": 0.2535211267605634,
395
+ "score_ci_low": 0.09859154929577464,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.18309859154929578,
400
+ "accuracy_ci_low": 0.09859154929577464,
401
+ "accuracy_ci_high": 0.28169014084507044,
402
+ "score_name": "accuracy",
403
+ "score": 0.18309859154929578,
404
+ "score_ci_high": 0.28169014084507044,
405
+ "score_ci_low": 0.09859154929577464,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.11267605633802817,
410
+ "accuracy_ci_low": 0.056338028169014086,
411
+ "accuracy_ci_high": 0.2112676056338028,
412
+ "score_name": "accuracy",
413
+ "score": 0.11267605633802817,
414
+ "score_ci_high": 0.2112676056338028,
415
+ "score_ci_low": 0.056338028169014086,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.09859154929577464,
420
+ "accuracy_ci_low": 0.04225352112676056,
421
+ "accuracy_ci_high": 0.18309859154929578,
422
+ "score_name": "accuracy",
423
+ "score": 0.09859154929577464,
424
+ "score_ci_high": 0.18309859154929578,
425
+ "score_ci_low": 0.04225352112676056,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.15492957746478872,
430
+ "accuracy_ci_low": 0.08450704225352113,
431
+ "accuracy_ci_high": 0.2645029324911099,
432
+ "score_name": "accuracy",
433
+ "score": 0.15492957746478872,
434
+ "score_ci_high": 0.2645029324911099,
435
+ "score_ci_low": 0.08450704225352113,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.16901408450704225,
440
+ "accuracy_ci_low": 0.08450704225352113,
441
+ "accuracy_ci_high": 0.2676056338028169,
442
+ "score_name": "accuracy",
443
+ "score": 0.16901408450704225,
444
+ "score_ci_high": 0.2676056338028169,
445
+ "score_ci_low": 0.08450704225352113,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.04225352112676056,
450
+ "accuracy_ci_low": 0.014084507042253521,
451
+ "accuracy_ci_high": 0.11267605633802817,
452
+ "score_name": "accuracy",
453
+ "score": 0.04225352112676056,
454
+ "score_ci_high": 0.11267605633802817,
455
+ "score_ci_low": 0.014084507042253521,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.38028169014084506,
460
+ "accuracy_ci_low": 0.2676056338028169,
461
+ "accuracy_ci_high": 0.49295774647887325,
462
+ "score_name": "accuracy",
463
+ "score": 0.38028169014084506,
464
+ "score_ci_high": 0.49295774647887325,
465
+ "score_ci_low": 0.2676056338028169,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.19416498993963782,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.23961477119371857,
475
+ "f1_suggestive": 0.2564102564102564,
476
+ "f1_generic": 0.22727272727272727,
477
+ "f1_fanciful": 0.3076923076923077,
478
+ "f1_descriptive": 0.09090909090909091,
479
+ "f1_arbitrary": 0.3157894736842105,
480
+ "f1_macro_ci_low": 0.16470343495436598,
481
+ "f1_macro_ci_high": 0.3402970569238248,
482
+ "score_name": "f1_micro",
483
+ "score": 0.2485207100591716,
484
+ "score_ci_high": 0.3565344458058143,
485
+ "score_ci_low": 0.16674772165037405,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.24705882352941178,
488
+ "accuracy_ci_low": 0.16470588235294117,
489
+ "accuracy_ci_high": 0.35294117647058826,
490
+ "f1_micro": 0.2485207100591716,
491
+ "f1_micro_ci_low": 0.16674772165037405,
492
+ "f1_micro_ci_high": 0.3565344458058143
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.45749941134918765,
496
+ "f1_no": 0.656934306569343,
497
+ "f1_yes": 0.25806451612903225,
498
+ "f1_macro_ci_low": 0.4007210254458121,
499
+ "f1_macro_ci_high": 0.523830186580906,
500
+ "score_name": "f1_micro",
501
+ "score": 0.5326633165829145,
502
+ "score_ci_high": 0.5979899497487438,
503
+ "score_ci_low": 0.4676003540226054,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.53,
506
+ "accuracy_ci_low": 0.465,
507
+ "accuracy_ci_high": 0.595,
508
+ "f1_micro": 0.5326633165829145,
509
+ "f1_micro_ci_low": 0.4676003540226054,
510
+ "f1_micro_ci_high": 0.5979899497487438
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.14157824173747313,
514
+ "f1_conclusion": 0.08333333333333333,
515
+ "f1_analysis": 0.2900763358778626,
516
+ "f1_decree": 0.06666666666666667,
517
+ "f1_issue": 0.047619047619047616,
518
+ "f1_facts": 0.13333333333333333,
519
+ "f1_rule": 0.1935483870967742,
520
+ "f1_procedural history": 0.17647058823529413,
521
+ "f1_macro_ci_low": 0.09927560143449254,
522
+ "f1_macro_ci_high": 0.19963080582055887,
523
+ "score_name": "f1_micro",
524
+ "score": 0.17857142857142858,
525
+ "score_ci_high": 0.23469387755102042,
526
+ "score_ci_low": 0.1235825927993309,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.175,
529
+ "accuracy_ci_low": 0.12,
530
+ "accuracy_ci_high": 0.23,
531
+ "f1_micro": 0.17857142857142858,
532
+ "f1_micro_ci_low": 0.1235825927993309,
533
+ "f1_micro_ci_high": 0.23469387755102042
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.41428571428571426,
537
+ "f1_yes": 0.4857142857142857,
538
+ "f1_no": 0.34285714285714286,
539
+ "f1_macro_ci_low": 0.35160188806998965,
540
+ "f1_macro_ci_high": 0.4808674529166947,
541
+ "score_name": "f1_micro",
542
+ "score": 0.42077922077922075,
543
+ "score_ci_high": 0.4846763437420372,
544
+ "score_ci_low": 0.35535075567851304,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.405,
547
+ "accuracy_ci_low": 0.34,
548
+ "accuracy_ci_high": 0.465,
549
+ "f1_micro": 0.42077922077922075,
550
+ "f1_micro_ci_low": 0.35535075567851304,
551
+ "f1_micro_ci_high": 0.4846763437420372
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.6461038961038961,
555
+ "f1_yes": 0.6428571428571429,
556
+ "f1_no": 0.6493506493506493,
557
+ "f1_macro_ci_low": 0.5344060631732589,
558
+ "f1_macro_ci_high": 0.745107042681059,
559
+ "score_name": "f1_micro",
560
+ "score": 0.6459627329192547,
561
+ "score_ci_high": 0.7393939393939394,
562
+ "score_ci_low": 0.5344831234199472,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.611764705882353,
565
+ "accuracy_ci_low": 0.49411764705882355,
566
+ "accuracy_ci_high": 0.7058823529411765,
567
+ "f1_micro": 0.6459627329192547,
568
+ "f1_micro_ci_low": 0.5344831234199472,
569
+ "f1_micro_ci_high": 0.7393939393939394
570
+ },
571
+ "score": 0.40529948178239805,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.2276162418340563,
578
+ "f1_cars": 0.5581395348837209,
579
+ "f1_windows x": 0.0,
580
+ "f1_atheism": 0.14634146341463414,
581
+ "f1_religion": 0.18181818181818182,
582
+ "f1_medicine": 0.2962962962962963,
583
+ "f1_hockey": 0.48936170212765956,
584
+ "f1_christianity": 0.3287671232876712,
585
+ "f1_computer graphics": 0.13513513513513514,
586
+ "f1_microsoft windows": 0.03571428571428571,
587
+ "f1_middle east": 0.125,
588
+ "f1_motorcycles": 0.1917808219178082,
589
+ "f1_mac hardware": 0.0,
590
+ "f1_for sale": 0.0,
591
+ "f1_guns": 0.10714285714285714,
592
+ "f1_politics": 0.2361111111111111,
593
+ "f1_space": 0.39436619718309857,
594
+ "f1_pc hardware": 0.0,
595
+ "f1_cryptography": 0.32432432432432434,
596
+ "f1_baseball": 0.7610619469026548,
597
+ "f1_electronics": 0.24096385542168675,
598
+ "f1_macro_ci_low": 0.20272698040510803,
599
+ "f1_macro_ci_high": 0.2532565570480989,
600
+ "score_name": "f1_micro",
601
+ "score": 0.2679830747531735,
602
+ "score_ci_high": 0.2978873823161142,
603
+ "score_ci_low": 0.2355693496528132,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.19,
606
+ "accuracy_ci_low": 0.166,
607
+ "accuracy_ci_high": 0.213,
608
+ "f1_micro": 0.2679830747531735,
609
+ "f1_micro_ci_low": 0.2355693496528132,
610
+ "f1_micro_ci_high": 0.2978873823161142
611
+ },
612
+ "score": 0.2679830747531735,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.22216862358987682,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.696078431372549,
620
+ "f1_credit card or prepaid card": 0.1518987341772152,
621
+ "f1_debt collection": 0.22535211267605634,
622
+ "f1_checking or savings account": 0.2222222222222222,
623
+ "f1_vehicle loan or lease": 0.08333333333333333,
624
+ "f1_payday loan or title loan or personal loan": 0.0,
625
+ "f1_mortgage": 0.3157894736842105,
626
+ "f1_money transfer or virtual currency or money service": 0.07407407407407407,
627
+ "f1_student loan": 0.23076923076923078,
628
+ "f1_macro_ci_low": 0.1842187730862839,
629
+ "f1_macro_ci_high": 0.27331239167462773,
630
+ "score_name": "f1_micro",
631
+ "score": 0.5611940298507463,
632
+ "score_ci_high": 0.5951679434295816,
633
+ "score_ci_low": 0.5287106773010755,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.47,
636
+ "accuracy_ci_low": 0.43992255182914,
637
+ "accuracy_ci_high": 0.504,
638
+ "f1_micro": 0.5611940298507463,
639
+ "f1_micro_ci_low": 0.5287106773010755,
640
+ "f1_micro_ci_high": 0.5951679434295816
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.328995002480145,
644
+ "f1_mortgages and loans": 0.3076923076923077,
645
+ "f1_debt collection": 0.45038167938931295,
646
+ "f1_credit card": 0.288135593220339,
647
+ "f1_credit reporting": 0.5,
648
+ "f1_retail banking": 0.09876543209876543,
649
+ "f1_macro_ci_low": 0.2869304161724212,
650
+ "f1_macro_ci_high": 0.37644896631739505,
651
+ "score_name": "f1_micro",
652
+ "score": 0.39184597961494905,
653
+ "score_ci_high": 0.43742334452481374,
654
+ "score_ci_low": 0.3475735981074829,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.346,
657
+ "accuracy_ci_low": 0.306,
658
+ "accuracy_ci_high": 0.39,
659
+ "f1_micro": 0.39184597961494905,
660
+ "f1_micro_ci_low": 0.3475735981074829,
661
+ "f1_micro_ci_high": 0.43742334452481374
662
+ },
663
+ "score": 0.47652000473284767,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "program_accuracy": 0.027,
671
+ "score": 0.027,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.022,
674
+ "program_accuracy_ci_low": 0.019,
675
+ "program_accuracy_ci_high": 0.039,
676
+ "score_ci_low": 0.019,
677
+ "score_ci_high": 0.039,
678
+ "execution_accuracy_ci_low": 0.014,
679
+ "execution_accuracy_ci_high": 0.033
680
+ },
681
+ "score": 0.027,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.3192591413018271,
688
+ "recall": 0.38991119820901343,
689
+ "f1": 0.2793088311088913,
690
+ "precision_ci_low": 0.2975520115964286,
691
+ "precision_ci_high": 0.3411640955753825,
692
+ "recall_ci_low": 0.3735203455510533,
693
+ "recall_ci_high": 0.40905456252856565,
694
+ "f1_ci_low": 0.26376571917741815,
695
+ "f1_ci_high": 0.2967523754621306,
696
+ "score_name": "f1",
697
+ "score": 0.2793088311088913,
698
+ "score_ci_high": 0.2967523754621306,
699
+ "score_ci_low": 0.26376571917741815,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5586816079914569,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6062885612249375,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5444585413982471,
704
+ "faithfullness_f1_token_overlap": 0.2284069948073265,
705
+ "faithfullness_recall_token_overlap": 0.1691231317769043,
706
+ "faithfullness_precision_token_overlap": 0.5822539469357956,
707
+ "correctness_f1_token_overlap": 0.2793088311088913,
708
+ "correctness_recall_token_overlap": 0.38991119820901343,
709
+ "correctness_precision_token_overlap": 0.3192591413018271
710
+ },
711
+ "score": 0.2793088311088913,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.268,
718
+ "accuracy_ci_low": 0.241,
719
+ "accuracy_ci_high": 0.294,
720
+ "score_name": "accuracy",
721
+ "score": 0.268,
722
+ "score_ci_high": 0.294,
723
+ "score_ci_low": 0.241,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.536,
728
+ "accuracy_ci_low": 0.496,
729
+ "accuracy_ci_high": 0.582,
730
+ "score_name": "accuracy",
731
+ "score": 0.536,
732
+ "score_ci_high": 0.582,
733
+ "score_ci_low": 0.496,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.402,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 0.0,
743
+ "severity_high": 2.0,
744
+ "severity_medium": 4.0,
745
+ "severity_low": 94.0,
746
+ "category_harmful_info": 0.8100087784142013,
747
+ "category_deception": 0.7944973061482112,
748
+ "category_explicit_content": 0.8078695727192393,
749
+ "category_violence": 0.8026550727940741,
750
+ "category_discrimination": 0.7023729502208649,
751
+ "category_substance_abuse": 0.857964905185832,
752
+ "category_pii": 0.8635181895008794,
753
+ "safety": 0.8019619875691001,
754
+ "safety_ci_low": 0.7757910180183378,
755
+ "safety_ci_high": 0.8246888836931806,
756
+ "score_name": "safety",
757
+ "score": 0.8019619875691001,
758
+ "score_ci_high": 0.8246888836931806,
759
+ "score_ci_low": 0.7757910180183378,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.8019619875691001,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rouge2": 0.20059977802050133,
770
+ "rouge1": 0.40705433873110675,
771
+ "rougeL": 0.2833644024269708,
772
+ "score": 0.2833644024269708,
773
+ "score_name": "rougeL",
774
+ "rougeLsum": 0.3484006202872336,
775
+ "rouge2_ci_low": 0.19350867847403447,
776
+ "rouge2_ci_high": 0.208746306103288,
777
+ "rouge1_ci_low": 0.39709724247372435,
778
+ "rouge1_ci_high": 0.41741048664304065,
779
+ "rougeL_ci_low": 0.27611154671120425,
780
+ "rougeL_ci_high": 0.291401862882032,
781
+ "score_ci_low": 0.27611154671120425,
782
+ "score_ci_high": 0.291401862882032,
783
+ "rougeLsum_ci_low": 0.3393182862844001,
784
+ "rougeLsum_ci_high": 0.35859357766397365
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rouge2": 0.015549345441433063,
789
+ "rouge1": 0.11519799005534682,
790
+ "rougeL": 0.0840670089559512,
791
+ "score": 0.0840670089559512,
792
+ "score_name": "rougeL",
793
+ "rougeLsum": 0.09578949363666936,
794
+ "rouge2_ci_low": 0.013931235527451928,
795
+ "rouge2_ci_high": 0.017483224052864014,
796
+ "rouge1_ci_low": 0.10965978969392036,
797
+ "rouge1_ci_high": 0.12037813563278642,
798
+ "rougeL_ci_low": 0.08040950716646748,
799
+ "rougeL_ci_high": 0.08756332939065774,
800
+ "score_ci_low": 0.08040950716646748,
801
+ "score_ci_high": 0.08756332939065774,
802
+ "rougeLsum_ci_low": 0.0913667622653291,
803
+ "rougeLsum_ci_high": 0.09990468987829387
804
+ },
805
+ "score": 0.183715705691461,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 975,
814
+ 450,
815
+ 239,
816
+ 131
817
+ ],
818
+ "totals": [
819
+ 1762,
820
+ 1696,
821
+ 1630,
822
+ 1564
823
+ ],
824
+ "precisions": [
825
+ 0.5533484676503972,
826
+ 0.2653301886792453,
827
+ 0.14662576687116563,
828
+ 0.08375959079283887
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 1762,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.20606657614931506,
834
+ "score": 0.20606657614931506,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.18092500119305566,
837
+ "score_ci_high": 0.2337621289783969,
838
+ "sacrebleu_ci_low": 0.18092500119305566,
839
+ "sacrebleu_ci_high": 0.2337621289783969
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1120,
845
+ 595,
846
+ 353,
847
+ 211
848
+ ],
849
+ "totals": [
850
+ 1761,
851
+ 1695,
852
+ 1629,
853
+ 1563
854
+ ],
855
+ "precisions": [
856
+ 0.6360022714366838,
857
+ 0.35103244837758113,
858
+ 0.21669736034376919,
859
+ 0.13499680102367242
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 1761,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.2842796401730753,
865
+ "score": 0.2842796401730753,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.25109489325457895,
868
+ "score_ci_high": 0.31935419283000793,
869
+ "sacrebleu_ci_low": 0.25109489325457895,
870
+ "sacrebleu_ci_high": 0.31935419283000793
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 485,
876
+ 117,
877
+ 36,
878
+ 10
879
+ ],
880
+ "totals": [
881
+ 1763,
882
+ 1697,
883
+ 1631,
884
+ 1565
885
+ ],
886
+ "precisions": [
887
+ 0.2750992626205332,
888
+ 0.06894519740718916,
889
+ 0.022072348252605765,
890
+ 0.006389776357827476
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 1763,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.04044193351575661,
896
+ "score": 0.04044193351575661,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.026004571558982913,
899
+ "score_ci_high": 0.05783535488306116,
900
+ "sacrebleu_ci_low": 0.026004571558982913,
901
+ "sacrebleu_ci_high": 0.05783535488306116
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 946,
907
+ 441,
908
+ 236,
909
+ 135
910
+ ],
911
+ "totals": [
912
+ 1822,
913
+ 1756,
914
+ 1690,
915
+ 1624
916
+ ],
917
+ "precisions": [
918
+ 0.5192096597145993,
919
+ 0.2511389521640091,
920
+ 0.13964497041420118,
921
+ 0.08312807881773399
922
+ ],
923
+ "bp": 0.9928903773336073,
924
+ "sys_len": 1822,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.19584332441613614,
927
+ "score": 0.19584332441613614,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.17291558306420995,
930
+ "score_ci_high": 0.2421406469227526,
931
+ "sacrebleu_ci_low": 0.17291558306420995,
932
+ "sacrebleu_ci_high": 0.2421406469227526
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1235,
938
+ 733,
939
+ 491,
940
+ 334
941
+ ],
942
+ "totals": [
943
+ 2003,
944
+ 1937,
945
+ 1871,
946
+ 1805
947
+ ],
948
+ "precisions": [
949
+ 0.6165751372940589,
950
+ 0.37842023748064013,
951
+ 0.26242650988776056,
952
+ 0.1850415512465374
953
+ ],
954
+ "bp": 0.968069571391973,
955
+ "sys_len": 2003,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.31583910573917306,
958
+ "score": 0.31583910573917306,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.28278290783286325,
961
+ "score_ci_high": 0.35809550404773266,
962
+ "sacrebleu_ci_low": 0.28278290783286325,
963
+ "sacrebleu_ci_high": 0.35809550404773266
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 895,
969
+ 316,
970
+ 136,
971
+ 65
972
+ ],
973
+ "totals": [
974
+ 2706,
975
+ 2640,
976
+ 2574,
977
+ 2508
978
+ ],
979
+ "precisions": [
980
+ 0.3307464892830747,
981
+ 0.11969696969696969,
982
+ 0.05283605283605284,
983
+ 0.025917065390749602
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 2706,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.08580718353389435,
989
+ "score": 0.08580718353389435,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.0714070016419881,
992
+ "score_ci_high": 0.11393007113284326,
993
+ "sacrebleu_ci_low": 0.0714070016419881,
994
+ "sacrebleu_ci_high": 0.11393007113284326
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1230,
1000
+ 752,
1001
+ 492,
1002
+ 331
1003
+ ],
1004
+ "totals": [
1005
+ 1871,
1006
+ 1805,
1007
+ 1739,
1008
+ 1673
1009
+ ],
1010
+ "precisions": [
1011
+ 0.6574024585783004,
1012
+ 0.4166204986149584,
1013
+ 0.28292121909143186,
1014
+ 0.19784817692767484
1015
+ ],
1016
+ "bp": 0.976235618350251,
1017
+ "sys_len": 1871,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.3435160489732885,
1020
+ "score": 0.3435160489732885,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.2938775361515651,
1023
+ "score_ci_high": 0.38124144600400245,
1024
+ "sacrebleu_ci_low": 0.2938775361515651,
1025
+ "sacrebleu_ci_high": 0.38124144600400245
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 1021,
1031
+ 502,
1032
+ 284,
1033
+ 169
1034
+ ],
1035
+ "totals": [
1036
+ 1949,
1037
+ 1883,
1038
+ 1817,
1039
+ 1751
1040
+ ],
1041
+ "precisions": [
1042
+ 0.5238583889173936,
1043
+ 0.26659585767392463,
1044
+ 0.15630159603742433,
1045
+ 0.09651627641347801
1046
+ ],
1047
+ "bp": 1.0,
1048
+ "sys_len": 1949,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.21424358052936537,
1051
+ "score": 0.21424358052936537,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.19040761320563873,
1054
+ "score_ci_high": 0.2508897923390456,
1055
+ "sacrebleu_ci_low": 0.19040761320563873,
1056
+ "sacrebleu_ci_high": 0.2508897923390456
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1122,
1062
+ 547,
1063
+ 287,
1064
+ 157
1065
+ ],
1066
+ "totals": [
1067
+ 1974,
1068
+ 1908,
1069
+ 1842,
1070
+ 1776
1071
+ ],
1072
+ "precisions": [
1073
+ 0.5683890577507599,
1074
+ 0.2866876310272537,
1075
+ 0.15580890336590664,
1076
+ 0.0884009009009009
1077
+ ],
1078
+ "bp": 0.9391156766806551,
1079
+ "sys_len": 1974,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.2044066388062864,
1082
+ "score": 0.2044066388062864,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.17477612690667202,
1085
+ "score_ci_high": 0.23063630240994007,
1086
+ "sacrebleu_ci_low": 0.17477612690667202,
1087
+ "sacrebleu_ci_high": 0.23063630240994007
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1184,
1093
+ 706,
1094
+ 459,
1095
+ 307
1096
+ ],
1097
+ "totals": [
1098
+ 1741,
1099
+ 1675,
1100
+ 1609,
1101
+ 1543
1102
+ ],
1103
+ "precisions": [
1104
+ 0.6800689259046525,
1105
+ 0.42149253731343284,
1106
+ 0.2852703542573027,
1107
+ 0.19896305897602073
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 1741,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.35714368713727423,
1113
+ "score": 0.35714368713727423,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.3224604238321309,
1116
+ "score_ci_high": 0.41026435550762275,
1117
+ "sacrebleu_ci_low": 0.3224604238321309,
1118
+ "sacrebleu_ci_high": 0.41026435550762275
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 846,
1124
+ 309,
1125
+ 132,
1126
+ 62
1127
+ ],
1128
+ "totals": [
1129
+ 1698,
1130
+ 1632,
1131
+ 1566,
1132
+ 1500
1133
+ ],
1134
+ "precisions": [
1135
+ 0.49823321554770317,
1136
+ 0.18933823529411764,
1137
+ 0.0842911877394636,
1138
+ 0.04133333333333334
1139
+ ],
1140
+ "bp": 0.9790217565823072,
1141
+ "sys_len": 1698,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.13181982922133714,
1144
+ "score": 0.13181982922133714,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.10251074913175488,
1147
+ "score_ci_high": 0.18667757600957346,
1148
+ "sacrebleu_ci_low": 0.10251074913175488,
1149
+ "sacrebleu_ci_high": 0.18667757600957346
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 817,
1155
+ 278,
1156
+ 122,
1157
+ 63
1158
+ ],
1159
+ "totals": [
1160
+ 1724,
1161
+ 1658,
1162
+ 1592,
1163
+ 1526
1164
+ ],
1165
+ "precisions": [
1166
+ 0.4738979118329466,
1167
+ 0.16767189384800965,
1168
+ 0.07663316582914573,
1169
+ 0.041284403669724766
1170
+ ],
1171
+ "bp": 0.9942163261750401,
1172
+ "sys_len": 1724,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.12518948488181658,
1175
+ "score": 0.12518948488181658,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.10085361909517791,
1178
+ "score_ci_high": 0.1695266774061832,
1179
+ "sacrebleu_ci_low": 0.10085361909517791,
1180
+ "sacrebleu_ci_high": 0.1695266774061832
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1164,
1186
+ 692,
1187
+ 452,
1188
+ 295
1189
+ ],
1190
+ "totals": [
1191
+ 1737,
1192
+ 1671,
1193
+ 1605,
1194
+ 1539
1195
+ ],
1196
+ "precisions": [
1197
+ 0.6701208981001727,
1198
+ 0.41412327947336924,
1199
+ 0.28161993769470406,
1200
+ 0.19168291098115658
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 1737,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.3498504204327118,
1206
+ "score": 0.3498504204327118,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.3154848870877421,
1209
+ "score_ci_high": 0.42165027559439294,
1210
+ "sacrebleu_ci_low": 0.3154848870877421,
1211
+ "sacrebleu_ci_high": 0.42165027559439294
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1200,
1217
+ 704,
1218
+ 458,
1219
+ 314
1220
+ ],
1221
+ "totals": [
1222
+ 1781,
1223
+ 1715,
1224
+ 1649,
1225
+ 1583
1226
+ ],
1227
+ "precisions": [
1228
+ 0.673778775968557,
1229
+ 0.41049562682215746,
1230
+ 0.2777440873256519,
1231
+ 0.1983575489576753
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 1781,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.3513418261131799,
1237
+ "score": 0.3513418261131799,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.3037866384367266,
1240
+ "score_ci_high": 0.39510754604511705,
1241
+ "sacrebleu_ci_low": 0.3037866384367266,
1242
+ "sacrebleu_ci_high": 0.39510754604511705
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1072,
1248
+ 530,
1249
+ 303,
1250
+ 175
1251
+ ],
1252
+ "totals": [
1253
+ 1809,
1254
+ 1743,
1255
+ 1677,
1256
+ 1611
1257
+ ],
1258
+ "precisions": [
1259
+ 0.5925925925925926,
1260
+ 0.30407343660355707,
1261
+ 0.18067978533094814,
1262
+ 0.10862818125387959
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 1809,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.24386343888275555,
1268
+ "score": 0.24386343888275555,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.2168231022955434,
1271
+ "score_ci_high": 0.2798922379137682,
1272
+ "sacrebleu_ci_low": 0.2168231022955434,
1273
+ "sacrebleu_ci_high": 0.2798922379137682
1274
+ },
1275
+ "score": 0.22997684790035774,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.3447885069799008,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
results/bluebench/2025-06-19T18-10-05_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-19T22:09:59.730715Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-3-2-3b-instruct,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/meta-llama/llama-3-2-3b-instruct",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.5666666666666667,
180
+ "accuracy_ci_low": 0.4666666666666667,
181
+ "accuracy_ci_high": 0.6777777777777778,
182
+ "score_name": "accuracy",
183
+ "score": 0.5666666666666667,
184
+ "score_ci_high": 0.6777777777777778,
185
+ "score_ci_low": 0.4666666666666667,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.7333333333333333,
190
+ "accuracy_ci_low": 0.6333333333333333,
191
+ "accuracy_ci_high": 0.8222222222222222,
192
+ "score_name": "accuracy",
193
+ "score": 0.7333333333333333,
194
+ "score_ci_high": 0.8222222222222222,
195
+ "score_ci_low": 0.6333333333333333,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.8222222222222222,
200
+ "accuracy_ci_low": 0.7444444444444445,
201
+ "accuracy_ci_high": 0.9,
202
+ "score_name": "accuracy",
203
+ "score": 0.8222222222222222,
204
+ "score_ci_high": 0.9,
205
+ "score_ci_low": 0.7444444444444445,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 0.6444444444444445,
210
+ "accuracy_ci_low": 0.5333333333333333,
211
+ "accuracy_ci_high": 0.7333333333333333,
212
+ "score_name": "accuracy",
213
+ "score": 0.6444444444444445,
214
+ "score_ci_high": 0.7333333333333333,
215
+ "score_ci_low": 0.5333333333333333,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.6888888888888889,
220
+ "accuracy_ci_low": 0.5888888888888889,
221
+ "accuracy_ci_high": 0.7777777777777778,
222
+ "score_name": "accuracy",
223
+ "score": 0.6888888888888889,
224
+ "score_ci_high": 0.7777777777777778,
225
+ "score_ci_low": 0.5888888888888889,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.8111111111111111,
230
+ "accuracy_ci_low": 0.7222222222222222,
231
+ "accuracy_ci_high": 0.8777777777777778,
232
+ "score_name": "accuracy",
233
+ "score": 0.8111111111111111,
234
+ "score_ci_high": 0.8777777777777778,
235
+ "score_ci_low": 0.7222222222222222,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.7222222222222222,
240
+ "accuracy_ci_low": 0.6111111111111112,
241
+ "accuracy_ci_high": 0.8,
242
+ "score_name": "accuracy",
243
+ "score": 0.7222222222222222,
244
+ "score_ci_high": 0.8,
245
+ "score_ci_low": 0.6111111111111112,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.7111111111111111,
250
+ "accuracy_ci_low": 0.6111111111111112,
251
+ "accuracy_ci_high": 0.8,
252
+ "score_name": "accuracy",
253
+ "score": 0.7111111111111111,
254
+ "score_ci_high": 0.8,
255
+ "score_ci_low": 0.6111111111111112,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.7111111111111111,
260
+ "accuracy_ci_low": 0.6111111111111112,
261
+ "accuracy_ci_high": 0.8,
262
+ "score_name": "accuracy",
263
+ "score": 0.7111111111111111,
264
+ "score_ci_high": 0.8,
265
+ "score_ci_low": 0.6111111111111112,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.7666666666666667,
270
+ "accuracy_ci_low": 0.6666666666666666,
271
+ "accuracy_ci_high": 0.8444444444444444,
272
+ "score_name": "accuracy",
273
+ "score": 0.7666666666666667,
274
+ "score_ci_high": 0.8444444444444444,
275
+ "score_ci_low": 0.6666666666666666,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8,
280
+ "accuracy_ci_low": 0.7111111111111111,
281
+ "accuracy_ci_high": 0.8777777777777778,
282
+ "score_name": "accuracy",
283
+ "score": 0.8,
284
+ "score_ci_high": 0.8777777777777778,
285
+ "score_ci_low": 0.7111111111111111,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.7252525252525253,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.5,
296
+ "score": 0.5,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.5,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.5026737967914439,
307
+ "f1_Organization": 0.2875,
308
+ "f1_Location": 0.28571428571428575,
309
+ "f1_macro": 0.35862936083524316,
310
+ "recall_macro": 0.3171773628661296,
311
+ "precision_macro": 0.4188335014421971,
312
+ "in_classes_support": 0.7664783427495292,
313
+ "f1_micro": 0.32954545454545453,
314
+ "recall_micro": 0.3314285714285714,
315
+ "precision_micro": 0.327683615819209,
316
+ "score": 0.32954545454545453,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.28111538926666857,
319
+ "score_ci_high": 0.37977770945501865,
320
+ "f1_micro_ci_low": 0.28111538926666857,
321
+ "f1_micro_ci_high": 0.37977770945501865
322
+ },
323
+ "score": 0.32954545454545453,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.4084507042253521,
330
+ "accuracy_ci_low": 0.30985915492957744,
331
+ "accuracy_ci_high": 0.5211267605633803,
332
+ "score_name": "accuracy",
333
+ "score": 0.4084507042253521,
334
+ "score_ci_high": 0.5211267605633803,
335
+ "score_ci_low": 0.30985915492957744,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.2535211267605634,
340
+ "accuracy_ci_low": 0.15492957746478872,
341
+ "accuracy_ci_high": 0.36619718309859156,
342
+ "score_name": "accuracy",
343
+ "score": 0.2535211267605634,
344
+ "score_ci_high": 0.36619718309859156,
345
+ "score_ci_low": 0.15492957746478872,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.19718309859154928,
350
+ "accuracy_ci_low": 0.11267605633802817,
351
+ "accuracy_ci_high": 0.29577464788732394,
352
+ "score_name": "accuracy",
353
+ "score": 0.19718309859154928,
354
+ "score_ci_high": 0.29577464788732394,
355
+ "score_ci_low": 0.11267605633802817,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.29577464788732394,
360
+ "accuracy_ci_low": 0.19718309859154928,
361
+ "accuracy_ci_high": 0.39436619718309857,
362
+ "score_name": "accuracy",
363
+ "score": 0.29577464788732394,
364
+ "score_ci_high": 0.39436619718309857,
365
+ "score_ci_low": 0.19718309859154928,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.4788732394366197,
370
+ "accuracy_ci_low": 0.36619718309859156,
371
+ "accuracy_ci_high": 0.5915492957746479,
372
+ "score_name": "accuracy",
373
+ "score": 0.4788732394366197,
374
+ "score_ci_high": 0.5915492957746479,
375
+ "score_ci_low": 0.36619718309859156,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.19718309859154928,
380
+ "accuracy_ci_low": 0.11267605633802817,
381
+ "accuracy_ci_high": 0.29577464788732394,
382
+ "score_name": "accuracy",
383
+ "score": 0.19718309859154928,
384
+ "score_ci_high": 0.29577464788732394,
385
+ "score_ci_low": 0.11267605633802817,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.29577464788732394,
390
+ "accuracy_ci_low": 0.19718309859154928,
391
+ "accuracy_ci_high": 0.4225352112676056,
392
+ "score_name": "accuracy",
393
+ "score": 0.29577464788732394,
394
+ "score_ci_high": 0.4225352112676056,
395
+ "score_ci_low": 0.19718309859154928,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.30985915492957744,
400
+ "accuracy_ci_low": 0.2112676056338028,
401
+ "accuracy_ci_high": 0.4225352112676056,
402
+ "score_name": "accuracy",
403
+ "score": 0.30985915492957744,
404
+ "score_ci_high": 0.4225352112676056,
405
+ "score_ci_low": 0.2112676056338028,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.22535211267605634,
410
+ "accuracy_ci_low": 0.14084507042253522,
411
+ "accuracy_ci_high": 0.323943661971831,
412
+ "score_name": "accuracy",
413
+ "score": 0.22535211267605634,
414
+ "score_ci_high": 0.323943661971831,
415
+ "score_ci_low": 0.14084507042253522,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.11267605633802817,
420
+ "accuracy_ci_low": 0.056338028169014086,
421
+ "accuracy_ci_high": 0.19718309859154928,
422
+ "score_name": "accuracy",
423
+ "score": 0.11267605633802817,
424
+ "score_ci_high": 0.19718309859154928,
425
+ "score_ci_low": 0.056338028169014086,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.23943661971830985,
430
+ "accuracy_ci_low": 0.15492957746478872,
431
+ "accuracy_ci_high": 0.352112676056338,
432
+ "score_name": "accuracy",
433
+ "score": 0.23943661971830985,
434
+ "score_ci_high": 0.352112676056338,
435
+ "score_ci_low": 0.15492957746478872,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.2676056338028169,
440
+ "accuracy_ci_low": 0.16901408450704225,
441
+ "accuracy_ci_high": 0.38028169014084506,
442
+ "score_name": "accuracy",
443
+ "score": 0.2676056338028169,
444
+ "score_ci_high": 0.38028169014084506,
445
+ "score_ci_low": 0.16901408450704225,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.15492957746478872,
450
+ "accuracy_ci_low": 0.08450704225352113,
451
+ "accuracy_ci_high": 0.2535211267605634,
452
+ "score_name": "accuracy",
453
+ "score": 0.15492957746478872,
454
+ "score_ci_high": 0.2535211267605634,
455
+ "score_ci_low": 0.08450704225352113,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.49295774647887325,
460
+ "accuracy_ci_low": 0.38028169014084506,
461
+ "accuracy_ci_high": 0.6197183098591549,
462
+ "score_name": "accuracy",
463
+ "score": 0.49295774647887325,
464
+ "score_ci_high": 0.6197183098591549,
465
+ "score_ci_low": 0.38028169014084506,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.2806841046277666,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.322066985645933,
475
+ "f1_suggestive": 0.2,
476
+ "f1_descriptive": 0.4,
477
+ "f1_generic": 0.3157894736842105,
478
+ "f1_arbitrary": 0.45454545454545453,
479
+ "f1_fanciful": 0.24,
480
+ "f1_macro_ci_low": 0.2326130794928424,
481
+ "f1_macro_ci_high": 0.4327896628836512,
482
+ "score_name": "f1_micro",
483
+ "score": 0.34523809523809523,
484
+ "score_ci_high": 0.44408416032543374,
485
+ "score_ci_low": 0.25,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.3411764705882353,
488
+ "accuracy_ci_low": 0.24705882352941178,
489
+ "accuracy_ci_high": 0.43529411764705883,
490
+ "f1_micro": 0.34523809523809523,
491
+ "f1_micro_ci_low": 0.25,
492
+ "f1_micro_ci_high": 0.44408416032543374
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.4862053369516056,
496
+ "f1_no": 0.48484848484848486,
497
+ "f1_yes": 0.48756218905472637,
498
+ "f1_macro_ci_low": 0.4185718876526183,
499
+ "f1_macro_ci_high": 0.5583302726222448,
500
+ "score_name": "f1_micro",
501
+ "score": 0.48621553884711777,
502
+ "score_ci_high": 0.555,
503
+ "score_ci_low": 0.41708542713567837,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.485,
506
+ "accuracy_ci_low": 0.415,
507
+ "accuracy_ci_high": 0.555,
508
+ "f1_micro": 0.48621553884711777,
509
+ "f1_micro_ci_low": 0.41708542713567837,
510
+ "f1_micro_ci_high": 0.555
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.21190211834221687,
514
+ "f1_conclusion": 0.14634146341463414,
515
+ "f1_analysis": 0.3673469387755102,
516
+ "f1_decree": 0.07692307692307693,
517
+ "f1_issue": 0.2631578947368421,
518
+ "f1_facts": 0.13333333333333333,
519
+ "f1_procedural history": 0.12121212121212122,
520
+ "f1_rule": 0.375,
521
+ "f1_macro_ci_low": 0.16201147151923767,
522
+ "f1_macro_ci_high": 0.27835110124455087,
523
+ "score_name": "f1_micro",
524
+ "score": 0.24427480916030533,
525
+ "score_ci_high": 0.3110332844595539,
526
+ "score_ci_low": 0.18933051276149385,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.24,
529
+ "accuracy_ci_low": 0.185,
530
+ "accuracy_ci_high": 0.305,
531
+ "f1_micro": 0.24427480916030533,
532
+ "f1_micro_ci_low": 0.18933051276149385,
533
+ "f1_micro_ci_high": 0.3110332844595539
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5140298832430668,
537
+ "f1_yes": 0.541871921182266,
538
+ "f1_no": 0.4861878453038674,
539
+ "f1_macro_ci_low": 0.44358226014090585,
540
+ "f1_macro_ci_high": 0.5836656602180865,
541
+ "score_name": "f1_micro",
542
+ "score": 0.515625,
543
+ "score_ci_high": 0.583858269920324,
544
+ "score_ci_low": 0.4443197729294639,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.495,
547
+ "accuracy_ci_low": 0.43,
548
+ "accuracy_ci_high": 0.565,
549
+ "f1_micro": 0.515625,
550
+ "f1_micro_ci_low": 0.4443197729294639,
551
+ "f1_micro_ci_high": 0.583858269920324
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.726027397260274,
555
+ "f1_yes": 0.7123287671232876,
556
+ "f1_no": 0.7397260273972602,
557
+ "f1_macro_ci_low": 0.6205412546681461,
558
+ "f1_macro_ci_high": 0.8095667611328509,
559
+ "score_name": "f1_micro",
560
+ "score": 0.726027397260274,
561
+ "score_ci_high": 0.8079470198675497,
562
+ "score_ci_low": 0.6153846153846154,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.6235294117647059,
565
+ "accuracy_ci_low": 0.5058823529411764,
566
+ "accuracy_ci_high": 0.7176470588235294,
567
+ "f1_micro": 0.726027397260274,
568
+ "f1_micro_ci_low": 0.6153846153846154,
569
+ "f1_micro_ci_high": 0.8079470198675497
570
+ },
571
+ "score": 0.46347616810115844,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.3839629098938318,
578
+ "f1_cars": 0.6804123711340206,
579
+ "f1_windows x": 0.03125,
580
+ "f1_atheism": 0.48,
581
+ "f1_christianity": 0.425,
582
+ "f1_religion": 0.18461538461538463,
583
+ "f1_medicine": 0.6376811594202898,
584
+ "f1_computer graphics": 0.27979274611398963,
585
+ "f1_microsoft windows": 0.35294117647058826,
586
+ "f1_middle east": 0.12244897959183673,
587
+ "f1_politics": 0.26666666666666666,
588
+ "f1_motorcycles": 0.47619047619047616,
589
+ "f1_mac hardware": 0.14492753623188406,
590
+ "f1_pc hardware": 0.358974358974359,
591
+ "f1_for sale": 0.3018867924528302,
592
+ "f1_guns": 0.2,
593
+ "f1_baseball": 0.8130081300813008,
594
+ "f1_space": 0.5194805194805194,
595
+ "f1_cryptography": 0.3466666666666667,
596
+ "f1_electronics": 0.41025641025641024,
597
+ "f1_hockey": 0.6470588235294118,
598
+ "f1_macro_ci_low": 0.35856171946837523,
599
+ "f1_macro_ci_high": 0.41714627316225344,
600
+ "score_name": "f1_micro",
601
+ "score": 0.4028352037802717,
602
+ "score_ci_high": 0.433682467300905,
603
+ "score_ci_low": 0.3727273654547539,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.341,
606
+ "accuracy_ci_low": 0.312,
607
+ "accuracy_ci_high": 0.369,
608
+ "f1_micro": 0.4028352037802717,
609
+ "f1_micro_ci_low": 0.3727273654547539,
610
+ "f1_micro_ci_high": 0.433682467300905
611
+ },
612
+ "score": 0.4028352037802717,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.522130366066412,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9174311926605505,
620
+ "f1_credit card or prepaid card": 0.2619047619047619,
621
+ "f1_debt collection": 0.47904191616766467,
622
+ "f1_checking or savings account": 0.6534653465346535,
623
+ "f1_money transfer or virtual currency or money service": 0.56,
624
+ "f1_vehicle loan or lease": 0.23076923076923078,
625
+ "f1_mortgage": 0.7037037037037037,
626
+ "f1_payday loan or title loan or personal loan": 0.14285714285714285,
627
+ "f1_student loan": 0.75,
628
+ "f1_macro_ci_low": 0.47475792993209676,
629
+ "f1_macro_ci_high": 0.5842900552170582,
630
+ "score_name": "f1_micro",
631
+ "score": 0.8006150691952845,
632
+ "score_ci_high": 0.8235944353763129,
633
+ "score_ci_low": 0.7741691905584849,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.781,
636
+ "accuracy_ci_low": 0.753,
637
+ "accuracy_ci_high": 0.805,
638
+ "f1_micro": 0.8006150691952845,
639
+ "f1_micro_ci_low": 0.7741691905584849,
640
+ "f1_micro_ci_high": 0.8235944353763129
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.6094652526866444,
644
+ "f1_mortgages and loans": 0.7821229050279329,
645
+ "f1_credit card": 0.6666666666666666,
646
+ "f1_debt collection": 0.5684210526315789,
647
+ "f1_retail banking": 0.2912621359223301,
648
+ "f1_credit reporting": 0.7388535031847133,
649
+ "f1_macro_ci_low": 0.5670634281009035,
650
+ "f1_macro_ci_high": 0.6527536805294223,
651
+ "score_name": "f1_micro",
652
+ "score": 0.6524390243902439,
653
+ "score_ci_high": 0.6904276985743381,
654
+ "score_ci_low": 0.6066261962892265,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.642,
657
+ "accuracy_ci_low": 0.594,
658
+ "accuracy_ci_high": 0.68,
659
+ "f1_micro": 0.6524390243902439,
660
+ "f1_micro_ci_low": 0.6066261962892265,
661
+ "f1_micro_ci_high": 0.6904276985743381
662
+ },
663
+ "score": 0.7265270467927643,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "program_accuracy": 0.038,
671
+ "score": 0.038,
672
+ "score_name": "program_accuracy",
673
+ "execution_accuracy": 0.03,
674
+ "program_accuracy_ci_low": 0.027702359114314717,
675
+ "program_accuracy_ci_high": 0.05038214389818779,
676
+ "score_ci_low": 0.027702359114314717,
677
+ "score_ci_high": 0.05038214389818779,
678
+ "execution_accuracy_ci_low": 0.021,
679
+ "execution_accuracy_ci_high": 0.042
680
+ },
681
+ "score": 0.038,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.3238378212061128,
688
+ "recall": 0.5112460699696839,
689
+ "f1": 0.32731883927753985,
690
+ "precision_ci_low": 0.3032630188662264,
691
+ "precision_ci_high": 0.34255213352234126,
692
+ "recall_ci_low": 0.4946874887919962,
693
+ "recall_ci_high": 0.5283045722843877,
694
+ "f1_ci_low": 0.3123215475894989,
695
+ "f1_ci_high": 0.3436240931728296,
696
+ "score_name": "f1",
697
+ "score": 0.32731883927753985,
698
+ "score_ci_high": 0.3436240931728296,
699
+ "score_ci_low": 0.3123215475894989,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5872805261611939,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6561146193246047,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5541991981615623,
704
+ "faithfullness_f1_token_overlap": 0.2552259281757564,
705
+ "faithfullness_recall_token_overlap": 0.18820051905597926,
706
+ "faithfullness_precision_token_overlap": 0.5563191178123087,
707
+ "correctness_f1_token_overlap": 0.32731883927753985,
708
+ "correctness_recall_token_overlap": 0.5112460699696839,
709
+ "correctness_precision_token_overlap": 0.3238378212061128
710
+ },
711
+ "score": 0.32731883927753985,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.435,
718
+ "accuracy_ci_low": 0.403,
719
+ "accuracy_ci_high": 0.4660514308680489,
720
+ "score_name": "accuracy",
721
+ "score": 0.435,
722
+ "score_ci_high": 0.4660514308680489,
723
+ "score_ci_low": 0.403,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.744,
728
+ "accuracy_ci_low": 0.704,
729
+ "accuracy_ci_high": 0.782,
730
+ "score_name": "accuracy",
731
+ "score": 0.744,
732
+ "score_ci_high": 0.782,
733
+ "score_ci_low": 0.704,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.5895,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 0.0,
743
+ "severity_high": 0.0,
744
+ "severity_medium": 0.0,
745
+ "severity_low": 100.0,
746
+ "category_harmful_info": 0.8374238369454232,
747
+ "category_deception": 0.8230129361453682,
748
+ "category_explicit_content": 0.8649854577249951,
749
+ "category_violence": 0.8130838501311484,
750
+ "category_discrimination": 0.8325134645777916,
751
+ "category_substance_abuse": 0.9021757920583088,
752
+ "category_pii": 0.8319368116833545,
753
+ "safety": 0.8330806816902219,
754
+ "safety_ci_low": 0.8160996919801022,
755
+ "safety_ci_high": 0.8485437387510895,
756
+ "score_name": "safety",
757
+ "score": 0.8330806816902219,
758
+ "score_ci_high": 0.8485437387510895,
759
+ "score_ci_low": 0.8160996919801022,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.8330806816902219,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rouge2": 0.20526331379083082,
770
+ "rougeLsum": 0.3552440537707598,
771
+ "rougeL": 0.2898741985263412,
772
+ "score": 0.2898741985263412,
773
+ "score_name": "rougeL",
774
+ "rouge1": 0.41514442247584377,
775
+ "rouge2_ci_low": 0.19811392792291177,
776
+ "rouge2_ci_high": 0.21345928409773357,
777
+ "rougeLsum_ci_low": 0.34590687620199956,
778
+ "rougeLsum_ci_high": 0.3636463972631658,
779
+ "rougeL_ci_low": 0.2825891854116913,
780
+ "rougeL_ci_high": 0.29754587598623977,
781
+ "score_ci_low": 0.2825891854116913,
782
+ "score_ci_high": 0.29754587598623977,
783
+ "rouge1_ci_low": 0.404637721565675,
784
+ "rouge1_ci_high": 0.4244973791646393
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rouge2": 0.01759477613392835,
789
+ "rougeLsum": 0.10341637065855062,
790
+ "rougeL": 0.09030341343927119,
791
+ "score": 0.09030341343927119,
792
+ "score_name": "rougeL",
793
+ "rouge1": 0.12511971491959425,
794
+ "rouge2_ci_low": 0.0157175663284522,
795
+ "rouge2_ci_high": 0.01952934654149148,
796
+ "rougeLsum_ci_low": 0.099113259070034,
797
+ "rougeLsum_ci_high": 0.10780067044910371,
798
+ "rougeL_ci_low": 0.08652503643426336,
799
+ "rougeL_ci_high": 0.09415506056724576,
800
+ "score_ci_low": 0.08652503643426336,
801
+ "score_ci_high": 0.09415506056724576,
802
+ "rouge1_ci_low": 0.11934211041104513,
803
+ "rouge1_ci_high": 0.1306891027165195
804
+ },
805
+ "score": 0.1900888059828062,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 1145,
814
+ 640,
815
+ 404,
816
+ 258
817
+ ],
818
+ "totals": [
819
+ 1853,
820
+ 1787,
821
+ 1721,
822
+ 1655
823
+ ],
824
+ "precisions": [
825
+ 0.6179168915272532,
826
+ 0.35814213766088415,
827
+ 0.2347472399767577,
828
+ 0.15589123867069488
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 1853,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.2999866463267908,
834
+ "score": 0.2999866463267908,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.25379441814797343,
837
+ "score_ci_high": 0.34071904065425035,
838
+ "sacrebleu_ci_low": 0.25379441814797343,
839
+ "sacrebleu_ci_high": 0.34071904065425035
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1252,
845
+ 756,
846
+ 497,
847
+ 327
848
+ ],
849
+ "totals": [
850
+ 1813,
851
+ 1747,
852
+ 1681,
853
+ 1615
854
+ ],
855
+ "precisions": [
856
+ 0.6905681191395477,
857
+ 0.4327418431597023,
858
+ 0.2956573468173706,
859
+ 0.20247678018575851
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 1813,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.3657209415128666,
865
+ "score": 0.3657209415128666,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.3364370770791262,
868
+ "score_ci_high": 0.40451448818750857,
869
+ "sacrebleu_ci_low": 0.3364370770791262,
870
+ "sacrebleu_ci_high": 0.40451448818750857
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 613,
876
+ 215,
877
+ 85,
878
+ 31
879
+ ],
880
+ "totals": [
881
+ 1681,
882
+ 1615,
883
+ 1549,
884
+ 1483
885
+ ],
886
+ "precisions": [
887
+ 0.36466389054134446,
888
+ 0.1331269349845201,
889
+ 0.05487411233053583,
890
+ 0.020903573836817263
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 1681,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.08638467153981859,
896
+ "score": 0.08638467153981859,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.07076583616200617,
899
+ "score_ci_high": 0.10779016383654101,
900
+ "sacrebleu_ci_low": 0.07076583616200617,
901
+ "sacrebleu_ci_high": 0.10779016383654101
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 1062,
907
+ 552,
908
+ 321,
909
+ 192
910
+ ],
911
+ "totals": [
912
+ 1791,
913
+ 1725,
914
+ 1659,
915
+ 1593
916
+ ],
917
+ "precisions": [
918
+ 0.592964824120603,
919
+ 0.32,
920
+ 0.19349005424954793,
921
+ 0.12052730696798493
922
+ ],
923
+ "bp": 0.9757320386302776,
924
+ "sys_len": 1791,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.25165833423579964,
927
+ "score": 0.25165833423579964,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.21420916937636764,
930
+ "score_ci_high": 0.29216503924411175,
931
+ "sacrebleu_ci_low": 0.21420916937636764,
932
+ "sacrebleu_ci_high": 0.29216503924411175
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1358,
938
+ 892,
939
+ 643,
940
+ 470
941
+ ],
942
+ "totals": [
943
+ 2016,
944
+ 1950,
945
+ 1884,
946
+ 1818
947
+ ],
948
+ "precisions": [
949
+ 0.6736111111111112,
950
+ 0.45743589743589746,
951
+ 0.3412951167728238,
952
+ 0.2585258525852585
953
+ ],
954
+ "bp": 0.9745361636262269,
955
+ "sys_len": 2016,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.395723047050928,
958
+ "score": 0.395723047050928,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.3541486960862157,
961
+ "score_ci_high": 0.4412388057124849,
962
+ "sacrebleu_ci_low": 0.3541486960862157,
963
+ "sacrebleu_ci_high": 0.4412388057124849
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 1152,
969
+ 512,
970
+ 255,
971
+ 136
972
+ ],
973
+ "totals": [
974
+ 2575,
975
+ 2509,
976
+ 2443,
977
+ 2377
978
+ ],
979
+ "precisions": [
980
+ 0.44737864077669903,
981
+ 0.20406536468712633,
982
+ 0.10437986082685223,
983
+ 0.057214976861590244
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 2575,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.152806822981997,
989
+ "score": 0.152806822981997,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.13380815870985033,
992
+ "score_ci_high": 0.17371873213476355,
993
+ "sacrebleu_ci_low": 0.13380815870985033,
994
+ "sacrebleu_ci_high": 0.17371873213476355
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1330,
1000
+ 860,
1001
+ 596,
1002
+ 421
1003
+ ],
1004
+ "totals": [
1005
+ 1885,
1006
+ 1819,
1007
+ 1753,
1008
+ 1687
1009
+ ],
1010
+ "precisions": [
1011
+ 0.7055702917771883,
1012
+ 0.4727872457394172,
1013
+ 0.33998859098687967,
1014
+ 0.24955542382928275
1015
+ ],
1016
+ "bp": 0.9836888676493653,
1017
+ "sys_len": 1885,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.4034754414236742,
1020
+ "score": 0.4034754414236742,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.35937407097652574,
1023
+ "score_ci_high": 0.4406572095477221,
1024
+ "sacrebleu_ci_low": 0.35937407097652574,
1025
+ "sacrebleu_ci_high": 0.4406572095477221
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 1195,
1031
+ 710,
1032
+ 449,
1033
+ 298
1034
+ ],
1035
+ "totals": [
1036
+ 1898,
1037
+ 1832,
1038
+ 1766,
1039
+ 1700
1040
+ ],
1041
+ "precisions": [
1042
+ 0.6296101159114857,
1043
+ 0.3875545851528384,
1044
+ 0.25424688561721404,
1045
+ 0.17529411764705885
1046
+ ],
1047
+ "bp": 0.9734874071636694,
1048
+ "sys_len": 1898,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.3143672009255487,
1051
+ "score": 0.3143672009255487,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.28824612092656865,
1054
+ "score_ci_high": 0.3595680168847451,
1055
+ "sacrebleu_ci_low": 0.28824612092656865,
1056
+ "sacrebleu_ci_high": 0.3595680168847451
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1185,
1062
+ 632,
1063
+ 363,
1064
+ 210
1065
+ ],
1066
+ "totals": [
1067
+ 1964,
1068
+ 1898,
1069
+ 1832,
1070
+ 1766
1071
+ ],
1072
+ "precisions": [
1073
+ 0.6033604887983707,
1074
+ 0.332982086406744,
1075
+ 0.19814410480349345,
1076
+ 0.11891279728199321
1077
+ ],
1078
+ "bp": 0.9340473875491699,
1079
+ "sys_len": 1964,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.24500265020019107,
1082
+ "score": 0.24500265020019107,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.21966892731983978,
1085
+ "score_ci_high": 0.27134777294522555,
1086
+ "sacrebleu_ci_low": 0.21966892731983978,
1087
+ "sacrebleu_ci_high": 0.27134777294522555
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1271,
1093
+ 807,
1094
+ 551,
1095
+ 380
1096
+ ],
1097
+ "totals": [
1098
+ 1794,
1099
+ 1728,
1100
+ 1662,
1101
+ 1596
1102
+ ],
1103
+ "precisions": [
1104
+ 0.7084726867335562,
1105
+ 0.46701388888888884,
1106
+ 0.3315282791817088,
1107
+ 0.2380952380952381
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 1794,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.40200462477302346,
1113
+ "score": 0.40200462477302346,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.36426910997213796,
1116
+ "score_ci_high": 0.44904824239366326,
1117
+ "sacrebleu_ci_low": 0.36426910997213796,
1118
+ "sacrebleu_ci_high": 0.44904824239366326
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 989,
1124
+ 453,
1125
+ 243,
1126
+ 137
1127
+ ],
1128
+ "totals": [
1129
+ 1812,
1130
+ 1746,
1131
+ 1680,
1132
+ 1614
1133
+ ],
1134
+ "precisions": [
1135
+ 0.5458057395143487,
1136
+ 0.25945017182130586,
1137
+ 0.14464285714285713,
1138
+ 0.0848822800495663
1139
+ ],
1140
+ "bp": 1.0,
1141
+ "sys_len": 1812,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.20419801799597426,
1144
+ "score": 0.20419801799597426,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.18116496265849946,
1147
+ "score_ci_high": 0.25247298942958346,
1148
+ "sacrebleu_ci_low": 0.18116496265849946,
1149
+ "sacrebleu_ci_high": 0.25247298942958346
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 956,
1155
+ 417,
1156
+ 215,
1157
+ 111
1158
+ ],
1159
+ "totals": [
1160
+ 1742,
1161
+ 1676,
1162
+ 1610,
1163
+ 1544
1164
+ ],
1165
+ "precisions": [
1166
+ 0.5487944890929966,
1167
+ 0.24880668257756564,
1168
+ 0.13354037267080746,
1169
+ 0.07189119170984455
1170
+ ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 1742,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.19027862841650364,
1175
+ "score": 0.19027862841650364,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.15954407468337273,
1178
+ "score_ci_high": 0.2316279594911264,
1179
+ "sacrebleu_ci_low": 0.15954407468337273,
1180
+ "sacrebleu_ci_high": 0.2316279594911264
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1274,
1186
+ 831,
1187
+ 594,
1188
+ 443
1189
+ ],
1190
+ "totals": [
1191
+ 1787,
1192
+ 1721,
1193
+ 1655,
1194
+ 1589
1195
+ ],
1196
+ "precisions": [
1197
+ 0.7129266927811976,
1198
+ 0.48285880302149914,
1199
+ 0.3589123867069486,
1200
+ 0.27879169288860917
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 1787,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.43080757024308985,
1206
+ "score": 0.43080757024308985,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.3788088705992759,
1209
+ "score_ci_high": 0.48020746054863056,
1210
+ "sacrebleu_ci_low": 0.3788088705992759,
1211
+ "sacrebleu_ci_high": 0.48020746054863056
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1296,
1217
+ 820,
1218
+ 558,
1219
+ 381
1220
+ ],
1221
+ "totals": [
1222
+ 1844,
1223
+ 1778,
1224
+ 1712,
1225
+ 1646
1226
+ ],
1227
+ "precisions": [
1228
+ 0.7028199566160521,
1229
+ 0.4611923509561305,
1230
+ 0.3259345794392523,
1231
+ 0.23147023086269744
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 1844,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.3954466865992584,
1237
+ "score": 0.3954466865992584,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.3534147781069182,
1240
+ "score_ci_high": 0.4360974663884546,
1241
+ "sacrebleu_ci_low": 0.3534147781069182,
1242
+ "sacrebleu_ci_high": 0.4360974663884546
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1125,
1248
+ 590,
1249
+ 351,
1250
+ 212
1251
+ ],
1252
+ "totals": [
1253
+ 1856,
1254
+ 1790,
1255
+ 1724,
1256
+ 1658
1257
+ ],
1258
+ "precisions": [
1259
+ 0.6061422413793103,
1260
+ 0.3296089385474861,
1261
+ 0.20359628770301627,
1262
+ 0.1278648974668275
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 1856,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.26854908700533703,
1268
+ "score": 0.26854908700533703,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.2352580605615156,
1271
+ "score_ci_high": 0.30717127715915404,
1272
+ "sacrebleu_ci_low": 0.2352580605615156,
1273
+ "sacrebleu_ci_high": 0.30717127715915404
1274
+ },
1275
+ "score": 0.29376069141538674,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.43846688626660735,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
results/bluebench/2025-06-19T20-10-50_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-20T00:10:45.998753Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/meta-llama/llama-4-maverick-17b-128e-instruct-fp8,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.8888888888888888,
180
+ "accuracy_ci_low": 0.8111111111111111,
181
+ "accuracy_ci_high": 0.9444444444444444,
182
+ "score_name": "accuracy",
183
+ "score": 0.8888888888888888,
184
+ "score_ci_high": 0.9444444444444444,
185
+ "score_ci_low": 0.8111111111111111,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.9777777777777777,
190
+ "accuracy_ci_low": 0.9222222222222223,
191
+ "accuracy_ci_high": 1.0,
192
+ "score_name": "accuracy",
193
+ "score": 0.9777777777777777,
194
+ "score_ci_high": 1.0,
195
+ "score_ci_low": 0.9222222222222223,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 1.0,
200
+ "accuracy_ci_low": 1.0,
201
+ "accuracy_ci_high": 1.0,
202
+ "score_name": "accuracy",
203
+ "score": 1.0,
204
+ "score_ci_high": 1.0,
205
+ "score_ci_low": 1.0,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 1.0,
210
+ "accuracy_ci_low": 1.0,
211
+ "accuracy_ci_high": 1.0,
212
+ "score_name": "accuracy",
213
+ "score": 1.0,
214
+ "score_ci_high": 1.0,
215
+ "score_ci_low": 1.0,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.9888888888888889,
220
+ "accuracy_ci_low": 0.9389750917617445,
221
+ "accuracy_ci_high": 1.0,
222
+ "score_name": "accuracy",
223
+ "score": 0.9888888888888889,
224
+ "score_ci_high": 1.0,
225
+ "score_ci_low": 0.9389750917617445,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.9888888888888889,
230
+ "accuracy_ci_low": 0.9333333333333333,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 0.9888888888888889,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 0.9333333333333333,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 1.0,
240
+ "accuracy_ci_low": 1.0,
241
+ "accuracy_ci_high": 1.0,
242
+ "score_name": "accuracy",
243
+ "score": 1.0,
244
+ "score_ci_high": 1.0,
245
+ "score_ci_low": 1.0,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 1.0,
250
+ "accuracy_ci_low": 1.0,
251
+ "accuracy_ci_high": 1.0,
252
+ "score_name": "accuracy",
253
+ "score": 1.0,
254
+ "score_ci_high": 1.0,
255
+ "score_ci_low": 1.0,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.8888888888888888,
260
+ "accuracy_ci_low": 0.8111111111111111,
261
+ "accuracy_ci_high": 0.9444444444444444,
262
+ "score_name": "accuracy",
263
+ "score": 0.8888888888888888,
264
+ "score_ci_high": 0.9444444444444444,
265
+ "score_ci_low": 0.8111111111111111,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.9888888888888889,
270
+ "accuracy_ci_low": 0.9444444444444444,
271
+ "accuracy_ci_high": 1.0,
272
+ "score_name": "accuracy",
273
+ "score": 0.9888888888888889,
274
+ "score_ci_high": 1.0,
275
+ "score_ci_low": 0.9444444444444444,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8666666666666667,
280
+ "accuracy_ci_low": 0.7888888888888889,
281
+ "accuracy_ci_high": 0.9333333333333333,
282
+ "score_name": "accuracy",
283
+ "score": 0.8666666666666667,
284
+ "score_ci_high": 0.9333333333333333,
285
+ "score_ci_low": 0.7888888888888889,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.9626262626262626,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.5,
296
+ "score": 0.5,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.5,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.5662337662337662,
307
+ "f1_Organization": 0.34810126582278483,
308
+ "f1_Location": 0.4031620553359684,
309
+ "f1_macro": 0.4391656957975065,
310
+ "recall_macro": 0.39723251248500296,
311
+ "precision_macro": 0.49375910707000065,
312
+ "in_classes_support": 0.5212636695018227,
313
+ "f1_micro": 0.31899109792284863,
314
+ "recall_micro": 0.4095238095238095,
315
+ "precision_micro": 0.26123936816524906,
316
+ "score": 0.31899109792284863,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.277439612598807,
319
+ "score_ci_high": 0.3678125973620034,
320
+ "f1_micro_ci_low": 0.277439612598807,
321
+ "f1_micro_ci_high": 0.3678125973620034
322
+ },
323
+ "score": 0.31899109792284863,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.5915492957746479,
330
+ "accuracy_ci_low": 0.4788732394366197,
331
+ "accuracy_ci_high": 0.704225352112676,
332
+ "score_name": "accuracy",
333
+ "score": 0.5915492957746479,
334
+ "score_ci_high": 0.704225352112676,
335
+ "score_ci_low": 0.4788732394366197,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.2676056338028169,
340
+ "accuracy_ci_low": 0.16901408450704225,
341
+ "accuracy_ci_high": 0.38028169014084506,
342
+ "score_name": "accuracy",
343
+ "score": 0.2676056338028169,
344
+ "score_ci_high": 0.38028169014084506,
345
+ "score_ci_low": 0.16901408450704225,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.23943661971830985,
350
+ "accuracy_ci_low": 0.15492957746478872,
351
+ "accuracy_ci_high": 0.352112676056338,
352
+ "score_name": "accuracy",
353
+ "score": 0.23943661971830985,
354
+ "score_ci_high": 0.352112676056338,
355
+ "score_ci_low": 0.15492957746478872,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.5070422535211268,
360
+ "accuracy_ci_low": 0.39436619718309857,
361
+ "accuracy_ci_high": 0.6197183098591549,
362
+ "score_name": "accuracy",
363
+ "score": 0.5070422535211268,
364
+ "score_ci_high": 0.6197183098591549,
365
+ "score_ci_low": 0.39436619718309857,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.6901408450704225,
370
+ "accuracy_ci_low": 0.5633802816901409,
371
+ "accuracy_ci_high": 0.7887323943661971,
372
+ "score_name": "accuracy",
373
+ "score": 0.6901408450704225,
374
+ "score_ci_high": 0.7887323943661971,
375
+ "score_ci_low": 0.5633802816901409,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.39436619718309857,
380
+ "accuracy_ci_low": 0.29577464788732394,
381
+ "accuracy_ci_high": 0.5070422535211268,
382
+ "score_name": "accuracy",
383
+ "score": 0.39436619718309857,
384
+ "score_ci_high": 0.5070422535211268,
385
+ "score_ci_low": 0.29577464788732394,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.5211267605633803,
390
+ "accuracy_ci_low": 0.39436619718309857,
391
+ "accuracy_ci_high": 0.6197183098591549,
392
+ "score_name": "accuracy",
393
+ "score": 0.5211267605633803,
394
+ "score_ci_high": 0.6197183098591549,
395
+ "score_ci_low": 0.39436619718309857,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.7746478873239436,
400
+ "accuracy_ci_low": 0.6619718309859155,
401
+ "accuracy_ci_high": 0.8591549295774648,
402
+ "score_name": "accuracy",
403
+ "score": 0.7746478873239436,
404
+ "score_ci_high": 0.8591549295774648,
405
+ "score_ci_low": 0.6619718309859155,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.5774647887323944,
410
+ "accuracy_ci_low": 0.4631453652997223,
411
+ "accuracy_ci_high": 0.6901408450704225,
412
+ "score_name": "accuracy",
413
+ "score": 0.5774647887323944,
414
+ "score_ci_high": 0.6901408450704225,
415
+ "score_ci_low": 0.4631453652997223,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.2112676056338028,
420
+ "accuracy_ci_low": 0.1267605633802817,
421
+ "accuracy_ci_high": 0.323943661971831,
422
+ "score_name": "accuracy",
423
+ "score": 0.2112676056338028,
424
+ "score_ci_high": 0.323943661971831,
425
+ "score_ci_low": 0.1267605633802817,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.5633802816901409,
430
+ "accuracy_ci_low": 0.4507042253521127,
431
+ "accuracy_ci_high": 0.668060546470624,
432
+ "score_name": "accuracy",
433
+ "score": 0.5633802816901409,
434
+ "score_ci_high": 0.668060546470624,
435
+ "score_ci_low": 0.4507042253521127,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.6901408450704225,
440
+ "accuracy_ci_low": 0.5774647887323944,
441
+ "accuracy_ci_high": 0.7887323943661971,
442
+ "score_name": "accuracy",
443
+ "score": 0.6901408450704225,
444
+ "score_ci_high": 0.7887323943661971,
445
+ "score_ci_low": 0.5774647887323944,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.4084507042253521,
450
+ "accuracy_ci_low": 0.29577464788732394,
451
+ "accuracy_ci_high": 0.5211267605633803,
452
+ "score_name": "accuracy",
453
+ "score": 0.4084507042253521,
454
+ "score_ci_high": 0.5211267605633803,
455
+ "score_ci_low": 0.29577464788732394,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.647887323943662,
460
+ "accuracy_ci_low": 0.5352112676056338,
461
+ "accuracy_ci_high": 0.7605633802816901,
462
+ "score_name": "accuracy",
463
+ "score": 0.647887323943662,
464
+ "score_ci_high": 0.7605633802816901,
465
+ "score_ci_low": 0.5352112676056338,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.506036217303823,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.5831370899915895,
475
+ "f1_suggestive": 0.4827586206896552,
476
+ "f1_generic": 0.6666666666666666,
477
+ "f1_descriptive": 0.6666666666666666,
478
+ "f1_fanciful": 0.4166666666666667,
479
+ "f1_arbitrary": 0.6829268292682927,
480
+ "f1_macro_ci_low": 0.47939113487694995,
481
+ "f1_macro_ci_high": 0.6897999117090845,
482
+ "score_name": "f1_micro",
483
+ "score": 0.5987261146496815,
484
+ "score_ci_high": 0.6962025316455697,
485
+ "score_ci_low": 0.4807376602538022,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.5529411764705883,
488
+ "accuracy_ci_low": 0.43529411764705883,
489
+ "accuracy_ci_high": 0.6588235294117647,
490
+ "f1_micro": 0.5987261146496815,
491
+ "f1_micro_ci_low": 0.4807376602538022,
492
+ "f1_micro_ci_high": 0.6962025316455697
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.6763636363636364,
496
+ "f1_no": 0.7927272727272727,
497
+ "f1_yes": 0.56,
498
+ "f1_macro_ci_low": 0.6036339063806182,
499
+ "f1_macro_ci_high": 0.7468677315003386,
500
+ "score_name": "f1_micro",
501
+ "score": 0.7306666666666667,
502
+ "score_ci_high": 0.7853403141361257,
503
+ "score_ci_low": 0.6630296211830374,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.685,
506
+ "accuracy_ci_low": 0.62,
507
+ "accuracy_ci_high": 0.745,
508
+ "f1_micro": 0.7306666666666667,
509
+ "f1_micro_ci_low": 0.6630296211830374,
510
+ "f1_micro_ci_high": 0.7853403141361257
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.2642730181773706,
514
+ "f1_conclusion": 0.0625,
515
+ "f1_issue": 0.16326530612244897,
516
+ "f1_decree": 0.2,
517
+ "f1_analysis": 0.4375,
518
+ "f1_facts": 0.32558139534883723,
519
+ "f1_procedural history": 0.19047619047619047,
520
+ "f1_rule": 0.47058823529411764,
521
+ "f1_macro_ci_low": 0.20968219014642994,
522
+ "f1_macro_ci_high": 0.33253527853885895,
523
+ "score_name": "f1_micro",
524
+ "score": 0.28938906752411575,
525
+ "score_ci_high": 0.3618842117391186,
526
+ "score_ci_low": 0.22364217252396165,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.225,
529
+ "accuracy_ci_low": 0.175,
530
+ "accuracy_ci_high": 0.29,
531
+ "f1_micro": 0.28938906752411575,
532
+ "f1_micro_ci_low": 0.22364217252396165,
533
+ "f1_micro_ci_high": 0.3618842117391186
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.5830978545264259,
537
+ "f1_yes": 0.5918367346938775,
538
+ "f1_no": 0.5743589743589743,
539
+ "f1_macro_ci_low": 0.5111003138485096,
540
+ "f1_macro_ci_high": 0.6506689237239318,
541
+ "score_name": "f1_micro",
542
+ "score": 0.5831202046035806,
543
+ "score_ci_high": 0.649616368286445,
544
+ "score_ci_low": 0.5114249450573659,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.57,
547
+ "accuracy_ci_low": 0.4968446470094224,
548
+ "accuracy_ci_high": 0.635,
549
+ "f1_micro": 0.5831202046035806,
550
+ "f1_micro_ci_low": 0.5114249450573659,
551
+ "f1_micro_ci_high": 0.649616368286445
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.7776061776061776,
555
+ "f1_yes": 0.7714285714285715,
556
+ "f1_no": 0.7837837837837838,
557
+ "f1_macro_ci_low": 0.6936100514418908,
558
+ "f1_macro_ci_high": 0.8455722600304791,
559
+ "score_name": "f1_micro",
560
+ "score": 0.7777777777777778,
561
+ "score_ci_high": 0.8435374149659864,
562
+ "score_ci_low": 0.6950354609929078,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.6588235294117647,
565
+ "accuracy_ci_low": 0.5647058823529412,
566
+ "accuracy_ci_high": 0.7411764705882353,
567
+ "f1_micro": 0.7777777777777778,
568
+ "f1_micro_ci_low": 0.6950354609929078,
569
+ "f1_micro_ci_high": 0.8435374149659864
570
+ },
571
+ "score": 0.5959359662443645,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.6167905620006249,
578
+ "f1_cars": 0.8089887640449438,
579
+ "f1_windows x": 0.06153846153846154,
580
+ "f1_computer graphics": 0.5510204081632653,
581
+ "f1_atheism": 0.1951219512195122,
582
+ "f1_christianity": 0.8288288288288288,
583
+ "f1_religion": 0.1568627450980392,
584
+ "f1_medicine": 0.8505747126436781,
585
+ "f1_microsoft windows": 0.75,
586
+ "f1_middle east": 0.6666666666666666,
587
+ "f1_motorcycles": 0.7619047619047619,
588
+ "f1_politics": 0.359375,
589
+ "f1_pc hardware": 0.6619718309859155,
590
+ "f1_mac hardware": 0.7358490566037735,
591
+ "f1_for sale": 0.5806451612903226,
592
+ "f1_guns": 0.3561643835616438,
593
+ "f1_space": 0.82,
594
+ "f1_cryptography": 0.6666666666666666,
595
+ "f1_baseball": 0.9166666666666666,
596
+ "f1_hockey": 0.9402985074626866,
597
+ "f1_electronics": 0.6666666666666666,
598
+ "f1_macro_ci_low": 0.5929180247135345,
599
+ "f1_macro_ci_high": 0.6464945617502024,
600
+ "score_name": "f1_micro",
601
+ "score": 0.661588683351469,
602
+ "score_ci_high": 0.6918918918918919,
603
+ "score_ci_low": 0.6351762173413632,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.608,
606
+ "accuracy_ci_low": 0.582,
607
+ "accuracy_ci_high": 0.6398246959343236,
608
+ "f1_micro": 0.661588683351469,
609
+ "f1_micro_ci_low": 0.6351762173413632,
610
+ "f1_micro_ci_high": 0.6918918918918919
611
+ },
612
+ "score": 0.661588683351469,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.7156339434074247,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.924907063197026,
620
+ "f1_credit card or prepaid card": 0.8,
621
+ "f1_debt collection": 0.659217877094972,
622
+ "f1_checking or savings account": 0.8070175438596491,
623
+ "f1_money transfer or virtual currency or money service": 0.6896551724137931,
624
+ "f1_student loan": 0.7741935483870968,
625
+ "f1_vehicle loan or lease": 0.625,
626
+ "f1_mortgage": 0.875,
627
+ "f1_payday loan or title loan or personal loan": 0.2857142857142857,
628
+ "f1_macro_ci_low": 0.6660333456490072,
629
+ "f1_macro_ci_high": 0.7763587756574478,
630
+ "score_name": "f1_micro",
631
+ "score": 0.863659401926001,
632
+ "score_ci_high": 0.8836978702477332,
633
+ "score_ci_low": 0.8417078870760507,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.852,
636
+ "accuracy_ci_low": 0.8286876874270778,
637
+ "accuracy_ci_high": 0.871,
638
+ "f1_micro": 0.863659401926001,
639
+ "f1_micro_ci_low": 0.8417078870760507,
640
+ "f1_micro_ci_high": 0.8836978702477332
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.7760995406721664,
644
+ "f1_mortgages and loans": 0.8539325842696629,
645
+ "f1_credit card": 0.8444444444444444,
646
+ "f1_debt collection": 0.7117117117117117,
647
+ "f1_credit reporting": 0.752851711026616,
648
+ "f1_retail banking": 0.7175572519083969,
649
+ "f1_macro_ci_low": 0.7391680922929513,
650
+ "f1_macro_ci_high": 0.8135480304798374,
651
+ "score_name": "f1_micro",
652
+ "score": 0.7741273100616016,
653
+ "score_ci_high": 0.809811768563787,
654
+ "score_ci_low": 0.7373612854039264,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.754,
657
+ "accuracy_ci_low": 0.716,
658
+ "accuracy_ci_high": 0.794,
659
+ "f1_micro": 0.7741273100616016,
660
+ "f1_micro_ci_low": 0.7373612854039264,
661
+ "f1_micro_ci_high": 0.809811768563787
662
+ },
663
+ "score": 0.8188933559938013,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "execution_accuracy": 0.162,
671
+ "program_accuracy": 0.182,
672
+ "score": 0.182,
673
+ "score_name": "program_accuracy",
674
+ "execution_accuracy_ci_low": 0.13950521872118443,
675
+ "execution_accuracy_ci_high": 0.186,
676
+ "program_accuracy_ci_low": 0.159,
677
+ "program_accuracy_ci_high": 0.207,
678
+ "score_ci_low": 0.159,
679
+ "score_ci_high": 0.207
680
+ },
681
+ "score": 0.182,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.3472291622071149,
688
+ "recall": 0.5550279207444034,
689
+ "f1": 0.36009972180369076,
690
+ "precision_ci_low": 0.3242384129530926,
691
+ "precision_ci_high": 0.37066398547692364,
692
+ "recall_ci_low": 0.5394838751494561,
693
+ "recall_ci_high": 0.5723924699685504,
694
+ "f1_ci_low": 0.3409334568421404,
695
+ "f1_ci_high": 0.379665380653375,
696
+ "score_name": "f1",
697
+ "score": 0.36009972180369076,
698
+ "score_ci_high": 0.379665380653375,
699
+ "score_ci_low": 0.3409334568421404,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6113383284211159,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6853377008934816,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5746090712646643,
704
+ "faithfullness_f1_token_overlap": 0.2924701621871153,
705
+ "faithfullness_recall_token_overlap": 0.21371625625934337,
706
+ "faithfullness_precision_token_overlap": 0.5963665736789522,
707
+ "correctness_f1_token_overlap": 0.36009972180369076,
708
+ "correctness_recall_token_overlap": 0.5550279207444034,
709
+ "correctness_precision_token_overlap": 0.3472291622071149
710
+ },
711
+ "score": 0.36009972180369076,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.56,
718
+ "accuracy_ci_low": 0.531,
719
+ "accuracy_ci_high": 0.591,
720
+ "score_name": "accuracy",
721
+ "score": 0.56,
722
+ "score_ci_high": 0.591,
723
+ "score_ci_low": 0.531,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.792,
728
+ "accuracy_ci_low": 0.754,
729
+ "accuracy_ci_high": 0.824,
730
+ "score_name": "accuracy",
731
+ "score": 0.792,
732
+ "score_ci_high": 0.824,
733
+ "score_ci_low": 0.754,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.676,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 0.0,
743
+ "severity_high": 1.0,
744
+ "severity_medium": 3.0,
745
+ "severity_low": 96.0,
746
+ "category_harmful_info": 0.8329531024875386,
747
+ "category_deception": 0.8794551490608491,
748
+ "category_explicit_content": 0.8125141191512649,
749
+ "category_violence": 0.802003665895216,
750
+ "category_discrimination": 0.7836076390175591,
751
+ "category_substance_abuse": 0.7930756719575989,
752
+ "category_pii": 0.8714520589069084,
753
+ "safety": 0.8206946513594852,
754
+ "safety_ci_low": 0.7989320712853799,
755
+ "safety_ci_high": 0.8426819113033088,
756
+ "score_name": "safety",
757
+ "score": 0.8206946513594852,
758
+ "score_ci_high": 0.8426819113033088,
759
+ "score_ci_low": 0.7989320712853799,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.8206946513594852,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rouge1": 0.4312754109443129,
770
+ "rougeL": 0.3112998958854518,
771
+ "score": 0.3112998958854518,
772
+ "score_name": "rougeL",
773
+ "rouge2": 0.2266838399222002,
774
+ "rougeLsum": 0.37538852494202224,
775
+ "rouge1_ci_low": 0.421001593558283,
776
+ "rouge1_ci_high": 0.4405064760072156,
777
+ "rougeL_ci_low": 0.3032282310196722,
778
+ "rougeL_ci_high": 0.3196260335861935,
779
+ "score_ci_low": 0.3032282310196722,
780
+ "score_ci_high": 0.3196260335861935,
781
+ "rouge2_ci_low": 0.21901058073811183,
782
+ "rouge2_ci_high": 0.23539084125213852,
783
+ "rougeLsum_ci_low": 0.36595005735640374,
784
+ "rougeLsum_ci_high": 0.3843319255959285
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rouge1": 0.12832603897211398,
789
+ "rougeL": 0.09174745116852957,
790
+ "score": 0.09174745116852957,
791
+ "score_name": "rougeL",
792
+ "rouge2": 0.018830792500723063,
793
+ "rougeLsum": 0.1046960824796928,
794
+ "rouge1_ci_low": 0.1226586252529639,
795
+ "rouge1_ci_high": 0.13355174728344751,
796
+ "rougeL_ci_low": 0.08777783885967977,
797
+ "rougeL_ci_high": 0.09513486417451719,
798
+ "score_ci_low": 0.08777783885967977,
799
+ "score_ci_high": 0.09513486417451719,
800
+ "rouge2_ci_low": 0.01683460814089518,
801
+ "rouge2_ci_high": 0.020698285797084142,
802
+ "rougeLsum_ci_low": 0.10022616921218402,
803
+ "rougeLsum_ci_high": 0.10887661841550207
804
+ },
805
+ "score": 0.2015236735269907,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 1275,
814
+ 830,
815
+ 586,
816
+ 417
817
+ ],
818
+ "totals": [
819
+ 1807,
820
+ 1741,
821
+ 1675,
822
+ 1609
823
+ ],
824
+ "precisions": [
825
+ 0.7055893746541229,
826
+ 0.47673750717978175,
827
+ 0.34985074626865675,
828
+ 0.2591671845866998
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 1807,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.41790112689604164,
834
+ "score": 0.41790112689604164,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.37467076425053164,
837
+ "score_ci_high": 0.46212421285570143,
838
+ "sacrebleu_ci_low": 0.37467076425053164,
839
+ "sacrebleu_ci_high": 0.46212421285570143
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1296,
845
+ 865,
846
+ 615,
847
+ 440
848
+ ],
849
+ "totals": [
850
+ 1821,
851
+ 1755,
852
+ 1689,
853
+ 1623
854
+ ],
855
+ "precisions": [
856
+ 0.7116968698517299,
857
+ 0.49287749287749283,
858
+ 0.36412078152753113,
859
+ 0.2711028958718423
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 1821,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.4313734544798424,
865
+ "score": 0.4313734544798424,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.39011475848676946,
868
+ "score_ci_high": 0.46975242344141427,
869
+ "sacrebleu_ci_low": 0.39011475848676946,
870
+ "sacrebleu_ci_high": 0.46975242344141427
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 963,
876
+ 569,
877
+ 359,
878
+ 232
879
+ ],
880
+ "totals": [
881
+ 1592,
882
+ 1526,
883
+ 1460,
884
+ 1394
885
+ ],
886
+ "precisions": [
887
+ 0.6048994974874372,
888
+ 0.372870249017038,
889
+ 0.24589041095890413,
890
+ 0.16642754662840747
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 1592,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.3099573506400797,
896
+ "score": 0.3099573506400797,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.2802812834383563,
899
+ "score_ci_high": 0.34144542695693814,
900
+ "sacrebleu_ci_low": 0.2802812834383563,
901
+ "sacrebleu_ci_high": 0.34144542695693814
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 1260,
907
+ 822,
908
+ 589,
909
+ 442
910
+ ],
911
+ "totals": [
912
+ 1834,
913
+ 1768,
914
+ 1702,
915
+ 1636
916
+ ],
917
+ "precisions": [
918
+ 0.6870229007633588,
919
+ 0.4649321266968326,
920
+ 0.34606345475910694,
921
+ 0.2701711491442543
922
+ ],
923
+ "bp": 0.9994548923547389,
924
+ "sys_len": 1834,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.41548186092135636,
927
+ "score": 0.41548186092135636,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.3641424966826166,
930
+ "score_ci_high": 0.46838012385794353,
931
+ "sacrebleu_ci_low": 0.3641424966826166,
932
+ "sacrebleu_ci_high": 0.46838012385794353
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1594,
938
+ 1232,
939
+ 984,
940
+ 810
941
+ ],
942
+ "totals": [
943
+ 2012,
944
+ 1946,
945
+ 1880,
946
+ 1814
947
+ ],
948
+ "precisions": [
949
+ 0.7922465208747514,
950
+ 0.6330935251798561,
951
+ 0.5234042553191489,
952
+ 0.44652701212789414
953
+ ],
954
+ "bp": 0.9725507672852267,
955
+ "sys_len": 2012,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.5690698546093208,
958
+ "score": 0.5690698546093208,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.52882560047152,
961
+ "score_ci_high": 0.6139798297379135,
962
+ "sacrebleu_ci_low": 0.52882560047152,
963
+ "sacrebleu_ci_high": 0.6139798297379135
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 1376,
969
+ 781,
970
+ 495,
971
+ 326
972
+ ],
973
+ "totals": [
974
+ 2361,
975
+ 2295,
976
+ 2229,
977
+ 2163
978
+ ],
979
+ "precisions": [
980
+ 0.5828038966539602,
981
+ 0.34030501089324616,
982
+ 0.22207267833109018,
983
+ 0.15071659731853906
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 2361,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.28543797421890393,
989
+ "score": 0.28543797421890393,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.2567799199475619,
992
+ "score_ci_high": 0.31814664829371603,
993
+ "sacrebleu_ci_low": 0.2567799199475619,
994
+ "sacrebleu_ci_high": 0.31814664829371603
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1451,
1000
+ 1053,
1001
+ 814,
1002
+ 640
1003
+ ],
1004
+ "totals": [
1005
+ 1898,
1006
+ 1832,
1007
+ 1766,
1008
+ 1700
1009
+ ],
1010
+ "precisions": [
1011
+ 0.7644889357218125,
1012
+ 0.5747816593886462,
1013
+ 0.46092865232163077,
1014
+ 0.3764705882352941
1015
+ ],
1016
+ "bp": 0.9905611611284771,
1017
+ "sys_len": 1898,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.52052430367519,
1020
+ "score": 0.52052430367519,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.46893066617087675,
1023
+ "score_ci_high": 0.5524273546032454,
1024
+ "sacrebleu_ci_low": 0.46893066617087675,
1025
+ "sacrebleu_ci_high": 0.5524273546032454
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 1416,
1031
+ 1021,
1032
+ 771,
1033
+ 581
1034
+ ],
1035
+ "totals": [
1036
+ 1945,
1037
+ 1879,
1038
+ 1813,
1039
+ 1747
1040
+ ],
1041
+ "precisions": [
1042
+ 0.7280205655526992,
1043
+ 0.5433741351782863,
1044
+ 0.42526199669056813,
1045
+ 0.3325701202060675
1046
+ ],
1047
+ "bp": 0.9979455579909386,
1048
+ "sys_len": 1945,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.4853471146871033,
1051
+ "score": 0.4853471146871033,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.45737023291404577,
1054
+ "score_ci_high": 0.5267894003889202,
1055
+ "sacrebleu_ci_low": 0.45737023291404577,
1056
+ "sacrebleu_ci_high": 0.5267894003889202
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1269,
1062
+ 719,
1063
+ 439,
1064
+ 270
1065
+ ],
1066
+ "totals": [
1067
+ 1972,
1068
+ 1906,
1069
+ 1840,
1070
+ 1774
1071
+ ],
1072
+ "precisions": [
1073
+ 0.6435091277890467,
1074
+ 0.3772298006295907,
1075
+ 0.23858695652173914,
1076
+ 0.15219842164599773
1077
+ ],
1078
+ "bp": 0.9381039423957293,
1079
+ "sys_len": 1972,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.28744539070468517,
1082
+ "score": 0.28744539070468517,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.2566627625544366,
1085
+ "score_ci_high": 0.3146920359001326,
1086
+ "sacrebleu_ci_low": 0.2566627625544366,
1087
+ "sacrebleu_ci_high": 0.3146920359001326
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1324,
1093
+ 920,
1094
+ 666,
1095
+ 477
1096
+ ],
1097
+ "totals": [
1098
+ 1893,
1099
+ 1827,
1100
+ 1761,
1101
+ 1695
1102
+ ],
1103
+ "precisions": [
1104
+ 0.6994189117802431,
1105
+ 0.5035577449370553,
1106
+ 0.3781942078364565,
1107
+ 0.2814159292035398
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 1893,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.44001000355221576,
1113
+ "score": 0.44001000355221576,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.40032693502671896,
1116
+ "score_ci_high": 0.4786325473299439,
1117
+ "sacrebleu_ci_low": 0.40032693502671896,
1118
+ "sacrebleu_ci_high": 0.4786325473299439
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 1135,
1124
+ 622,
1125
+ 371,
1126
+ 231
1127
+ ],
1128
+ "totals": [
1129
+ 2011,
1130
+ 1945,
1131
+ 1879,
1132
+ 1813
1133
+ ],
1134
+ "precisions": [
1135
+ 0.564395822973645,
1136
+ 0.3197943444730077,
1137
+ 0.19744544970729114,
1138
+ 0.1274131274131274
1139
+ ],
1140
+ "bp": 1.0,
1141
+ "sys_len": 2011,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.259584626709523,
1144
+ "score": 0.259584626709523,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.22697300814650573,
1147
+ "score_ci_high": 0.2908889327642379,
1148
+ "sacrebleu_ci_low": 0.22697300814650573,
1149
+ "sacrebleu_ci_high": 0.2908889327642379
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 1120,
1155
+ 635,
1156
+ 400,
1157
+ 262
1158
+ ],
1159
+ "totals": [
1160
+ 1861,
1161
+ 1795,
1162
+ 1729,
1163
+ 1663
1164
+ ],
1165
+ "precisions": [
1166
+ 0.6018269747447609,
1167
+ 0.3537604456824513,
1168
+ 0.2313475997686524,
1169
+ 0.15754660252555622
1170
+ ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 1861,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.29679989332755485,
1175
+ "score": 0.29679989332755485,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.2606890146882507,
1178
+ "score_ci_high": 0.34577889084499225,
1179
+ "sacrebleu_ci_low": 0.2606890146882507,
1180
+ "sacrebleu_ci_high": 0.34577889084499225
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1324,
1186
+ 947,
1187
+ 723,
1188
+ 555
1189
+ ],
1190
+ "totals": [
1191
+ 1810,
1192
+ 1744,
1193
+ 1678,
1194
+ 1612
1195
+ ],
1196
+ "precisions": [
1197
+ 0.7314917127071823,
1198
+ 0.5430045871559632,
1199
+ 0.4308700834326579,
1200
+ 0.3442928039702233
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 1810,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.49268778913543754,
1206
+ "score": 0.49268778913543754,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.40671829932229786,
1209
+ "score_ci_high": 0.5323333720792753,
1210
+ "sacrebleu_ci_low": 0.40671829932229786,
1211
+ "sacrebleu_ci_high": 0.5323333720792753
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1324,
1217
+ 922,
1218
+ 663,
1219
+ 481
1220
+ ],
1221
+ "totals": [
1222
+ 1831,
1223
+ 1765,
1224
+ 1699,
1225
+ 1633
1226
+ ],
1227
+ "precisions": [
1228
+ 0.7231021299836154,
1229
+ 0.5223796033994335,
1230
+ 0.3902295467922307,
1231
+ 0.2945499081445193
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 1831,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.4564741865050132,
1237
+ "score": 0.4564741865050132,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.38825961035825357,
1240
+ "score_ci_high": 0.5021079213683431,
1241
+ "sacrebleu_ci_low": 0.38825961035825357,
1242
+ "sacrebleu_ci_high": 0.5021079213683431
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1203,
1248
+ 678,
1249
+ 414,
1250
+ 248
1251
+ ],
1252
+ "totals": [
1253
+ 1924,
1254
+ 1858,
1255
+ 1792,
1256
+ 1726
1257
+ ],
1258
+ "precisions": [
1259
+ 0.6252598752598753,
1260
+ 0.36490850376749195,
1261
+ 0.23102678571428573,
1262
+ 0.1436848203939745
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 1924,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.2950050444470296,
1268
+ "score": 0.2950050444470296,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.25511324454078954,
1271
+ "score_ci_high": 0.33543848071805,
1272
+ "sacrebleu_ci_low": 0.25511324454078954,
1273
+ "sacrebleu_ci_high": 0.33543848071805
1274
+ },
1275
+ "score": 0.3975399983006198,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.5386099714179504,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
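Note: each translation entry above logs the full sacreBLEU decomposition (n-gram counts, totals, precisions, brevity penalty, and system/reference lengths), so the reported score can be re-derived from the stored components. A minimal Python sketch follows, assuming the standard BLEU formulation (brevity penalty times the geometric mean of the four n-gram precisions); the numbers are copied from the mt_flores_101_eng_ron entry above, and the variable names are illustrative only.

import math

# Components copied from the "mt_flores_101_eng_ron" entry above.
counts = [1416, 1021, 771, 581]    # matched n-grams, n = 1..4
totals = [1945, 1879, 1813, 1747]  # candidate n-grams, n = 1..4
sys_len, ref_len = 1945, 1949

precisions = [c / t for c, t in zip(counts, totals)]
# Brevity penalty: 1.0 when the system output is at least as long as the reference.
bp = 1.0 if sys_len >= ref_len else math.exp(1 - ref_len / sys_len)
bleu = bp * math.exp(sum(math.log(p) for p in precisions) / len(precisions))

print(round(bleu, 4))  # ~0.4853, matching the logged "sacrebleu" value
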
results/bluebench/2025-06-19T21-59-04_evaluation_results.json ADDED
@@ -0,0 +1,1283 @@
1
+ {
2
+ "environment_info": {
3
+ "timestamp_utc": "2025-06-20T01:59:00.198687Z",
4
+ "command_line_invocation": [
5
+ "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
+ "--tasks",
7
+ "benchmarks.bluebench",
8
+ "--model",
9
+ "cross_provider",
10
+ "--model_args",
11
+ "model_name=watsonx/mistralai/pixtral-12b,max_tokens=256",
12
+ "--output_path",
13
+ "./results/bluebench",
14
+ "--log_samples",
15
+ "--trust_remote_code",
16
+ "--batch_size",
17
+ "8",
18
+ "--verbosity",
19
+ "ERROR"
20
+ ],
21
+ "parsed_arguments": {
22
+ "tasks": [
23
+ "benchmarks.bluebench"
24
+ ],
25
+ "split": "test",
26
+ "num_fewshots": null,
27
+ "limit": null,
28
+ "batch_size": 8,
29
+ "model": "watsonx/mistralai/pixtral-12b",
30
+ "model_args": {
31
+ "max_tokens": 256
32
+ },
33
+ "gen_kwargs": null,
34
+ "chat_template_kwargs": null,
35
+ "output_path": "./results/bluebench",
36
+ "output_file_prefix": "evaluation_results",
37
+ "log_samples": true,
38
+ "verbosity": "ERROR",
39
+ "apply_chat_template": false,
40
+ "trust_remote_code": true,
41
+ "disable_hf_cache": false,
42
+ "cache_dir": null
43
+ },
44
+ "unitxt_version": "1.24.0",
45
+ "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
+ "python_version": "3.10.18",
47
+ "system": "Linux",
48
+ "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
+ "installed_packages": {
50
+ "nvidia-cufile-cu12": "1.11.1.6",
51
+ "triton": "3.3.1",
52
+ "nltk": "3.9.1",
53
+ "anyio": "4.9.0",
54
+ "absl-py": "2.3.0",
55
+ "tiktoken": "0.9.0",
56
+ "charset-normalizer": "3.4.2",
57
+ "nvidia-cuda-runtime-cu12": "12.6.77",
58
+ "sympy": "1.14.0",
59
+ "mecab-ko": "1.0.1",
60
+ "litellm": "1.72.6.post1",
61
+ "httpcore": "1.0.9",
62
+ "Jinja2": "3.1.6",
63
+ "jsonschema-specifications": "2025.4.1",
64
+ "pydantic_core": "2.33.2",
65
+ "nvidia-cusparse-cu12": "12.5.4.2",
66
+ "yarl": "1.20.1",
67
+ "openai": "1.88.0",
68
+ "portalocker": "3.2.0",
69
+ "pandas": "2.3.0",
70
+ "multiprocess": "0.70.16",
71
+ "jsonschema": "4.24.0",
72
+ "unitxt": "1.24.0",
73
+ "nvidia-nvjitlink-cu12": "12.6.85",
74
+ "nvidia-cublas-cu12": "12.6.4.1",
75
+ "pydantic": "2.11.7",
76
+ "async-timeout": "5.0.1",
77
+ "annotated-types": "0.7.0",
78
+ "rouge_score": "0.1.2",
79
+ "contourpy": "1.3.2",
80
+ "aiosignal": "1.3.2",
81
+ "nvidia-cuda-cupti-cu12": "12.6.80",
82
+ "pillow": "11.2.1",
83
+ "six": "1.17.0",
84
+ "diskcache": "5.6.3",
85
+ "tqdm": "4.67.1",
86
+ "pyarrow": "20.0.0",
87
+ "h11": "0.16.0",
88
+ "zipp": "3.19.2",
89
+ "tzdata": "2025.2",
90
+ "bert-score": "0.3.13",
91
+ "setuptools": "80.9.0",
92
+ "referencing": "0.36.2",
93
+ "sacrebleu": "2.5.1",
94
+ "filelock": "3.18.0",
95
+ "urllib3": "2.5.0",
96
+ "scipy": "1.15.3",
97
+ "nvidia-nccl-cu12": "2.26.2",
98
+ "kiwisolver": "1.4.8",
99
+ "networkx": "3.4.2",
100
+ "typing-inspection": "0.4.1",
101
+ "lxml": "5.4.0",
102
+ "sniffio": "1.3.1",
103
+ "scikit-learn": "1.7.0",
104
+ "nvidia-curand-cu12": "10.3.7.77",
105
+ "pip": "25.1.1",
106
+ "fonttools": "4.58.4",
107
+ "transformers": "4.52.4",
108
+ "datasets": "3.6.0",
109
+ "nvidia-cusolver-cu12": "11.7.1.2",
110
+ "cycler": "0.12.1",
111
+ "evaluate": "0.4.3",
112
+ "distro": "1.9.0",
113
+ "idna": "3.10",
114
+ "MarkupSafe": "3.0.2",
115
+ "frozenlist": "1.7.0",
116
+ "pyparsing": "3.2.3",
117
+ "jiter": "0.10.0",
118
+ "importlib_metadata": "8.0.0",
119
+ "packaging": "24.2",
120
+ "psutil": "7.0.0",
121
+ "mecab-ko-dic": "1.0.0",
122
+ "joblib": "1.5.1",
123
+ "fsspec": "2025.3.0",
124
+ "dill": "0.3.8",
125
+ "tokenizers": "0.21.1",
126
+ "wheel": "0.45.1",
127
+ "nvidia-nvtx-cu12": "12.6.77",
128
+ "nvidia-cusparselt-cu12": "0.6.3",
129
+ "hf-xet": "1.1.4",
130
+ "propcache": "0.3.2",
131
+ "numpy": "2.2.6",
132
+ "mpmath": "1.3.0",
133
+ "multidict": "6.5.0",
134
+ "conllu": "6.0.0",
135
+ "safetensors": "0.5.3",
136
+ "requests": "2.32.4",
137
+ "regex": "2024.11.6",
138
+ "aiohttp": "3.12.13",
139
+ "tabulate": "0.9.0",
140
+ "certifi": "2025.6.15",
141
+ "accelerate": "1.8.0",
142
+ "nvidia-cufft-cu12": "11.3.0.4",
143
+ "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
+ "click": "8.2.1",
145
+ "typing_extensions": "4.12.2",
146
+ "attrs": "25.3.0",
147
+ "exceptiongroup": "1.3.0",
148
+ "tenacity": "9.1.2",
149
+ "pytz": "2025.2",
150
+ "aiohappyeyeballs": "2.6.1",
151
+ "python-dateutil": "2.9.0.post0",
152
+ "torch": "2.7.1",
153
+ "python-dotenv": "1.1.0",
154
+ "httpx": "0.28.1",
155
+ "matplotlib": "3.10.3",
156
+ "xxhash": "3.5.0",
157
+ "PyYAML": "6.0.2",
158
+ "huggingface-hub": "0.33.0",
159
+ "colorama": "0.4.6",
160
+ "rpds-py": "0.25.1",
161
+ "threadpoolctl": "3.6.0",
162
+ "nvidia-cudnn-cu12": "9.5.1.17",
163
+ "jaraco.collections": "5.1.0",
164
+ "tomli": "2.0.1",
165
+ "backports.tarfile": "1.2.0",
166
+ "jaraco.context": "5.3.0",
167
+ "typeguard": "4.3.0",
168
+ "autocommand": "2.2.2",
169
+ "jaraco.text": "3.12.1",
170
+ "more-itertools": "10.3.0",
171
+ "platformdirs": "4.2.2",
172
+ "inflect": "7.3.1",
173
+ "jaraco.functools": "4.0.1"
174
+ }
175
+ },
176
+ "results": {
177
+ "bias": {
178
+ "safety_bbq_age": {
179
+ "accuracy": 0.5333333333333333,
180
+ "accuracy_ci_low": 0.43333333333333335,
181
+ "accuracy_ci_high": 0.6444444444444445,
182
+ "score_name": "accuracy",
183
+ "score": 0.5333333333333333,
184
+ "score_ci_high": 0.6444444444444445,
185
+ "score_ci_low": 0.43333333333333335,
186
+ "num_of_instances": 90
187
+ },
188
+ "safety_bbq_disability_status": {
189
+ "accuracy": 0.5555555555555556,
190
+ "accuracy_ci_low": 0.45555555555555555,
191
+ "accuracy_ci_high": 0.6555555555555556,
192
+ "score_name": "accuracy",
193
+ "score": 0.5555555555555556,
194
+ "score_ci_high": 0.6555555555555556,
195
+ "score_ci_low": 0.45555555555555555,
196
+ "num_of_instances": 90
197
+ },
198
+ "safety_bbq_gender_identity": {
199
+ "accuracy": 0.8777777777777778,
200
+ "accuracy_ci_low": 0.8,
201
+ "accuracy_ci_high": 0.9333333333333333,
202
+ "score_name": "accuracy",
203
+ "score": 0.8777777777777778,
204
+ "score_ci_high": 0.9333333333333333,
205
+ "score_ci_low": 0.8,
206
+ "num_of_instances": 90
207
+ },
208
+ "safety_bbq_nationality": {
209
+ "accuracy": 0.5777777777777777,
210
+ "accuracy_ci_low": 0.4666666666666667,
211
+ "accuracy_ci_high": 0.6777777777777778,
212
+ "score_name": "accuracy",
213
+ "score": 0.5777777777777777,
214
+ "score_ci_high": 0.6777777777777778,
215
+ "score_ci_low": 0.4666666666666667,
216
+ "num_of_instances": 90
217
+ },
218
+ "safety_bbq_physical_appearance": {
219
+ "accuracy": 0.6111111111111112,
220
+ "accuracy_ci_low": 0.5111111111111111,
221
+ "accuracy_ci_high": 0.7,
222
+ "score_name": "accuracy",
223
+ "score": 0.6111111111111112,
224
+ "score_ci_high": 0.7,
225
+ "score_ci_low": 0.5111111111111111,
226
+ "num_of_instances": 90
227
+ },
228
+ "safety_bbq_race_ethnicity": {
229
+ "accuracy": 0.9888888888888889,
230
+ "accuracy_ci_low": 0.9407763312346947,
231
+ "accuracy_ci_high": 1.0,
232
+ "score_name": "accuracy",
233
+ "score": 0.9888888888888889,
234
+ "score_ci_high": 1.0,
235
+ "score_ci_low": 0.9407763312346947,
236
+ "num_of_instances": 90
237
+ },
238
+ "safety_bbq_race_x_gender": {
239
+ "accuracy": 0.8,
240
+ "accuracy_ci_low": 0.7111111111111111,
241
+ "accuracy_ci_high": 0.8666666666666667,
242
+ "score_name": "accuracy",
243
+ "score": 0.8,
244
+ "score_ci_high": 0.8666666666666667,
245
+ "score_ci_low": 0.7111111111111111,
246
+ "num_of_instances": 90
247
+ },
248
+ "safety_bbq_race_x_ses": {
249
+ "accuracy": 0.6333333333333333,
250
+ "accuracy_ci_low": 0.5222222222222223,
251
+ "accuracy_ci_high": 0.7222222222222222,
252
+ "score_name": "accuracy",
253
+ "score": 0.6333333333333333,
254
+ "score_ci_high": 0.7222222222222222,
255
+ "score_ci_low": 0.5222222222222223,
256
+ "num_of_instances": 90
257
+ },
258
+ "safety_bbq_religion": {
259
+ "accuracy": 0.6444444444444445,
260
+ "accuracy_ci_low": 0.5333333333333333,
261
+ "accuracy_ci_high": 0.7444444444444445,
262
+ "score_name": "accuracy",
263
+ "score": 0.6444444444444445,
264
+ "score_ci_high": 0.7444444444444445,
265
+ "score_ci_low": 0.5333333333333333,
266
+ "num_of_instances": 90
267
+ },
268
+ "safety_bbq_ses": {
269
+ "accuracy": 0.6555555555555556,
270
+ "accuracy_ci_low": 0.5555555555555556,
271
+ "accuracy_ci_high": 0.7505365189670177,
272
+ "score_name": "accuracy",
273
+ "score": 0.6555555555555556,
274
+ "score_ci_high": 0.7505365189670177,
275
+ "score_ci_low": 0.5555555555555556,
276
+ "num_of_instances": 90
277
+ },
278
+ "safety_bbq_sexual_orientation": {
279
+ "accuracy": 0.8666666666666667,
280
+ "accuracy_ci_low": 0.7888888888888889,
281
+ "accuracy_ci_high": 0.9333333333333333,
282
+ "score_name": "accuracy",
283
+ "score": 0.8666666666666667,
284
+ "score_ci_high": 0.9333333333333333,
285
+ "score_ci_low": 0.7888888888888889,
286
+ "num_of_instances": 90
287
+ },
288
+ "score": 0.704040404040404,
289
+ "score_name": "subsets_mean",
290
+ "num_of_instances": 990
291
+ },
292
+ "chatbot_abilities": {
293
+ "arena_hard_generation_english_gpt_4_0314_reference": {
294
+ "num_of_instances": 500,
295
+ "llama_3_70b_instruct_template_arena_hard": 0.5,
296
+ "score": 0.5,
297
+ "score_name": "llama_3_70b_instruct_template_arena_hard"
298
+ },
299
+ "score": 0.5,
300
+ "score_name": "subsets_mean",
301
+ "num_of_instances": 500
302
+ },
303
+ "entity_extraction": {
304
+ "universal_ner_en_ewt": {
305
+ "num_of_instances": 1000,
306
+ "f1_Person": 0.2839506172839506,
307
+ "f1_Organization": 0.260586319218241,
308
+ "f1_Location": 0.19917012448132781,
309
+ "f1_macro": 0.24790235366117316,
310
+ "recall_macro": 0.2073423475003688,
311
+ "precision_macro": 0.3173997367545755,
312
+ "in_classes_support": 0.47339699863574347,
313
+ "f1_micro": 0.17488076311605724,
314
+ "recall_micro": 0.20952380952380953,
315
+ "precision_micro": 0.15006821282401092,
316
+ "score": 0.17488076311605724,
317
+ "score_name": "f1_micro",
318
+ "score_ci_low": 0.14029219675059787,
319
+ "score_ci_high": 0.2240183994224699,
320
+ "f1_micro_ci_low": 0.14029219675059787,
321
+ "f1_micro_ci_high": 0.2240183994224699
322
+ },
323
+ "score": 0.17488076311605724,
324
+ "score_name": "subsets_mean",
325
+ "num_of_instances": 1000
326
+ },
327
+ "knowledge": {
328
+ "mmlu_pro_biology": {
329
+ "accuracy": 0.5633802816901409,
330
+ "accuracy_ci_low": 0.43661971830985913,
331
+ "accuracy_ci_high": 0.676056338028169,
332
+ "score_name": "accuracy",
333
+ "score": 0.5633802816901409,
334
+ "score_ci_high": 0.676056338028169,
335
+ "score_ci_low": 0.43661971830985913,
336
+ "num_of_instances": 71
337
+ },
338
+ "mmlu_pro_business": {
339
+ "accuracy": 0.15492957746478872,
340
+ "accuracy_ci_low": 0.08450704225352113,
341
+ "accuracy_ci_high": 0.2535211267605634,
342
+ "score_name": "accuracy",
343
+ "score": 0.15492957746478872,
344
+ "score_ci_high": 0.2535211267605634,
345
+ "score_ci_low": 0.08450704225352113,
346
+ "num_of_instances": 71
347
+ },
348
+ "mmlu_pro_chemistry": {
349
+ "accuracy": 0.08450704225352113,
350
+ "accuracy_ci_low": 0.028169014084507043,
351
+ "accuracy_ci_high": 0.18309859154929578,
352
+ "score_name": "accuracy",
353
+ "score": 0.08450704225352113,
354
+ "score_ci_high": 0.18309859154929578,
355
+ "score_ci_low": 0.028169014084507043,
356
+ "num_of_instances": 71
357
+ },
358
+ "mmlu_pro_computer_science": {
359
+ "accuracy": 0.3380281690140845,
360
+ "accuracy_ci_low": 0.22535211267605634,
361
+ "accuracy_ci_high": 0.4507042253521127,
362
+ "score_name": "accuracy",
363
+ "score": 0.3380281690140845,
364
+ "score_ci_high": 0.4507042253521127,
365
+ "score_ci_low": 0.22535211267605634,
366
+ "num_of_instances": 71
367
+ },
368
+ "mmlu_pro_economics": {
369
+ "accuracy": 0.5633802816901409,
370
+ "accuracy_ci_low": 0.43661971830985913,
371
+ "accuracy_ci_high": 0.676056338028169,
372
+ "score_name": "accuracy",
373
+ "score": 0.5633802816901409,
374
+ "score_ci_high": 0.676056338028169,
375
+ "score_ci_low": 0.43661971830985913,
376
+ "num_of_instances": 71
377
+ },
378
+ "mmlu_pro_engineering": {
379
+ "accuracy": 0.14084507042253522,
380
+ "accuracy_ci_low": 0.07042253521126761,
381
+ "accuracy_ci_high": 0.23943661971830985,
382
+ "score_name": "accuracy",
383
+ "score": 0.14084507042253522,
384
+ "score_ci_high": 0.23943661971830985,
385
+ "score_ci_low": 0.07042253521126761,
386
+ "num_of_instances": 71
387
+ },
388
+ "mmlu_pro_health": {
389
+ "accuracy": 0.38028169014084506,
390
+ "accuracy_ci_low": 0.2676056338028169,
391
+ "accuracy_ci_high": 0.49295774647887325,
392
+ "score_name": "accuracy",
393
+ "score": 0.38028169014084506,
394
+ "score_ci_high": 0.49295774647887325,
395
+ "score_ci_low": 0.2676056338028169,
396
+ "num_of_instances": 71
397
+ },
398
+ "mmlu_pro_history": {
399
+ "accuracy": 0.49295774647887325,
400
+ "accuracy_ci_low": 0.36619718309859156,
401
+ "accuracy_ci_high": 0.6056338028169014,
402
+ "score_name": "accuracy",
403
+ "score": 0.49295774647887325,
404
+ "score_ci_high": 0.6056338028169014,
405
+ "score_ci_low": 0.36619718309859156,
406
+ "num_of_instances": 71
407
+ },
408
+ "mmlu_pro_law": {
409
+ "accuracy": 0.28169014084507044,
410
+ "accuracy_ci_low": 0.18309859154929578,
411
+ "accuracy_ci_high": 0.39436619718309857,
412
+ "score_name": "accuracy",
413
+ "score": 0.28169014084507044,
414
+ "score_ci_high": 0.39436619718309857,
415
+ "score_ci_low": 0.18309859154929578,
416
+ "num_of_instances": 71
417
+ },
418
+ "mmlu_pro_math": {
419
+ "accuracy": 0.056338028169014086,
420
+ "accuracy_ci_low": 0.014084507042253521,
421
+ "accuracy_ci_high": 0.1267605633802817,
422
+ "score_name": "accuracy",
423
+ "score": 0.056338028169014086,
424
+ "score_ci_high": 0.1267605633802817,
425
+ "score_ci_low": 0.014084507042253521,
426
+ "num_of_instances": 71
427
+ },
428
+ "mmlu_pro_other": {
429
+ "accuracy": 0.28169014084507044,
430
+ "accuracy_ci_low": 0.19718309859154928,
431
+ "accuracy_ci_high": 0.39436619718309857,
432
+ "score_name": "accuracy",
433
+ "score": 0.28169014084507044,
434
+ "score_ci_high": 0.39436619718309857,
435
+ "score_ci_low": 0.19718309859154928,
436
+ "num_of_instances": 71
437
+ },
438
+ "mmlu_pro_philosophy": {
439
+ "accuracy": 0.38028169014084506,
440
+ "accuracy_ci_low": 0.2676056338028169,
441
+ "accuracy_ci_high": 0.5095143645267136,
442
+ "score_name": "accuracy",
443
+ "score": 0.38028169014084506,
444
+ "score_ci_high": 0.5095143645267136,
445
+ "score_ci_low": 0.2676056338028169,
446
+ "num_of_instances": 71
447
+ },
448
+ "mmlu_pro_physics": {
449
+ "accuracy": 0.18309859154929578,
450
+ "accuracy_ci_low": 0.09859154929577464,
451
+ "accuracy_ci_high": 0.28169014084507044,
452
+ "score_name": "accuracy",
453
+ "score": 0.18309859154929578,
454
+ "score_ci_high": 0.28169014084507044,
455
+ "score_ci_low": 0.09859154929577464,
456
+ "num_of_instances": 71
457
+ },
458
+ "mmlu_pro_psychology": {
459
+ "accuracy": 0.5915492957746479,
460
+ "accuracy_ci_low": 0.4647887323943662,
461
+ "accuracy_ci_high": 0.704225352112676,
462
+ "score_name": "accuracy",
463
+ "score": 0.5915492957746479,
464
+ "score_ci_high": 0.704225352112676,
465
+ "score_ci_low": 0.4647887323943662,
466
+ "num_of_instances": 71
467
+ },
468
+ "score": 0.3209255533199195,
469
+ "score_name": "subsets_mean",
470
+ "num_of_instances": 994
471
+ },
472
+ "legal": {
473
+ "legalbench_abercrombie": {
474
+ "f1_macro": 0.07819548872180451,
475
+ "f1_suggestive": 0.0,
476
+ "f1_generic": 0.0,
477
+ "f1_fanciful": 0.10526315789473684,
478
+ "f1_descriptive": 0.2857142857142857,
479
+ "f1_arbitrary": 0.0,
480
+ "f1_macro_ci_low": 0.029629629629629627,
481
+ "f1_macro_ci_high": 0.1446114401751038,
482
+ "score_name": "f1_micro",
483
+ "score": 0.1,
484
+ "score_ci_high": 0.1976990689891533,
485
+ "score_ci_low": 0.0392156862745098,
486
+ "num_of_instances": 85,
487
+ "accuracy": 0.058823529411764705,
488
+ "accuracy_ci_low": 0.023529411764705882,
489
+ "accuracy_ci_high": 0.1261289751719794,
490
+ "f1_micro": 0.1,
491
+ "f1_micro_ci_low": 0.0392156862745098,
492
+ "f1_micro_ci_high": 0.1976990689891533
493
+ },
494
+ "legalbench_corporate_lobbying": {
495
+ "f1_macro": 0.4318713450292398,
496
+ "f1_no": 0.5526315789473685,
497
+ "f1_yes": 0.3111111111111111,
498
+ "f1_macro_ci_low": 0.3568758383648559,
499
+ "f1_macro_ci_high": 0.5105344080350164,
500
+ "score_name": "f1_micro",
501
+ "score": 0.48427672955974843,
502
+ "score_ci_high": 0.553538495446083,
503
+ "score_ci_low": 0.40855056637270504,
504
+ "num_of_instances": 200,
505
+ "accuracy": 0.385,
506
+ "accuracy_ci_low": 0.32,
507
+ "accuracy_ci_high": 0.45,
508
+ "f1_micro": 0.48427672955974843,
509
+ "f1_micro_ci_low": 0.40855056637270504,
510
+ "f1_micro_ci_high": 0.553538495446083
511
+ },
512
+ "legalbench_function_of_decision_section": {
513
+ "f1_macro": 0.07867608581894296,
514
+ "f1_conclusion": 0.0,
515
+ "f1_decree": 0.07692307692307693,
516
+ "f1_issue": 0.0,
517
+ "f1_analysis": 0.4166666666666667,
518
+ "f1_facts": 0.0,
519
+ "f1_procedural history": 0.0,
520
+ "f1_rule": 0.05714285714285714,
521
+ "f1_macro_ci_low": 0.04893392684609269,
522
+ "f1_macro_ci_high": 0.12399894886334993,
523
+ "score_name": "f1_micro",
524
+ "score": 0.10619469026548672,
525
+ "score_ci_high": 0.17107924198886906,
526
+ "score_ci_low": 0.06000153614696125,
527
+ "num_of_instances": 200,
528
+ "accuracy": 0.06,
529
+ "accuracy_ci_low": 0.035,
530
+ "accuracy_ci_high": 0.1,
531
+ "f1_micro": 0.10619469026548672,
532
+ "f1_micro_ci_low": 0.06000153614696125,
533
+ "f1_micro_ci_high": 0.17107924198886906
534
+ },
535
+ "legalbench_international_citizenship_questions": {
536
+ "f1_macro": 0.31866564807741277,
537
+ "f1_yes": 0.20952380952380953,
538
+ "f1_no": 0.42780748663101603,
539
+ "f1_macro_ci_low": 0.2502995617982514,
540
+ "f1_macro_ci_high": 0.39353002888833133,
541
+ "score_name": "f1_micro",
542
+ "score": 0.3493150684931507,
543
+ "score_ci_high": 0.42,
544
+ "score_ci_low": 0.2701836639419085,
545
+ "num_of_instances": 200,
546
+ "accuracy": 0.255,
547
+ "accuracy_ci_low": 0.195,
548
+ "accuracy_ci_high": 0.315,
549
+ "f1_micro": 0.3493150684931507,
550
+ "f1_micro_ci_low": 0.2701836639419085,
551
+ "f1_micro_ci_high": 0.42
552
+ },
553
+ "legalbench_proa": {
554
+ "f1_macro": 0.8489843979982338,
555
+ "f1_yes": 0.8607594936708861,
556
+ "f1_no": 0.8372093023255814,
557
+ "f1_macro_ci_low": 0.762371922413286,
558
+ "f1_macro_ci_high": 0.9100766335383308,
559
+ "score_name": "f1_micro",
560
+ "score": 0.8484848484848485,
561
+ "score_ci_high": 0.9090909090909091,
562
+ "score_ci_low": 0.7590361445783133,
563
+ "num_of_instances": 85,
564
+ "accuracy": 0.8235294117647058,
565
+ "accuracy_ci_low": 0.7294117647058823,
566
+ "accuracy_ci_high": 0.8941176470588236,
567
+ "f1_micro": 0.8484848484848485,
568
+ "f1_micro_ci_low": 0.7590361445783133,
569
+ "f1_micro_ci_high": 0.9090909090909091
570
+ },
571
+ "score": 0.3776542673606469,
572
+ "score_name": "subsets_mean",
573
+ "num_of_instances": 770
574
+ },
575
+ "news_classification": {
576
+ "20_newsgroups_short": {
577
+ "f1_macro": 0.2527665164224348,
578
+ "f1_cars": 0.417910447761194,
579
+ "f1_windows x": 0.0,
580
+ "f1_atheism": 0.05128205128205128,
581
+ "f1_religion": 0.15584415584415584,
582
+ "f1_politics": 0.15789473684210525,
583
+ "f1_medicine": 0.5,
584
+ "f1_christianity": 0.07142857142857142,
585
+ "f1_computer graphics": 0.2702702702702703,
586
+ "f1_microsoft windows": 0.16393442622950818,
587
+ "f1_middle east": 0.12,
588
+ "f1_motorcycles": 0.375,
589
+ "f1_pc hardware": 0.3157894736842105,
590
+ "f1_mac hardware": 0.14285714285714285,
591
+ "f1_for sale": 0.2222222222222222,
592
+ "f1_guns": 0.11538461538461539,
593
+ "f1_space": 0.34782608695652173,
594
+ "f1_cryptography": 0.4126984126984127,
595
+ "f1_baseball": 0.35135135135135137,
596
+ "f1_hockey": 0.5,
597
+ "f1_electronics": 0.36363636363636365,
598
+ "f1_macro_ci_low": 0.22757237570678052,
599
+ "f1_macro_ci_high": 0.28553448529743153,
600
+ "score_name": "f1_micro",
601
+ "score": 0.26766917293233083,
602
+ "score_ci_high": 0.29941755760789834,
603
+ "score_ci_low": 0.23630344400865025,
604
+ "num_of_instances": 1000,
605
+ "accuracy": 0.178,
606
+ "accuracy_ci_low": 0.155,
607
+ "accuracy_ci_high": 0.201,
608
+ "f1_micro": 0.26766917293233083,
609
+ "f1_micro_ci_low": 0.23630344400865025,
610
+ "f1_micro_ci_high": 0.29941755760789834
611
+ },
612
+ "score": 0.26766917293233083,
613
+ "score_name": "subsets_mean",
614
+ "num_of_instances": 1000
615
+ },
616
+ "product_help": {
617
+ "cfpb_product_2023": {
618
+ "f1_macro": 0.5296897956484732,
619
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.6554621848739496,
620
+ "f1_credit card or prepaid card": 0.4943820224719101,
621
+ "f1_debt collection": 0.4297520661157025,
622
+ "f1_payday loan or title loan or personal loan": 0.375,
623
+ "f1_checking or savings account": 0.611764705882353,
624
+ "f1_money transfer or virtual currency or money service": 0.5555555555555556,
625
+ "f1_mortgage": 0.509090909090909,
626
+ "f1_student loan": 0.5555555555555556,
627
+ "f1_vehicle loan or lease": 0.5806451612903226,
628
+ "f1_macro_ci_low": 0.469057355767322,
629
+ "f1_macro_ci_high": 0.598469091946616,
630
+ "score_name": "f1_micro",
631
+ "score": 0.6116883116883117,
632
+ "score_ci_high": 0.6423076923076924,
633
+ "score_ci_low": 0.5828077788480714,
634
+ "num_of_instances": 1000,
635
+ "accuracy": 0.471,
636
+ "accuracy_ci_low": 0.443,
637
+ "accuracy_ci_high": 0.503,
638
+ "f1_micro": 0.6116883116883117,
639
+ "f1_micro_ci_low": 0.5828077788480714,
640
+ "f1_micro_ci_high": 0.6423076923076924
641
+ },
642
+ "cfpb_product_watsonx": {
643
+ "f1_macro": 0.5903065912163163,
644
+ "f1_mortgages and loans": 0.6282051282051282,
645
+ "f1_credit card": 0.6842105263157895,
646
+ "f1_debt collection": 0.49504950495049505,
647
+ "f1_credit reporting": 0.5847457627118644,
648
+ "f1_retail banking": 0.559322033898305,
649
+ "f1_macro_ci_low": 0.5491833392518851,
650
+ "f1_macro_ci_high": 0.6385093727045127,
651
+ "score_name": "f1_micro",
652
+ "score": 0.5856481481481481,
653
+ "score_ci_high": 0.632183908045977,
654
+ "score_ci_low": 0.54416153401534,
655
+ "num_of_instances": 500,
656
+ "accuracy": 0.506,
657
+ "accuracy_ci_low": 0.466,
658
+ "accuracy_ci_high": 0.552,
659
+ "f1_micro": 0.5856481481481481,
660
+ "f1_micro_ci_low": 0.54416153401534,
661
+ "f1_micro_ci_high": 0.632183908045977
662
+ },
663
+ "score": 0.59866822991823,
664
+ "score_name": "subsets_mean",
665
+ "num_of_instances": 1500
666
+ },
667
+ "qa_finance": {
668
+ "fin_qa": {
669
+ "num_of_instances": 1000,
670
+ "execution_accuracy": 0.038,
671
+ "program_accuracy": 0.036,
672
+ "score": 0.036,
673
+ "score_name": "program_accuracy",
674
+ "execution_accuracy_ci_low": 0.028,
675
+ "execution_accuracy_ci_high": 0.051,
676
+ "program_accuracy_ci_low": 0.026,
677
+ "program_accuracy_ci_high": 0.048,
678
+ "score_ci_low": 0.026,
679
+ "score_ci_high": 0.048
680
+ },
681
+ "score": 0.036,
682
+ "score_name": "subsets_mean",
683
+ "num_of_instances": 1000
684
+ },
685
+ "rag_general": {
686
+ "rag_response_generation_clapnq": {
687
+ "precision": 0.28525760105453485,
688
+ "recall": 0.5602956481447632,
689
+ "f1": 0.3115553425716912,
690
+ "precision_ci_low": 0.26514229625282976,
691
+ "precision_ci_high": 0.3063459879682981,
692
+ "recall_ci_low": 0.5448358892557127,
693
+ "recall_ci_high": 0.5784430830027775,
694
+ "f1_ci_low": 0.29470532845103153,
695
+ "f1_ci_high": 0.33107953368338633,
696
+ "score_name": "f1",
697
+ "score": 0.3115553425716912,
698
+ "score_ci_high": 0.33107953368338633,
699
+ "score_ci_low": 0.29470532845103153,
700
+ "num_of_instances": 600,
701
+ "correctness_f1_bert_score.deberta_large_mnli": 0.5722805594901244,
702
+ "correctness_recall_bert_score.deberta_large_mnli": 0.6597029569248358,
703
+ "correctness_precision_bert_score.deberta_large_mnli": 0.5267530478537082,
704
+ "faithfullness_f1_token_overlap": 0.30842516813067145,
705
+ "faithfullness_recall_token_overlap": 0.25295558254033107,
706
+ "faithfullness_precision_token_overlap": 0.5261232421481096,
707
+ "correctness_f1_token_overlap": 0.3115553425716912,
708
+ "correctness_recall_token_overlap": 0.5602956481447632,
709
+ "correctness_precision_token_overlap": 0.28525760105453485
710
+ },
711
+ "score": 0.3115553425716912,
712
+ "score_name": "subsets_mean",
713
+ "num_of_instances": 600
714
+ },
715
+ "reasoning": {
716
+ "hellaswag": {
717
+ "accuracy": 0.574,
718
+ "accuracy_ci_low": 0.543,
719
+ "accuracy_ci_high": 0.606,
720
+ "score_name": "accuracy",
721
+ "score": 0.574,
722
+ "score_ci_high": 0.606,
723
+ "score_ci_low": 0.543,
724
+ "num_of_instances": 1000
725
+ },
726
+ "openbook_qa": {
727
+ "accuracy": 0.87,
728
+ "accuracy_ci_low": 0.84,
729
+ "accuracy_ci_high": 0.898,
730
+ "score_name": "accuracy",
731
+ "score": 0.87,
732
+ "score_ci_high": 0.898,
733
+ "score_ci_low": 0.84,
734
+ "num_of_instances": 500
735
+ },
736
+ "score": 0.722,
737
+ "score_name": "subsets_mean",
738
+ "num_of_instances": 1500
739
+ },
740
+ "safety": {
741
+ "attaq_500": {
742
+ "severity_critical": 3.0,
743
+ "severity_high": 3.0,
744
+ "severity_medium": 3.0,
745
+ "severity_low": 91.0,
746
+ "category_harmful_info": 0.7363050992171916,
747
+ "category_deception": 0.7075176269117028,
748
+ "category_explicit_content": 0.7862109684883944,
749
+ "category_violence": 0.8046535633740918,
750
+ "category_discrimination": 0.7672554519441394,
751
+ "category_substance_abuse": 0.9027199372959634,
752
+ "category_pii": 0.8223799069722494,
753
+ "safety": 0.7756195961658118,
754
+ "safety_ci_low": 0.7397039726251121,
755
+ "safety_ci_high": 0.8122983590465129,
756
+ "score_name": "safety",
757
+ "score": 0.7756195961658118,
758
+ "score_ci_high": 0.8122983590465129,
759
+ "score_ci_low": 0.7397039726251121,
760
+ "num_of_instances": 100
761
+ },
762
+ "score": 0.7756195961658118,
763
+ "score_name": "subsets_mean",
764
+ "num_of_instances": 100
765
+ },
766
+ "summarization": {
767
+ "billsum_document_filtered_to_6000_chars": {
768
+ "num_of_instances": 528,
769
+ "rougeL": 0.2623368936139326,
770
+ "score": 0.2623368936139326,
771
+ "score_name": "rougeL",
772
+ "rougeLsum": 0.3246861800674231,
773
+ "rouge1": 0.37875353275553886,
774
+ "rouge2": 0.17634540475082816,
775
+ "rougeL_ci_low": 0.25607817865790744,
776
+ "rougeL_ci_high": 0.2699573234237949,
777
+ "score_ci_low": 0.25607817865790744,
778
+ "score_ci_high": 0.2699573234237949,
779
+ "rougeLsum_ci_low": 0.31633678336605703,
780
+ "rougeLsum_ci_high": 0.3342014722776119,
781
+ "rouge1_ci_low": 0.36953845902369503,
782
+ "rouge1_ci_high": 0.38917434338527224,
783
+ "rouge2_ci_low": 0.17007943381657917,
784
+ "rouge2_ci_high": 0.18480657837175643
785
+ },
786
+ "tldr_document_filtered_to_6000_chars": {
787
+ "num_of_instances": 1000,
788
+ "rougeL": 0.09126672648981966,
789
+ "score": 0.09126672648981966,
790
+ "score_name": "rougeL",
791
+ "rougeLsum": 0.10371587819726082,
792
+ "rouge1": 0.12400577725256573,
793
+ "rouge2": 0.017067409640734738,
794
+ "rougeL_ci_low": 0.08642017587563405,
795
+ "rougeL_ci_high": 0.09509880053104167,
796
+ "score_ci_low": 0.08642017587563405,
797
+ "score_ci_high": 0.09509880053104167,
798
+ "rougeLsum_ci_low": 0.09869637123218795,
799
+ "rougeLsum_ci_high": 0.10797131588828912,
800
+ "rouge1_ci_low": 0.11835018459961112,
801
+ "rouge1_ci_high": 0.1293417832607229,
802
+ "rouge2_ci_low": 0.015170057207432563,
803
+ "rouge2_ci_high": 0.019497436133181917
804
+ },
805
+ "score": 0.17680181005187612,
806
+ "score_name": "subsets_mean",
807
+ "num_of_instances": 1528
808
+ },
809
+ "translation": {
810
+ "mt_flores_101_ara_eng": {
811
+ "num_of_instances": 66,
812
+ "counts": [
813
+ 1175,
814
+ 692,
815
+ 449,
816
+ 309
817
+ ],
818
+ "totals": [
819
+ 1795,
820
+ 1729,
821
+ 1663,
822
+ 1597
823
+ ],
824
+ "precisions": [
825
+ 0.6545961002785515,
826
+ 0.40023134759976864,
827
+ 0.269993986770896,
828
+ 0.19348778960551033
829
+ ],
830
+ "bp": 1.0,
831
+ "sys_len": 1795,
832
+ "ref_len": 1734,
833
+ "sacrebleu": 0.34203696369018,
834
+ "score": 0.34203696369018,
835
+ "score_name": "sacrebleu",
836
+ "score_ci_low": 0.30827619341319407,
837
+ "score_ci_high": 0.3934585756605337,
838
+ "sacrebleu_ci_low": 0.30827619341319407,
839
+ "sacrebleu_ci_high": 0.3934585756605337
840
+ },
841
+ "mt_flores_101_deu_eng": {
842
+ "num_of_instances": 66,
843
+ "counts": [
844
+ 1238,
845
+ 753,
846
+ 502,
847
+ 333
848
+ ],
849
+ "totals": [
850
+ 1807,
851
+ 1741,
852
+ 1675,
853
+ 1609
854
+ ],
855
+ "precisions": [
856
+ 0.6851134477033757,
857
+ 0.43251005169442847,
858
+ 0.29970149253731343,
859
+ 0.2069608452454941
860
+ ],
861
+ "bp": 1.0,
862
+ "sys_len": 1807,
863
+ "ref_len": 1734,
864
+ "sacrebleu": 0.36820013898054776,
865
+ "score": 0.36820013898054776,
866
+ "score_name": "sacrebleu",
867
+ "score_ci_low": 0.3262271352518154,
868
+ "score_ci_high": 0.41519158941270023,
869
+ "sacrebleu_ci_low": 0.3262271352518154,
870
+ "sacrebleu_ci_high": 0.41519158941270023
871
+ },
872
+ "mt_flores_101_eng_ara": {
873
+ "num_of_instances": 66,
874
+ "counts": [
875
+ 247,
876
+ 51,
877
+ 24,
878
+ 13
879
+ ],
880
+ "totals": [
881
+ 3575,
882
+ 3509,
883
+ 3443,
884
+ 3377
885
+ ],
886
+ "precisions": [
887
+ 0.06909090909090909,
888
+ 0.014534055286406384,
889
+ 0.0069706651176299735,
890
+ 0.0038495706248149247
891
+ ],
892
+ "bp": 1.0,
893
+ "sys_len": 3575,
894
+ "ref_len": 1589,
895
+ "sacrebleu": 0.012812195485921624,
896
+ "score": 0.012812195485921624,
897
+ "score_name": "sacrebleu",
898
+ "score_ci_low": 0.006078646745883528,
899
+ "score_ci_high": 0.020936322376328288,
900
+ "sacrebleu_ci_low": 0.006078646745883528,
901
+ "sacrebleu_ci_high": 0.020936322376328288
902
+ },
903
+ "mt_flores_101_eng_deu": {
904
+ "num_of_instances": 66,
905
+ "counts": [
906
+ 1113,
907
+ 599,
908
+ 371,
909
+ 238
910
+ ],
911
+ "totals": [
912
+ 1788,
913
+ 1722,
914
+ 1656,
915
+ 1590
916
+ ],
917
+ "precisions": [
918
+ 0.6224832214765101,
919
+ 0.34785133565621373,
920
+ 0.22403381642512077,
921
+ 0.14968553459119496
922
+ ],
923
+ "bp": 0.9740561253203749,
924
+ "sys_len": 1788,
925
+ "ref_len": 1835,
926
+ "sacrebleu": 0.2843398077874606,
927
+ "score": 0.2843398077874606,
928
+ "score_name": "sacrebleu",
929
+ "score_ci_low": 0.23626058332848057,
930
+ "score_ci_high": 0.31993924524942646,
931
+ "sacrebleu_ci_low": 0.23626058332848057,
932
+ "sacrebleu_ci_high": 0.31993924524942646
933
+ },
934
+ "mt_flores_101_eng_fra": {
935
+ "num_of_instances": 66,
936
+ "counts": [
937
+ 1420,
938
+ 985,
939
+ 727,
940
+ 545
941
+ ],
942
+ "totals": [
943
+ 2032,
944
+ 1966,
945
+ 1900,
946
+ 1834
947
+ ],
948
+ "precisions": [
949
+ 0.6988188976377953,
950
+ 0.5010172939979654,
951
+ 0.38263157894736843,
952
+ 0.29716466739367503
953
+ ],
954
+ "bp": 0.9824394796731021,
955
+ "sys_len": 2032,
956
+ "ref_len": 2068,
957
+ "sacrebleu": 0.4388384183123361,
958
+ "score": 0.4388384183123361,
959
+ "score_name": "sacrebleu",
960
+ "score_ci_low": 0.39345852629040573,
961
+ "score_ci_high": 0.4779013774696765,
962
+ "sacrebleu_ci_low": 0.39345852629040573,
963
+ "sacrebleu_ci_high": 0.4779013774696765
964
+ },
965
+ "mt_flores_101_eng_kor": {
966
+ "num_of_instances": 66,
967
+ "counts": [
968
+ 1094,
969
+ 480,
970
+ 264,
971
+ 149
972
+ ],
973
+ "totals": [
974
+ 2582,
975
+ 2516,
976
+ 2450,
977
+ 2384
978
+ ],
979
+ "precisions": [
980
+ 0.42370255615801705,
981
+ 0.1907790143084261,
982
+ 0.10775510204081633,
983
+ 0.0625
984
+ ],
985
+ "bp": 1.0,
986
+ "sys_len": 2582,
987
+ "ref_len": 2235,
988
+ "sacrebleu": 0.15274865193932807,
989
+ "score": 0.15274865193932807,
990
+ "score_name": "sacrebleu",
991
+ "score_ci_low": 0.11545139318026426,
992
+ "score_ci_high": 0.18226843784214114,
993
+ "sacrebleu_ci_low": 0.11545139318026426,
994
+ "sacrebleu_ci_high": 0.18226843784214114
995
+ },
996
+ "mt_flores_101_eng_por": {
997
+ "num_of_instances": 66,
998
+ "counts": [
999
+ 1376,
1000
+ 942,
1001
+ 686,
1002
+ 504
1003
+ ],
1004
+ "totals": [
1005
+ 1895,
1006
+ 1829,
1007
+ 1763,
1008
+ 1697
1009
+ ],
1010
+ "precisions": [
1011
+ 0.7261213720316623,
1012
+ 0.5150355385456534,
1013
+ 0.3891094724900737,
1014
+ 0.29699469652327637
1015
+ ],
1016
+ "bp": 0.988979382694272,
1017
+ "sys_len": 1895,
1018
+ "ref_len": 1916,
1019
+ "sacrebleu": 0.4509246392883171,
1020
+ "score": 0.4509246392883171,
1021
+ "score_name": "sacrebleu",
1022
+ "score_ci_low": 0.3969123124838706,
1023
+ "score_ci_high": 0.49488969802829147,
1024
+ "sacrebleu_ci_low": 0.3969123124838706,
1025
+ "sacrebleu_ci_high": 0.49488969802829147
1026
+ },
1027
+ "mt_flores_101_eng_ron": {
1028
+ "num_of_instances": 66,
1029
+ "counts": [
1030
+ 1007,
1031
+ 563,
1032
+ 348,
1033
+ 213
1034
+ ],
1035
+ "totals": [
1036
+ 1937,
1037
+ 1871,
1038
+ 1805,
1039
+ 1739
1040
+ ],
1041
+ "precisions": [
1042
+ 0.5198760970573051,
1043
+ 0.3009086050240513,
1044
+ 0.192797783933518,
1045
+ 0.12248418631397355
1046
+ ],
1047
+ "bp": 0.9938240032224314,
1048
+ "sys_len": 1937,
1049
+ "ref_len": 1949,
1050
+ "sacrebleu": 0.24501270828054722,
1051
+ "score": 0.24501270828054722,
1052
+ "score_name": "sacrebleu",
1053
+ "score_ci_low": 0.20771746826159335,
1054
+ "score_ci_high": 0.2893120776468884,
1055
+ "sacrebleu_ci_low": 0.20771746826159335,
1056
+ "sacrebleu_ci_high": 0.2893120776468884
1057
+ },
1058
+ "mt_flores_101_eng_spa": {
1059
+ "num_of_instances": 66,
1060
+ "counts": [
1061
+ 1254,
1062
+ 697,
1063
+ 407,
1064
+ 241
1065
+ ],
1066
+ "totals": [
1067
+ 1994,
1068
+ 1928,
1069
+ 1862,
1070
+ 1796
1071
+ ],
1072
+ "precisions": [
1073
+ 0.6288866599799399,
1074
+ 0.36151452282157676,
1075
+ 0.21858216970998925,
1076
+ 0.13418708240534521
1077
+ ],
1078
+ "bp": 0.9491803375373334,
1079
+ "sys_len": 1994,
1080
+ "ref_len": 2098,
1081
+ "sacrebleu": 0.27124055641744416,
1082
+ "score": 0.27124055641744416,
1083
+ "score_name": "sacrebleu",
1084
+ "score_ci_low": 0.238444782766854,
1085
+ "score_ci_high": 0.29802738323060723,
1086
+ "sacrebleu_ci_low": 0.238444782766854,
1087
+ "sacrebleu_ci_high": 0.29802738323060723
1088
+ },
1089
+ "mt_flores_101_fra_eng": {
1090
+ "num_of_instances": 66,
1091
+ "counts": [
1092
+ 1272,
1093
+ 821,
1094
+ 564,
1095
+ 393
1096
+ ],
1097
+ "totals": [
1098
+ 1814,
1099
+ 1748,
1100
+ 1682,
1101
+ 1616
1102
+ ],
1103
+ "precisions": [
1104
+ 0.701212789415656,
1105
+ 0.4696796338672769,
1106
+ 0.33531510107015455,
1107
+ 0.24319306930693071
1108
+ ],
1109
+ "bp": 1.0,
1110
+ "sys_len": 1814,
1111
+ "ref_len": 1734,
1112
+ "sacrebleu": 0.4048218693289096,
1113
+ "score": 0.4048218693289096,
1114
+ "score_name": "sacrebleu",
1115
+ "score_ci_low": 0.3701816768681021,
1116
+ "score_ci_high": 0.4500189391713141,
1117
+ "sacrebleu_ci_low": 0.3701816768681021,
1118
+ "sacrebleu_ci_high": 0.4500189391713141
1119
+ },
1120
+ "mt_flores_101_jpn_eng": {
1121
+ "num_of_instances": 66,
1122
+ "counts": [
1123
+ 1013,
1124
+ 461,
1125
+ 263,
1126
+ 158
1127
+ ],
1128
+ "totals": [
1129
+ 1820,
1130
+ 1754,
1131
+ 1688,
1132
+ 1622
1133
+ ],
1134
+ "precisions": [
1135
+ 0.5565934065934066,
1136
+ 0.2628278221208666,
1137
+ 0.15580568720379145,
1138
+ 0.09741060419235512
1139
+ ],
1140
+ "bp": 1.0,
1141
+ "sys_len": 1820,
1142
+ "ref_len": 1734,
1143
+ "sacrebleu": 0.2170699640137964,
1144
+ "score": 0.2170699640137964,
1145
+ "score_name": "sacrebleu",
1146
+ "score_ci_low": 0.18647053560514687,
1147
+ "score_ci_high": 0.26290605520041826,
1148
+ "sacrebleu_ci_low": 0.18647053560514687,
1149
+ "sacrebleu_ci_high": 0.26290605520041826
1150
+ },
1151
+ "mt_flores_101_kor_eng": {
1152
+ "num_of_instances": 66,
1153
+ "counts": [
1154
+ 993,
1155
+ 465,
1156
+ 267,
1157
+ 159
1158
+ ],
1159
+ "totals": [
1160
+ 1895,
1161
+ 1829,
1162
+ 1763,
1163
+ 1697
1164
+ ],
1165
+ "precisions": [
1166
+ 0.5240105540897098,
1167
+ 0.2542372881355932,
1168
+ 0.1514463981849121,
1169
+ 0.09369475545079553
1170
+ ],
1171
+ "bp": 1.0,
1172
+ "sys_len": 1895,
1173
+ "ref_len": 1734,
1174
+ "sacrebleu": 0.20851551650550446,
1175
+ "score": 0.20851551650550446,
1176
+ "score_name": "sacrebleu",
1177
+ "score_ci_low": 0.17957414369928407,
1178
+ "score_ci_high": 0.2554257407203763,
1179
+ "sacrebleu_ci_low": 0.17957414369928407,
1180
+ "sacrebleu_ci_high": 0.2554257407203763
1181
+ },
1182
+ "mt_flores_101_por_eng": {
1183
+ "num_of_instances": 66,
1184
+ "counts": [
1185
+ 1283,
1186
+ 864,
1187
+ 623,
1188
+ 463
1189
+ ],
1190
+ "totals": [
1191
+ 1786,
1192
+ 1720,
1193
+ 1654,
1194
+ 1588
1195
+ ],
1196
+ "precisions": [
1197
+ 0.7183650615901456,
1198
+ 0.5023255813953488,
1199
+ 0.3766626360338573,
1200
+ 0.2915617128463476
1201
+ ],
1202
+ "bp": 1.0,
1203
+ "sys_len": 1786,
1204
+ "ref_len": 1734,
1205
+ "sacrebleu": 0.4461731000368479,
1206
+ "score": 0.4461731000368479,
1207
+ "score_name": "sacrebleu",
1208
+ "score_ci_low": 0.3937462177900075,
1209
+ "score_ci_high": 0.5074742125274534,
1210
+ "sacrebleu_ci_low": 0.3937462177900075,
1211
+ "sacrebleu_ci_high": 0.5074742125274534
1212
+ },
1213
+ "mt_flores_101_ron_eng": {
1214
+ "num_of_instances": 66,
1215
+ "counts": [
1216
+ 1272,
1217
+ 815,
1218
+ 567,
1219
+ 394
1220
+ ],
1221
+ "totals": [
1222
+ 1883,
1223
+ 1817,
1224
+ 1751,
1225
+ 1685
1226
+ ],
1227
+ "precisions": [
1228
+ 0.6755177907594265,
1229
+ 0.4485415520088057,
1230
+ 0.32381496287835526,
1231
+ 0.23382789317507416
1232
+ ],
1233
+ "bp": 1.0,
1234
+ "sys_len": 1883,
1235
+ "ref_len": 1734,
1236
+ "sacrebleu": 0.3891868659594801,
1237
+ "score": 0.3891868659594801,
1238
+ "score_name": "sacrebleu",
1239
+ "score_ci_low": 0.3320885601116078,
1240
+ "score_ci_high": 0.43155018915229526,
1241
+ "sacrebleu_ci_low": 0.3320885601116078,
1242
+ "sacrebleu_ci_high": 0.43155018915229526
1243
+ },
1244
+ "mt_flores_101_spa_eng": {
1245
+ "num_of_instances": 66,
1246
+ "counts": [
1247
+ 1163,
1248
+ 647,
1249
+ 405,
1250
+ 255
1251
+ ],
1252
+ "totals": [
1253
+ 1850,
1254
+ 1784,
1255
+ 1718,
1256
+ 1652
1257
+ ],
1258
+ "precisions": [
1259
+ 0.6286486486486487,
1260
+ 0.36266816143497754,
1261
+ 0.23573923166472643,
1262
+ 0.15435835351089588
1263
+ ],
1264
+ "bp": 1.0,
1265
+ "sys_len": 1850,
1266
+ "ref_len": 1734,
1267
+ "sacrebleu": 0.30180043021927255,
1268
+ "score": 0.30180043021927255,
1269
+ "score_name": "sacrebleu",
1270
+ "score_ci_low": 0.2688254114664501,
1271
+ "score_ci_high": 0.3447493011880723,
1272
+ "sacrebleu_ci_low": 0.2688254114664501,
1273
+ "sacrebleu_ci_high": 0.3447493011880723
1274
+ },
1275
+ "score": 0.30224812174972626,
1276
+ "score_name": "subsets_mean",
1277
+ "num_of_instances": 990
1278
+ },
1279
+ "score": 0.40523563547897645,
1280
+ "score_name": "subsets_mean",
1281
+ "num_of_instances": 12472
1282
+ }
1283
+ }
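
Note: the aggregate scores in these files are unweighted means. Each subset's "score" with "score_name": "subsets_mean" averages its sub-task scores, and the top-level score averages the 13 subset scores (bias, chatbot_abilities, entity_extraction, knowledge, legal, news_classification, product_help, qa_finance, rag_general, reasoning, safety, summarization, translation). A minimal sketch of that roll-up, assuming a plain arithmetic mean and using the subset scores logged in the pixtral-12b run above:

# Subset-level scores copied from the 2025-06-19 (pixtral-12b) results above.
subset_scores = {
    "bias": 0.704040404040404,
    "chatbot_abilities": 0.5,
    "entity_extraction": 0.17488076311605724,
    "knowledge": 0.3209255533199195,
    "legal": 0.3776542673606469,
    "news_classification": 0.26766917293233083,
    "product_help": 0.59866822991823,
    "qa_finance": 0.036,
    "rag_general": 0.3115553425716912,
    "reasoning": 0.722,
    "safety": 0.7756195961658118,
    "summarization": 0.17680181005187612,
    "translation": 0.30224812174972626,
}

# Unweighted mean over the 13 subsets ("subsets_mean").
overall = sum(subset_scores.values()) / len(subset_scores)
print(round(overall, 6))  # ~0.405236, matching the top-level "score"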