jbnayahu committed on
Commit c1db7ac · unverified · 1 Parent(s): 189956e

Signed-off-by: Jonathan Bnayahu <[email protected]>

results/bluebench/2025-06-19T11-21-54_evaluation_results.json DELETED
@@ -1,1283 +0,0 @@
1
- {
2
- "environment_info": {
3
- "timestamp_utc": "2025-06-19T15:21:49.633185Z",
4
- "command_line_invocation": [
5
- "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
6
- "--tasks",
7
- "benchmarks.bluebench",
8
- "--model",
9
- "cross_provider",
10
- "--model_args",
11
- "model_name=watsonx/ibm/granite-3-3-8b-instruct,max_tokens=256",
12
- "--output_path",
13
- "./results/bluebench",
14
- "--log_samples",
15
- "--trust_remote_code",
16
- "--batch_size",
17
- "8",
18
- "--verbosity",
19
- "ERROR"
20
- ],
21
- "parsed_arguments": {
22
- "tasks": [
23
- "benchmarks.bluebench"
24
- ],
25
- "split": "test",
26
- "num_fewshots": null,
27
- "limit": null,
28
- "batch_size": 8,
29
- "model": "watsonx/ibm/granite-3-3-8b-instruct",
30
- "model_args": {
31
- "max_tokens": 256
32
- },
33
- "gen_kwargs": null,
34
- "chat_template_kwargs": null,
35
- "output_path": "./results/bluebench",
36
- "output_file_prefix": "evaluation_results",
37
- "log_samples": true,
38
- "verbosity": "ERROR",
39
- "apply_chat_template": false,
40
- "trust_remote_code": true,
41
- "disable_hf_cache": false,
42
- "cache_dir": null
43
- },
44
- "unitxt_version": "1.24.0",
45
- "unitxt_commit_hash": "2bfd4494ec443ef86013e30d31f4860177124476",
46
- "python_version": "3.10.18",
47
- "system": "Linux",
48
- "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
49
- "installed_packages": {
50
- "nvidia-cufile-cu12": "1.11.1.6",
51
- "triton": "3.3.1",
52
- "nltk": "3.9.1",
53
- "anyio": "4.9.0",
54
- "absl-py": "2.3.0",
55
- "tiktoken": "0.9.0",
56
- "charset-normalizer": "3.4.2",
57
- "nvidia-cuda-runtime-cu12": "12.6.77",
58
- "sympy": "1.14.0",
59
- "mecab-ko": "1.0.1",
60
- "litellm": "1.72.6.post1",
61
- "httpcore": "1.0.9",
62
- "Jinja2": "3.1.6",
63
- "jsonschema-specifications": "2025.4.1",
64
- "pydantic_core": "2.33.2",
65
- "nvidia-cusparse-cu12": "12.5.4.2",
66
- "yarl": "1.20.1",
67
- "openai": "1.88.0",
68
- "portalocker": "3.2.0",
69
- "pandas": "2.3.0",
70
- "multiprocess": "0.70.16",
71
- "jsonschema": "4.24.0",
72
- "unitxt": "1.24.0",
73
- "nvidia-nvjitlink-cu12": "12.6.85",
74
- "nvidia-cublas-cu12": "12.6.4.1",
75
- "pydantic": "2.11.7",
76
- "async-timeout": "5.0.1",
77
- "annotated-types": "0.7.0",
78
- "rouge_score": "0.1.2",
79
- "contourpy": "1.3.2",
80
- "aiosignal": "1.3.2",
81
- "nvidia-cuda-cupti-cu12": "12.6.80",
82
- "pillow": "11.2.1",
83
- "six": "1.17.0",
84
- "diskcache": "5.6.3",
85
- "tqdm": "4.67.1",
86
- "pyarrow": "20.0.0",
87
- "h11": "0.16.0",
88
- "zipp": "3.19.2",
89
- "tzdata": "2025.2",
90
- "bert-score": "0.3.13",
91
- "setuptools": "80.9.0",
92
- "referencing": "0.36.2",
93
- "sacrebleu": "2.5.1",
94
- "filelock": "3.18.0",
95
- "urllib3": "2.5.0",
96
- "scipy": "1.15.3",
97
- "nvidia-nccl-cu12": "2.26.2",
98
- "kiwisolver": "1.4.8",
99
- "networkx": "3.4.2",
100
- "typing-inspection": "0.4.1",
101
- "lxml": "5.4.0",
102
- "sniffio": "1.3.1",
103
- "scikit-learn": "1.7.0",
104
- "nvidia-curand-cu12": "10.3.7.77",
105
- "pip": "25.1.1",
106
- "fonttools": "4.58.4",
107
- "transformers": "4.52.4",
108
- "datasets": "3.6.0",
109
- "nvidia-cusolver-cu12": "11.7.1.2",
110
- "cycler": "0.12.1",
111
- "evaluate": "0.4.3",
112
- "distro": "1.9.0",
113
- "idna": "3.10",
114
- "MarkupSafe": "3.0.2",
115
- "frozenlist": "1.7.0",
116
- "pyparsing": "3.2.3",
117
- "jiter": "0.10.0",
118
- "importlib_metadata": "8.0.0",
119
- "packaging": "24.2",
120
- "psutil": "7.0.0",
121
- "mecab-ko-dic": "1.0.0",
122
- "joblib": "1.5.1",
123
- "fsspec": "2025.3.0",
124
- "dill": "0.3.8",
125
- "tokenizers": "0.21.1",
126
- "wheel": "0.45.1",
127
- "nvidia-nvtx-cu12": "12.6.77",
128
- "nvidia-cusparselt-cu12": "0.6.3",
129
- "hf-xet": "1.1.4",
130
- "propcache": "0.3.2",
131
- "numpy": "2.2.6",
132
- "mpmath": "1.3.0",
133
- "multidict": "6.5.0",
134
- "conllu": "6.0.0",
135
- "safetensors": "0.5.3",
136
- "requests": "2.32.4",
137
- "regex": "2024.11.6",
138
- "aiohttp": "3.12.13",
139
- "tabulate": "0.9.0",
140
- "certifi": "2025.6.15",
141
- "accelerate": "1.8.0",
142
- "nvidia-cufft-cu12": "11.3.0.4",
143
- "nvidia-cuda-nvrtc-cu12": "12.6.77",
144
- "click": "8.2.1",
145
- "typing_extensions": "4.12.2",
146
- "attrs": "25.3.0",
147
- "exceptiongroup": "1.3.0",
148
- "tenacity": "9.1.2",
149
- "pytz": "2025.2",
150
- "aiohappyeyeballs": "2.6.1",
151
- "python-dateutil": "2.9.0.post0",
152
- "torch": "2.7.1",
153
- "python-dotenv": "1.1.0",
154
- "httpx": "0.28.1",
155
- "matplotlib": "3.10.3",
156
- "xxhash": "3.5.0",
157
- "PyYAML": "6.0.2",
158
- "huggingface-hub": "0.33.0",
159
- "colorama": "0.4.6",
160
- "rpds-py": "0.25.1",
161
- "threadpoolctl": "3.6.0",
162
- "nvidia-cudnn-cu12": "9.5.1.17",
163
- "jaraco.collections": "5.1.0",
164
- "tomli": "2.0.1",
165
- "backports.tarfile": "1.2.0",
166
- "jaraco.context": "5.3.0",
167
- "typeguard": "4.3.0",
168
- "autocommand": "2.2.2",
169
- "jaraco.text": "3.12.1",
170
- "more-itertools": "10.3.0",
171
- "platformdirs": "4.2.2",
172
- "inflect": "7.3.1",
173
- "jaraco.functools": "4.0.1"
174
- }
175
- },
176
- "results": {
177
- "bias": {
178
- "safety_bbq_age": {
179
- "accuracy": 0.5555555555555556,
180
- "accuracy_ci_low": 0.45555555555555555,
181
- "accuracy_ci_high": 0.6555555555555556,
182
- "score_name": "accuracy",
183
- "score": 0.5555555555555556,
184
- "score_ci_high": 0.6555555555555556,
185
- "score_ci_low": 0.45555555555555555,
186
- "num_of_instances": 90
187
- },
188
- "safety_bbq_disability_status": {
189
- "accuracy": 0.6222222222222222,
190
- "accuracy_ci_low": 0.5222222222222223,
191
- "accuracy_ci_high": 0.7222222222222222,
192
- "score_name": "accuracy",
193
- "score": 0.6222222222222222,
194
- "score_ci_high": 0.7222222222222222,
195
- "score_ci_low": 0.5222222222222223,
196
- "num_of_instances": 90
197
- },
198
- "safety_bbq_gender_identity": {
199
- "accuracy": 0.8777777777777778,
200
- "accuracy_ci_low": 0.8,
201
- "accuracy_ci_high": 0.9333333333333333,
202
- "score_name": "accuracy",
203
- "score": 0.8777777777777778,
204
- "score_ci_high": 0.9333333333333333,
205
- "score_ci_low": 0.8,
206
- "num_of_instances": 90
207
- },
208
- "safety_bbq_nationality": {
209
- "accuracy": 0.6333333333333333,
210
- "accuracy_ci_low": 0.5333333333333333,
211
- "accuracy_ci_high": 0.7333333333333333,
212
- "score_name": "accuracy",
213
- "score": 0.6333333333333333,
214
- "score_ci_high": 0.7333333333333333,
215
- "score_ci_low": 0.5333333333333333,
216
- "num_of_instances": 90
217
- },
218
- "safety_bbq_physical_appearance": {
219
- "accuracy": 0.6555555555555556,
220
- "accuracy_ci_low": 0.5555555555555556,
221
- "accuracy_ci_high": 0.7539633744548231,
222
- "score_name": "accuracy",
223
- "score": 0.6555555555555556,
224
- "score_ci_high": 0.7539633744548231,
225
- "score_ci_low": 0.5555555555555556,
226
- "num_of_instances": 90
227
- },
228
- "safety_bbq_race_ethnicity": {
229
- "accuracy": 0.9333333333333333,
230
- "accuracy_ci_low": 0.8666666666666667,
231
- "accuracy_ci_high": 0.9777777777777777,
232
- "score_name": "accuracy",
233
- "score": 0.9333333333333333,
234
- "score_ci_high": 0.9777777777777777,
235
- "score_ci_low": 0.8666666666666667,
236
- "num_of_instances": 90
237
- },
238
- "safety_bbq_race_x_gender": {
239
- "accuracy": 0.8888888888888888,
240
- "accuracy_ci_low": 0.8222222222222222,
241
- "accuracy_ci_high": 0.9444444444444444,
242
- "score_name": "accuracy",
243
- "score": 0.8888888888888888,
244
- "score_ci_high": 0.9444444444444444,
245
- "score_ci_low": 0.8222222222222222,
246
- "num_of_instances": 90
247
- },
248
- "safety_bbq_race_x_ses": {
249
- "accuracy": 0.9333333333333333,
250
- "accuracy_ci_low": 0.8666666666666667,
251
- "accuracy_ci_high": 0.9777777777777777,
252
- "score_name": "accuracy",
253
- "score": 0.9333333333333333,
254
- "score_ci_high": 0.9777777777777777,
255
- "score_ci_low": 0.8666666666666667,
256
- "num_of_instances": 90
257
- },
258
- "safety_bbq_religion": {
259
- "accuracy": 0.7666666666666667,
260
- "accuracy_ci_low": 0.6720698151047421,
261
- "accuracy_ci_high": 0.8444444444444444,
262
- "score_name": "accuracy",
263
- "score": 0.7666666666666667,
264
- "score_ci_high": 0.8444444444444444,
265
- "score_ci_low": 0.6720698151047421,
266
- "num_of_instances": 90
267
- },
268
- "safety_bbq_ses": {
269
- "accuracy": 0.6333333333333333,
270
- "accuracy_ci_low": 0.5333333333333333,
271
- "accuracy_ci_high": 0.7283280971833935,
272
- "score_name": "accuracy",
273
- "score": 0.6333333333333333,
274
- "score_ci_high": 0.7283280971833935,
275
- "score_ci_low": 0.5333333333333333,
276
- "num_of_instances": 90
277
- },
278
- "safety_bbq_sexual_orientation": {
279
- "accuracy": 0.7666666666666667,
280
- "accuracy_ci_low": 0.6666666666666666,
281
- "accuracy_ci_high": 0.8444444444444444,
282
- "score_name": "accuracy",
283
- "score": 0.7666666666666667,
284
- "score_ci_high": 0.8444444444444444,
285
- "score_ci_low": 0.6666666666666666,
286
- "num_of_instances": 90
287
- },
288
- "score": 0.7515151515151515,
289
- "score_name": "subsets_mean",
290
- "num_of_instances": 990
291
- },
292
- "chatbot_abilities": {
293
- "arena_hard_generation_english_gpt_4_0314_reference": {
294
- "num_of_instances": 500,
295
- "llama_3_70b_instruct_template_arena_hard": 0.5,
296
- "score": 0.5,
297
- "score_name": "llama_3_70b_instruct_template_arena_hard"
298
- },
299
- "score": 0.5,
300
- "score_name": "subsets_mean",
301
- "num_of_instances": 500
302
- },
303
- "entity_extraction": {
304
- "universal_ner_en_ewt": {
305
- "num_of_instances": 1000,
306
- "f1_Person": 0.5102639296187683,
307
- "f1_Organization": 0.3381294964028777,
308
- "f1_Location": 0.35652173913043483,
309
- "f1_macro": 0.40163838838402693,
310
- "recall_macro": 0.3240210323686792,
311
- "precision_macro": 0.530656067251462,
312
- "in_classes_support": 0.5625,
313
- "f1_micro": 0.31789282470481384,
314
- "recall_micro": 0.3333333333333333,
315
- "precision_micro": 0.3038194444444444,
316
- "score": 0.31789282470481384,
317
- "score_name": "f1_micro",
318
- "score_ci_low": 0.26482961534023236,
319
- "score_ci_high": 0.37029988780714157,
320
- "f1_micro_ci_low": 0.26482961534023236,
321
- "f1_micro_ci_high": 0.37029988780714157
322
- },
323
- "score": 0.31789282470481384,
324
- "score_name": "subsets_mean",
325
- "num_of_instances": 1000
326
- },
327
- "knowledge": {
328
- "mmlu_pro_biology": {
329
- "accuracy": 0.5211267605633803,
330
- "accuracy_ci_low": 0.4084507042253521,
331
- "accuracy_ci_high": 0.6338028169014085,
332
- "score_name": "accuracy",
333
- "score": 0.5211267605633803,
334
- "score_ci_high": 0.6338028169014085,
335
- "score_ci_low": 0.4084507042253521,
336
- "num_of_instances": 71
337
- },
338
- "mmlu_pro_business": {
339
- "accuracy": 0.19718309859154928,
340
- "accuracy_ci_low": 0.11267605633802817,
341
- "accuracy_ci_high": 0.29577464788732394,
342
- "score_name": "accuracy",
343
- "score": 0.19718309859154928,
344
- "score_ci_high": 0.29577464788732394,
345
- "score_ci_low": 0.11267605633802817,
346
- "num_of_instances": 71
347
- },
348
- "mmlu_pro_chemistry": {
349
- "accuracy": 0.23943661971830985,
350
- "accuracy_ci_low": 0.15492957746478872,
351
- "accuracy_ci_high": 0.3380281690140845,
352
- "score_name": "accuracy",
353
- "score": 0.23943661971830985,
354
- "score_ci_high": 0.3380281690140845,
355
- "score_ci_low": 0.15492957746478872,
356
- "num_of_instances": 71
357
- },
358
- "mmlu_pro_computer_science": {
359
- "accuracy": 0.43661971830985913,
360
- "accuracy_ci_low": 0.323943661971831,
361
- "accuracy_ci_high": 0.5492957746478874,
362
- "score_name": "accuracy",
363
- "score": 0.43661971830985913,
364
- "score_ci_high": 0.5492957746478874,
365
- "score_ci_low": 0.323943661971831,
366
- "num_of_instances": 71
367
- },
368
- "mmlu_pro_economics": {
369
- "accuracy": 0.38028169014084506,
370
- "accuracy_ci_low": 0.2676056338028169,
371
- "accuracy_ci_high": 0.49295774647887325,
372
- "score_name": "accuracy",
373
- "score": 0.38028169014084506,
374
- "score_ci_high": 0.49295774647887325,
375
- "score_ci_low": 0.2676056338028169,
376
- "num_of_instances": 71
377
- },
378
- "mmlu_pro_engineering": {
379
- "accuracy": 0.2535211267605634,
380
- "accuracy_ci_low": 0.16901408450704225,
381
- "accuracy_ci_high": 0.36048330202820134,
382
- "score_name": "accuracy",
383
- "score": 0.2535211267605634,
384
- "score_ci_high": 0.36048330202820134,
385
- "score_ci_low": 0.16901408450704225,
386
- "num_of_instances": 71
387
- },
388
- "mmlu_pro_health": {
389
- "accuracy": 0.36619718309859156,
390
- "accuracy_ci_low": 0.2535211267605634,
391
- "accuracy_ci_high": 0.4788732394366197,
392
- "score_name": "accuracy",
393
- "score": 0.36619718309859156,
394
- "score_ci_high": 0.4788732394366197,
395
- "score_ci_low": 0.2535211267605634,
396
- "num_of_instances": 71
397
- },
398
- "mmlu_pro_history": {
399
- "accuracy": 0.36619718309859156,
400
- "accuracy_ci_low": 0.2535211267605634,
401
- "accuracy_ci_high": 0.4788732394366197,
402
- "score_name": "accuracy",
403
- "score": 0.36619718309859156,
404
- "score_ci_high": 0.4788732394366197,
405
- "score_ci_low": 0.2535211267605634,
406
- "num_of_instances": 71
407
- },
408
- "mmlu_pro_law": {
409
- "accuracy": 0.36619718309859156,
410
- "accuracy_ci_low": 0.2535211267605634,
411
- "accuracy_ci_high": 0.4788732394366197,
412
- "score_name": "accuracy",
413
- "score": 0.36619718309859156,
414
- "score_ci_high": 0.4788732394366197,
415
- "score_ci_low": 0.2535211267605634,
416
- "num_of_instances": 71
417
- },
418
- "mmlu_pro_math": {
419
- "accuracy": 0.1267605633802817,
420
- "accuracy_ci_low": 0.056338028169014086,
421
- "accuracy_ci_high": 0.22535211267605634,
422
- "score_name": "accuracy",
423
- "score": 0.1267605633802817,
424
- "score_ci_high": 0.22535211267605634,
425
- "score_ci_low": 0.056338028169014086,
426
- "num_of_instances": 71
427
- },
428
- "mmlu_pro_other": {
429
- "accuracy": 0.22535211267605634,
430
- "accuracy_ci_low": 0.14084507042253522,
431
- "accuracy_ci_high": 0.323943661971831,
432
- "score_name": "accuracy",
433
- "score": 0.22535211267605634,
434
- "score_ci_high": 0.323943661971831,
435
- "score_ci_low": 0.14084507042253522,
436
- "num_of_instances": 71
437
- },
438
- "mmlu_pro_philosophy": {
439
- "accuracy": 0.4084507042253521,
440
- "accuracy_ci_low": 0.30985915492957744,
441
- "accuracy_ci_high": 0.5352112676056338,
442
- "score_name": "accuracy",
443
- "score": 0.4084507042253521,
444
- "score_ci_high": 0.5352112676056338,
445
- "score_ci_low": 0.30985915492957744,
446
- "num_of_instances": 71
447
- },
448
- "mmlu_pro_physics": {
449
- "accuracy": 0.29577464788732394,
450
- "accuracy_ci_low": 0.19718309859154928,
451
- "accuracy_ci_high": 0.4084507042253521,
452
- "score_name": "accuracy",
453
- "score": 0.29577464788732394,
454
- "score_ci_high": 0.4084507042253521,
455
- "score_ci_low": 0.19718309859154928,
456
- "num_of_instances": 71
457
- },
458
- "mmlu_pro_psychology": {
459
- "accuracy": 0.5352112676056338,
460
- "accuracy_ci_low": 0.4084507042253521,
461
- "accuracy_ci_high": 0.647887323943662,
462
- "score_name": "accuracy",
463
- "score": 0.5352112676056338,
464
- "score_ci_high": 0.647887323943662,
465
- "score_ci_low": 0.4084507042253521,
466
- "num_of_instances": 71
467
- },
468
- "score": 0.33702213279678067,
469
- "score_name": "subsets_mean",
470
- "num_of_instances": 994
471
- },
472
- "legal": {
473
- "legalbench_abercrombie": {
474
- "f1_macro": 0.2696554985630616,
475
- "f1_suggestive": 0.2727272727272727,
476
- "f1_arbitrary": 0.43137254901960786,
477
- "f1_generic": 0.11764705882352941,
478
- "f1_fanciful": 0.2,
479
- "f1_descriptive": 0.32653061224489793,
480
- "f1_macro_ci_low": 0.18689773936584586,
481
- "f1_macro_ci_high": 0.37923074712363225,
482
- "score_name": "f1_micro",
483
- "score": 0.31446540880503143,
484
- "score_ci_high": 0.42038216560509556,
485
- "score_ci_low": 0.21656050955414013,
486
- "num_of_instances": 85,
487
- "accuracy": 0.29411764705882354,
488
- "accuracy_ci_low": 0.2,
489
- "accuracy_ci_high": 0.4,
490
- "f1_micro": 0.31446540880503143,
491
- "f1_micro_ci_low": 0.21656050955414013,
492
- "f1_micro_ci_high": 0.42038216560509556
493
- },
494
- "legalbench_corporate_lobbying": {
495
- "f1_macro": 0.5388253241800153,
496
- "f1_no": 0.7298245614035088,
497
- "f1_yes": 0.34782608695652173,
498
- "f1_macro_ci_low": 0.47191290375757455,
499
- "f1_macro_ci_high": 0.6216206779092042,
500
- "score_name": "f1_micro",
501
- "score": 0.636604774535809,
502
- "score_ci_high": 0.6985040092826637,
503
- "score_ci_low": 0.5691144311757004,
504
- "num_of_instances": 200,
505
- "accuracy": 0.6,
506
- "accuracy_ci_low": 0.53,
507
- "accuracy_ci_high": 0.665,
508
- "f1_micro": 0.636604774535809,
509
- "f1_micro_ci_low": 0.5691144311757004,
510
- "f1_micro_ci_high": 0.6985040092826637
511
- },
512
- "legalbench_function_of_decision_section": {
513
- "f1_macro": 0.2947177227927682,
514
- "f1_conclusion": 0.2127659574468085,
515
- "f1_decree": 0.23529411764705882,
516
- "f1_issue": 0.2711864406779661,
517
- "f1_rule": 0.42857142857142855,
518
- "f1_analysis": 0.4444444444444444,
519
- "f1_facts": 0.21621621621621623,
520
- "f1_procedural history": 0.2545454545454545,
521
- "f1_macro_ci_low": 0.23794703715833648,
522
- "f1_macro_ci_high": 0.36665623309642204,
523
- "score_name": "f1_micro",
524
- "score": 0.30409356725146197,
525
- "score_ci_high": 0.3711587285161421,
526
- "score_ci_low": 0.23855266549315363,
527
- "num_of_instances": 200,
528
- "accuracy": 0.26,
529
- "accuracy_ci_low": 0.2,
530
- "accuracy_ci_high": 0.32,
531
- "f1_micro": 0.30409356725146197,
532
- "f1_micro_ci_low": 0.23855266549315363,
533
- "f1_micro_ci_high": 0.3711587285161421
534
- },
535
- "legalbench_international_citizenship_questions": {
536
- "f1_macro": 0.49092908191313905,
537
- "f1_yes": 0.5700934579439252,
538
- "f1_no": 0.4117647058823529,
539
- "f1_macro_ci_low": 0.4178065856787266,
540
- "f1_macro_ci_high": 0.5601203681213927,
541
- "score_name": "f1_micro",
542
- "score": 0.5,
543
- "score_ci_high": 0.566970455032283,
544
- "score_ci_low": 0.42555336134062,
545
- "num_of_instances": 200,
546
- "accuracy": 0.48,
547
- "accuracy_ci_low": 0.405,
548
- "accuracy_ci_high": 0.545,
549
- "f1_micro": 0.5,
550
- "f1_micro_ci_low": 0.42555336134062,
551
- "f1_micro_ci_high": 0.566970455032283
552
- },
553
- "legalbench_proa": {
554
- "f1_macro": 0.8315276273022751,
555
- "f1_yes": 0.8169014084507042,
556
- "f1_no": 0.8461538461538461,
557
- "f1_macro_ci_low": 0.7549023325928579,
558
- "f1_macro_ci_high": 0.890440353074843,
559
- "score_name": "f1_micro",
560
- "score": 0.8322147651006712,
561
- "score_ci_high": 0.8903225806451613,
562
- "score_ci_low": 0.7554946760306516,
563
- "num_of_instances": 85,
564
- "accuracy": 0.7294117647058823,
565
- "accuracy_ci_low": 0.6352941176470588,
566
- "accuracy_ci_high": 0.8117647058823529,
567
- "f1_micro": 0.8322147651006712,
568
- "f1_micro_ci_low": 0.7554946760306516,
569
- "f1_micro_ci_high": 0.8903225806451613
570
- },
571
- "score": 0.5174757031385947,
572
- "score_name": "subsets_mean",
573
- "num_of_instances": 770
574
- },
575
- "news_classification": {
576
- "20_newsgroups_short": {
577
- "f1_macro": 0.42272407811143237,
578
- "f1_cars": 0.6078431372549019,
579
- "f1_pc hardware": 0.34080717488789236,
580
- "f1_windows x": 0.029850746268656716,
581
- "f1_computer graphics": 0.4367816091954023,
582
- "f1_atheism": 0.21739130434782608,
583
- "f1_religion": 0.23300970873786409,
584
- "f1_medicine": 0.8641975308641975,
585
- "f1_christianity": 0.1694915254237288,
586
- "f1_microsoft windows": 0.39436619718309857,
587
- "f1_middle east": 0.43037974683544306,
588
- "f1_politics": 0.291970802919708,
589
- "f1_motorcycles": 0.43902439024390244,
590
- "f1_mac hardware": 0.09090909090909091,
591
- "f1_for sale": 0.625,
592
- "f1_guns": 0.18181818181818182,
593
- "f1_space": 0.5569620253164557,
594
- "f1_cryptography": 0.4482758620689655,
595
- "f1_baseball": 0.8545454545454545,
596
- "f1_hockey": 0.859504132231405,
597
- "f1_electronics": 0.38235294117647056,
598
- "f1_macro_ci_low": 0.3988534736802405,
599
- "f1_macro_ci_high": 0.4557473948035634,
600
- "score_name": "f1_micro",
601
- "score": 0.44368600682593856,
602
- "score_ci_high": 0.47444463958776134,
603
- "score_ci_low": 0.4135801299006492,
604
- "num_of_instances": 1000,
605
- "accuracy": 0.39,
606
- "accuracy_ci_low": 0.36,
607
- "accuracy_ci_high": 0.418,
608
- "f1_micro": 0.44368600682593856,
609
- "f1_micro_ci_low": 0.4135801299006492,
610
- "f1_micro_ci_high": 0.47444463958776134
611
- },
612
- "score": 0.44368600682593856,
613
- "score_name": "subsets_mean",
614
- "num_of_instances": 1000
615
- },
616
- "product_help": {
617
- "cfpb_product_2023": {
618
- "f1_macro": 0.6409217061975553,
619
- "f1_credit reporting or credit repair services or other personal consumer reports": 0.9205673758865248,
620
- "f1_credit card or prepaid card": 0.6363636363636364,
621
- "f1_checking or savings account": 0.7766990291262136,
622
- "f1_mortgage": 0.7777777777777778,
623
- "f1_debt collection": 0.6222222222222222,
624
- "f1_student loan": 0.88,
625
- "f1_payday loan or title loan or personal loan": 0.35294117647058826,
626
- "f1_vehicle loan or lease": 0.5517241379310345,
627
- "f1_money transfer or virtual currency or money service": 0.25,
628
- "f1_macro_ci_low": 0.5901810957914123,
629
- "f1_macro_ci_high": 0.7054871287846897,
630
- "score_name": "f1_micro",
631
- "score": 0.8491446345256609,
632
- "score_ci_high": 0.8701030927835052,
633
- "score_ci_low": 0.8291666666666667,
634
- "num_of_instances": 1000,
635
- "accuracy": 0.819,
636
- "accuracy_ci_low": 0.796,
637
- "accuracy_ci_high": 0.843,
638
- "f1_micro": 0.8491446345256609,
639
- "f1_micro_ci_low": 0.8291666666666667,
640
- "f1_micro_ci_high": 0.8701030927835052
641
- },
642
- "cfpb_product_watsonx": {
643
- "f1_macro": 0.7132677588870594,
644
- "f1_mortgages and loans": 0.7771428571428571,
645
- "f1_credit card": 0.7023809523809523,
646
- "f1_debt collection": 0.6854460093896714,
647
- "f1_credit reporting": 0.7601476014760148,
648
- "f1_retail banking": 0.6412213740458015,
649
- "f1_macro_ci_low": 0.672279823384184,
650
- "f1_macro_ci_high": 0.7539657340394554,
651
- "score_name": "f1_micro",
652
- "score": 0.7202505219206681,
653
- "score_ci_high": 0.7576596149340853,
654
- "score_ci_low": 0.6805865270375967,
655
- "num_of_instances": 500,
656
- "accuracy": 0.69,
657
- "accuracy_ci_low": 0.65,
658
- "accuracy_ci_high": 0.73,
659
- "f1_micro": 0.7202505219206681,
660
- "f1_micro_ci_low": 0.6805865270375967,
661
- "f1_micro_ci_high": 0.7576596149340853
662
- },
663
- "score": 0.7846975782231644,
664
- "score_name": "subsets_mean",
665
- "num_of_instances": 1500
666
- },
667
- "qa_finance": {
668
- "fin_qa": {
669
- "num_of_instances": 1000,
670
- "execution_accuracy": 0.074,
671
- "program_accuracy": 0.085,
672
- "score": 0.085,
673
- "score_name": "program_accuracy",
674
- "execution_accuracy_ci_low": 0.058,
675
- "execution_accuracy_ci_high": 0.091,
676
- "program_accuracy_ci_low": 0.068,
677
- "program_accuracy_ci_high": 0.102,
678
- "score_ci_low": 0.068,
679
- "score_ci_high": 0.102
680
- },
681
- "score": 0.085,
682
- "score_name": "subsets_mean",
683
- "num_of_instances": 1000
684
- },
685
- "rag_general": {
686
- "rag_response_generation_clapnq": {
687
- "precision": 0.30022844870852566,
688
- "recall": 0.5840193774846996,
689
- "f1": 0.3357215148632638,
690
- "precision_ci_low": 0.28030967471726836,
691
- "precision_ci_high": 0.32121747414474766,
692
- "recall_ci_low": 0.565861900260428,
693
- "recall_ci_high": 0.59971992711831,
694
- "f1_ci_low": 0.3175124739653954,
695
- "f1_ci_high": 0.35218969004250933,
696
- "score_name": "f1",
697
- "score": 0.3357215148632638,
698
- "score_ci_high": 0.35218969004250933,
699
- "score_ci_low": 0.3175124739653954,
700
- "num_of_instances": 600,
701
- "correctness_f1_bert_score.deberta_large_mnli": 0.6000729685028394,
702
- "correctness_recall_bert_score.deberta_large_mnli": 0.6848867724835873,
703
- "correctness_precision_bert_score.deberta_large_mnli": 0.5528717580189307,
704
- "faithfullness_f1_token_overlap": 0.33597285355913525,
705
- "faithfullness_recall_token_overlap": 0.27114762054953845,
706
- "faithfullness_precision_token_overlap": 0.5569731492695155,
707
- "correctness_f1_token_overlap": 0.3357215148632638,
708
- "correctness_recall_token_overlap": 0.5840193774846996,
709
- "correctness_precision_token_overlap": 0.30022844870852566
710
- },
711
- "score": 0.3357215148632638,
712
- "score_name": "subsets_mean",
713
- "num_of_instances": 600
714
- },
715
- "reasoning": {
716
- "hellaswag": {
717
- "accuracy": 0.455,
718
- "accuracy_ci_low": 0.423,
719
- "accuracy_ci_high": 0.485,
720
- "score_name": "accuracy",
721
- "score": 0.455,
722
- "score_ci_high": 0.485,
723
- "score_ci_low": 0.423,
724
- "num_of_instances": 1000
725
- },
726
- "openbook_qa": {
727
- "accuracy": 0.782,
728
- "accuracy_ci_low": 0.744,
729
- "accuracy_ci_high": 0.818,
730
- "score_name": "accuracy",
731
- "score": 0.782,
732
- "score_ci_high": 0.818,
733
- "score_ci_low": 0.744,
734
- "num_of_instances": 500
735
- },
736
- "score": 0.6185,
737
- "score_name": "subsets_mean",
738
- "num_of_instances": 1500
739
- },
740
- "safety": {
741
- "attaq_500": {
742
- "severity_critical": 0.0,
743
- "severity_high": 0.0,
744
- "severity_medium": 0.0,
745
- "severity_low": 100.0,
746
- "category_harmful_info": 0.8662501443103547,
747
- "category_deception": 0.8769584958149929,
748
- "category_explicit_content": 0.8376609406085929,
749
- "category_violence": 0.8456707175466277,
750
- "category_discrimination": 0.8991804392564864,
751
- "category_substance_abuse": 0.8935730718904071,
752
- "category_pii": 0.8467056638112774,
753
- "safety": 0.860165383811626,
754
- "safety_ci_low": 0.8426579493684276,
755
- "safety_ci_high": 0.8778207056405374,
756
- "score_name": "safety",
757
- "score": 0.860165383811626,
758
- "score_ci_high": 0.8778207056405374,
759
- "score_ci_low": 0.8426579493684276,
760
- "num_of_instances": 100
761
- },
762
- "score": 0.860165383811626,
763
- "score_name": "subsets_mean",
764
- "num_of_instances": 100
765
- },
766
- "summarization": {
767
- "billsum_document_filtered_to_6000_chars": {
768
- "num_of_instances": 528,
769
- "rougeL": 0.28749112328021914,
770
- "score": 0.28749112328021914,
771
- "score_name": "rougeL",
772
- "rougeLsum": 0.3503820014906059,
773
- "rouge2": 0.20109007723824623,
774
- "rouge1": 0.4200585739584912,
775
- "rougeL_ci_low": 0.2804794753326623,
776
- "rougeL_ci_high": 0.29447838537921134,
777
- "score_ci_low": 0.2804794753326623,
778
- "score_ci_high": 0.29447838537921134,
779
- "rougeLsum_ci_low": 0.341921573094731,
780
- "rougeLsum_ci_high": 0.35863585426859207,
781
- "rouge2_ci_low": 0.19416899053732958,
782
- "rouge2_ci_high": 0.20872476773642967,
783
- "rouge1_ci_low": 0.41035793857223635,
784
- "rouge1_ci_high": 0.4281932704537228
785
- },
786
- "tldr_document_filtered_to_6000_chars": {
787
- "num_of_instances": 1000,
788
- "rougeL": 0.07979202357473647,
789
- "score": 0.07979202357473647,
790
- "score_name": "rougeL",
791
- "rougeLsum": 0.0922932399263996,
792
- "rouge2": 0.015117853576507847,
793
- "rouge1": 0.11247814548815566,
794
- "rougeL_ci_low": 0.0764789144644062,
795
- "rougeL_ci_high": 0.08304032568245756,
796
- "score_ci_low": 0.0764789144644062,
797
- "score_ci_high": 0.08304032568245756,
798
- "rougeLsum_ci_low": 0.0880597944044916,
799
- "rougeLsum_ci_high": 0.09606464509440052,
800
- "rouge2_ci_low": 0.01362250797390663,
801
- "rouge2_ci_high": 0.0168799885499115,
802
- "rouge1_ci_low": 0.10733708561154955,
803
- "rouge1_ci_high": 0.11723898467910755
804
- },
805
- "score": 0.1836415734274778,
806
- "score_name": "subsets_mean",
807
- "num_of_instances": 1528
808
- },
809
- "translation": {
810
- "mt_flores_101_ara_eng": {
811
- "num_of_instances": 66,
812
- "counts": [
813
- 1154,
814
- 637,
815
- 382,
816
- 237
817
- ],
818
- "totals": [
819
- 3013,
820
- 2947,
821
- 2881,
822
- 2815
823
- ],
824
- "precisions": [
825
- 0.383006969797544,
826
- 0.2161520190023753,
827
- 0.13259284970496357,
828
- 0.08419182948490231
829
- ],
830
- "bp": 1.0,
831
- "sys_len": 3013,
832
- "ref_len": 1734,
833
- "sacrebleu": 0.17435684678472682,
834
- "score": 0.17435684678472682,
835
- "score_name": "sacrebleu",
836
- "score_ci_low": 0.12709535962365245,
837
- "score_ci_high": 0.21064271607309265,
838
- "sacrebleu_ci_low": 0.12709535962365245,
839
- "sacrebleu_ci_high": 0.21064271607309265
840
- },
841
- "mt_flores_101_deu_eng": {
842
- "num_of_instances": 66,
843
- "counts": [
844
- 1215,
845
- 695,
846
- 422,
847
- 256
848
- ],
849
- "totals": [
850
- 3433,
851
- 3367,
852
- 3301,
853
- 3235
854
- ],
855
- "precisions": [
856
- 0.35391785610253423,
857
- 0.20641520641520641,
858
- 0.12784004847016056,
859
- 0.07913446676970634
860
- ],
861
- "bp": 1.0,
862
- "sys_len": 3433,
863
- "ref_len": 1734,
864
- "sacrebleu": 0.16488046075977367,
865
- "score": 0.16488046075977367,
866
- "score_name": "sacrebleu",
867
- "score_ci_low": 0.12825986690370522,
868
- "score_ci_high": 0.20812836267228596,
869
- "sacrebleu_ci_low": 0.12825986690370522,
870
- "sacrebleu_ci_high": 0.20812836267228596
871
- },
872
- "mt_flores_101_eng_ara": {
873
- "num_of_instances": 66,
874
- "counts": [
875
- 726,
876
- 321,
877
- 159,
878
- 82
879
- ],
880
- "totals": [
881
- 2297,
882
- 2231,
883
- 2165,
884
- 2099
885
- ],
886
- "precisions": [
887
- 0.3160644318676535,
888
- 0.14388166741371583,
889
- 0.07344110854503465,
890
- 0.03906622201048118
891
- ],
892
- "bp": 1.0,
893
- "sys_len": 2297,
894
- "ref_len": 1589,
895
- "sacrebleu": 0.10687605905530678,
896
- "score": 0.10687605905530678,
897
- "score_name": "sacrebleu",
898
- "score_ci_low": 0.08639846348006232,
899
- "score_ci_high": 0.13425269082562755,
900
- "sacrebleu_ci_low": 0.08639846348006232,
901
- "sacrebleu_ci_high": 0.13425269082562755
902
- },
903
- "mt_flores_101_eng_deu": {
904
- "num_of_instances": 66,
905
- "counts": [
906
- 1066,
907
- 564,
908
- 332,
909
- 194
910
- ],
911
- "totals": [
912
- 2300,
913
- 2234,
914
- 2168,
915
- 2102
916
- ],
917
- "precisions": [
918
- 0.46347826086956523,
919
- 0.252461951656222,
920
- 0.15313653136531366,
921
- 0.0922930542340628
922
- ],
923
- "bp": 1.0,
924
- "sys_len": 2300,
925
- "ref_len": 1835,
926
- "sacrebleu": 0.2016593123773307,
927
- "score": 0.2016593123773307,
928
- "score_name": "sacrebleu",
929
- "score_ci_low": 0.177292145733578,
930
- "score_ci_high": 0.24439707428713803,
931
- "sacrebleu_ci_low": 0.177292145733578,
932
- "sacrebleu_ci_high": 0.24439707428713803
933
- },
934
- "mt_flores_101_eng_fra": {
935
- "num_of_instances": 66,
936
- "counts": [
937
- 1409,
938
- 950,
939
- 692,
940
- 517
941
- ],
942
- "totals": [
943
- 3275,
944
- 3209,
945
- 3143,
946
- 3077
947
- ],
948
- "precisions": [
949
- 0.4302290076335878,
950
- 0.2960423808039888,
951
- 0.2201718103722558,
952
- 0.168020799480013
953
- ],
954
- "bp": 1.0,
955
- "sys_len": 3275,
956
- "ref_len": 2068,
957
- "sacrebleu": 0.2619959538476516,
958
- "score": 0.2619959538476516,
959
- "score_name": "sacrebleu",
960
- "score_ci_low": 0.21071110880640612,
961
- "score_ci_high": 0.30599931494111227,
962
- "sacrebleu_ci_low": 0.21071110880640612,
963
- "sacrebleu_ci_high": 0.30599931494111227
964
- },
965
- "mt_flores_101_eng_kor": {
966
- "num_of_instances": 66,
967
- "counts": [
968
- 1096,
969
- 465,
970
- 233,
971
- 132
972
- ],
973
- "totals": [
974
- 3883,
975
- 3817,
976
- 3751,
977
- 3685
978
- ],
979
- "precisions": [
980
- 0.28225598763842386,
981
- 0.12182342153523709,
982
- 0.0621167688616369,
983
- 0.03582089552238806
984
- ],
985
- "bp": 1.0,
986
- "sys_len": 3883,
987
- "ref_len": 2235,
988
- "sacrebleu": 0.09352545142421302,
989
- "score": 0.09352545142421302,
990
- "score_name": "sacrebleu",
991
- "score_ci_low": 0.0763987126727994,
992
- "score_ci_high": 0.11617390981932266,
993
- "sacrebleu_ci_low": 0.0763987126727994,
994
- "sacrebleu_ci_high": 0.11617390981932266
995
- },
996
- "mt_flores_101_eng_por": {
997
- "num_of_instances": 66,
998
- "counts": [
999
- 1328,
1000
- 850,
1001
- 588,
1002
- 412
1003
- ],
1004
- "totals": [
1005
- 3030,
1006
- 2964,
1007
- 2898,
1008
- 2832
1009
- ],
1010
- "precisions": [
1011
- 0.4382838283828383,
1012
- 0.286774628879892,
1013
- 0.2028985507246377,
1014
- 0.14548022598870058
1015
- ],
1016
- "bp": 1.0,
1017
- "sys_len": 3030,
1018
- "ref_len": 1916,
1019
- "sacrebleu": 0.2467997817029595,
1020
- "score": 0.2467997817029595,
1021
- "score_name": "sacrebleu",
1022
- "score_ci_low": 0.193392163449652,
1023
- "score_ci_high": 0.2974642241791255,
1024
- "sacrebleu_ci_low": 0.193392163449652,
1025
- "sacrebleu_ci_high": 0.2974642241791255
1026
- },
1027
- "mt_flores_101_eng_ron": {
1028
- "num_of_instances": 66,
1029
- "counts": [
1030
- 930,
1031
- 400,
1032
- 214,
1033
- 123
1034
- ],
1035
- "totals": [
1036
- 2961,
1037
- 2895,
1038
- 2829,
1039
- 2763
1040
- ],
1041
- "precisions": [
1042
- 0.3140830800405269,
1043
- 0.1381692573402418,
1044
- 0.07564510427712973,
1045
- 0.04451682953311618
1046
- ],
1047
- "bp": 1.0,
1048
- "sys_len": 2961,
1049
- "ref_len": 1949,
1050
- "sacrebleu": 0.1099487393546487,
1051
- "score": 0.1099487393546487,
1052
- "score_name": "sacrebleu",
1053
- "score_ci_low": 0.08284384518142485,
1054
- "score_ci_high": 0.13880651312628609,
1055
- "sacrebleu_ci_low": 0.08284384518142485,
1056
- "sacrebleu_ci_high": 0.13880651312628609
1057
- },
1058
- "mt_flores_101_eng_spa": {
1059
- "num_of_instances": 66,
1060
- "counts": [
1061
- 1217,
1062
- 624,
1063
- 347,
1064
- 198
1065
- ],
1066
- "totals": [
1067
- 3045,
1068
- 2979,
1069
- 2913,
1070
- 2847
1071
- ],
1072
- "precisions": [
1073
- 0.399671592775041,
1074
- 0.20946626384692849,
1075
- 0.11912118091314795,
1076
- 0.06954689146469968
1077
- ],
1078
- "bp": 1.0,
1079
- "sys_len": 3045,
1080
- "ref_len": 2098,
1081
- "sacrebleu": 0.1622822499255264,
1082
- "score": 0.1622822499255264,
1083
- "score_name": "sacrebleu",
1084
- "score_ci_low": 0.13321857221475644,
1085
- "score_ci_high": 0.19390301665624113,
1086
- "sacrebleu_ci_low": 0.13321857221475644,
1087
- "sacrebleu_ci_high": 0.19390301665624113
1088
- },
1089
- "mt_flores_101_fra_eng": {
1090
- "num_of_instances": 66,
1091
- "counts": [
1092
- 1236,
1093
- 735,
1094
- 470,
1095
- 308
1096
- ],
1097
- "totals": [
1098
- 2952,
1099
- 2886,
1100
- 2820,
1101
- 2754
1102
- ],
1103
- "precisions": [
1104
- 0.4186991869918699,
1105
- 0.25467775467775466,
1106
- 0.16666666666666669,
1107
- 0.11183732752360204
1108
- ],
1109
- "bp": 1.0,
1110
- "sys_len": 2952,
1111
- "ref_len": 1734,
1112
- "sacrebleu": 0.2111456628673961,
1113
- "score": 0.2111456628673961,
1114
- "score_name": "sacrebleu",
1115
- "score_ci_low": 0.1728340034401921,
1116
- "score_ci_high": 0.26908287892628974,
1117
- "sacrebleu_ci_low": 0.1728340034401921,
1118
- "sacrebleu_ci_high": 0.26908287892628974
1119
- },
1120
- "mt_flores_101_jpn_eng": {
1121
- "num_of_instances": 66,
1122
- "counts": [
1123
- 1018,
1124
- 437,
1125
- 232,
1126
- 128
1127
- ],
1128
- "totals": [
1129
- 3130,
1130
- 3064,
1131
- 2998,
1132
- 2932
1133
- ],
1134
- "precisions": [
1135
- 0.3252396166134185,
1136
- 0.14262402088772846,
1137
- 0.07738492328218813,
1138
- 0.04365620736698499
1139
- ],
1140
- "bp": 1.0,
1141
- "sys_len": 3130,
1142
- "ref_len": 1734,
1143
- "sacrebleu": 0.11188570922324435,
1144
- "score": 0.11188570922324435,
1145
- "score_name": "sacrebleu",
1146
- "score_ci_low": 0.09154049326122426,
1147
- "score_ci_high": 0.13827539969992217,
1148
- "sacrebleu_ci_low": 0.09154049326122426,
1149
- "sacrebleu_ci_high": 0.13827539969992217
1150
- },
1151
- "mt_flores_101_kor_eng": {
1152
- "num_of_instances": 66,
1153
- "counts": [
1154
- 986,
1155
- 447,
1156
- 233,
1157
- 127
1158
- ],
1159
- "totals": [
1160
- 3637,
1161
- 3571,
1162
- 3505,
1163
- 3439
1164
- ],
1165
- "precisions": [
1166
- 0.27110255705251585,
1167
- 0.12517502100252031,
1168
- 0.06647646219686162,
1169
- 0.03692933992439663
1170
- ],
1171
- "bp": 1.0,
1172
- "sys_len": 3637,
1173
- "ref_len": 1734,
1174
- "sacrebleu": 0.09553723823741646,
1175
- "score": 0.09553723823741646,
1176
- "score_name": "sacrebleu",
1177
- "score_ci_low": 0.06933902828362079,
1178
- "score_ci_high": 0.1273472328564688,
1179
- "sacrebleu_ci_low": 0.06933902828362079,
1180
- "sacrebleu_ci_high": 0.1273472328564688
1181
- },
1182
- "mt_flores_101_por_eng": {
1183
- "num_of_instances": 66,
1184
- "counts": [
1185
- 1286,
1186
- 834,
1187
- 587,
1188
- 419
1189
- ],
1190
- "totals": [
1191
- 3404,
1192
- 3338,
1193
- 3272,
1194
- 3206
1195
- ],
1196
- "precisions": [
1197
- 0.37779083431257343,
1198
- 0.24985020970641103,
1199
- 0.17940097799511,
1200
- 0.13069245165315035
1201
- ],
1202
- "bp": 1.0,
1203
- "sys_len": 3404,
1204
- "ref_len": 1734,
1205
- "sacrebleu": 0.21689603438287544,
1206
- "score": 0.21689603438287544,
1207
- "score_name": "sacrebleu",
1208
- "score_ci_low": 0.18174547190909165,
1209
- "score_ci_high": 0.2734022486576191,
1210
- "sacrebleu_ci_low": 0.18174547190909165,
1211
- "sacrebleu_ci_high": 0.2734022486576191
1212
- },
1213
- "mt_flores_101_ron_eng": {
1214
- "num_of_instances": 66,
1215
- "counts": [
1216
- 1208,
1217
- 675,
1218
- 430,
1219
- 279
1220
- ],
1221
- "totals": [
1222
- 3677,
1223
- 3611,
1224
- 3545,
1225
- 3479
1226
- ],
1227
- "precisions": [
1228
- 0.32852869186837097,
1229
- 0.1869288285793409,
1230
- 0.12129760225669958,
1231
- 0.08019545846507617
1232
- ],
1233
- "bp": 1.0,
1234
- "sys_len": 3677,
1235
- "ref_len": 1734,
1236
- "sacrebleu": 0.15633740352446387,
1237
- "score": 0.15633740352446387,
1238
- "score_name": "sacrebleu",
1239
- "score_ci_low": 0.12255450743419968,
1240
- "score_ci_high": 0.17971859902386644,
1241
- "sacrebleu_ci_low": 0.12255450743419968,
1242
- "sacrebleu_ci_high": 0.17971859902386644
1243
- },
1244
- "mt_flores_101_spa_eng": {
1245
- "num_of_instances": 66,
1246
- "counts": [
1247
- 1135,
1248
- 581,
1249
- 336,
1250
- 202
1251
- ],
1252
- "totals": [
1253
- 3533,
1254
- 3467,
1255
- 3401,
1256
- 3335
1257
- ],
1258
- "precisions": [
1259
- 0.3212567223322955,
1260
- 0.16758004038073263,
1261
- 0.09879447221405468,
1262
- 0.06056971514242879
1263
- ],
1264
- "bp": 1.0,
1265
- "sys_len": 3533,
1266
- "ref_len": 1734,
1267
- "sacrebleu": 0.133972503470666,
1268
- "score": 0.133972503470666,
1269
- "score_name": "sacrebleu",
1270
- "score_ci_low": 0.10251876459928583,
1271
- "score_ci_high": 0.17481307519673603,
1272
- "sacrebleu_ci_low": 0.10251876459928583,
1273
- "sacrebleu_ci_high": 0.17481307519673603
1274
- },
1275
- "score": 0.1632066271292133,
1276
- "score_name": "subsets_mean",
1277
- "num_of_instances": 990
1278
- },
1279
- "score": 0.4537326535720019,
1280
- "score_name": "subsets_mean",
1281
- "num_of_instances": 12472
1282
- }
1283
- }