Aaron Mueller
updated filtering, add F= tab
1d8e193
{
"method_name": "EAP (CF)",
"results": [
{
"model_id": "gemma2",
"scores": {
"mcqa": {
"edge_counts": [
40.0,
76.0,
176.0,
332.0,
962.0,
3256.0,
6866.0,
14441.0,
36027.0,
74218.0
],
"faithfulness": [
0.05782312925170068,
0.06462585034013606,
0.13435374149659865,
0.19727891156462585,
0.3521471088435374,
0.9591836734693877,
1.2857142857142858,
1.435374149659864,
1.5510204081632653,
1.0
]
},
"arc_easy": {
"edge_counts": [
33.0,
85.0,
231.0,
451.0,
1054.0,
3305.0,
7024.0,
14506.0,
36031.0,
74218.0
],
"faithfulness": [
0.04447852760736196,
0.06441717791411043,
0.21779141104294478,
0.3159509202453988,
0.5076687116564417,
0.8374233128834356,
1.0245398773006136,
1.138036809815951,
1.334355828220859,
1.0
]
},
"ioi": {
"edge_counts": [
1193.0,
2592.0,
7327.0,
15198.0,
30956.0,
78020.0,
156182.0,
311361.0,
768765.0,
1592881.0
],
"faithfulness": [
0.08709677419354839,
0.16612903225806452,
0.37661290322580643,
0.5518145161290322,
0.8741935483870967,
1.0354838709677419,
1.3612903225806452,
1.3612903225806452,
1.4645161290322581,
1.0
]
}
}
},
{
"model_id": "qwen2.5",
"scores": {
"ioi": {
"edge_counts": [
110.0,
246.0,
653.0,
1455.0,
3114.0,
8393.0,
17273.0,
34695.0,
87690.0,
179749.0
],
"faithfulness": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.01557632398753894,
0.021806853582554516,
0.018691588785046728,
1.0
]
},
"mcqa": {
"edge_counts": [
128.0,
262.0,
724.0,
1586.0,
3413.0,
8813.0,
17768.0,
35535.0,
86556.0,
179749.0
],
"faithfulness": [
0.27681388012618297,
0.3994479495268139,
0.5441640378548895,
0.5291798107255521,
0.48738170347003157,
0.5977917981072555,
0.7523659305993691,
0.6876971608832808,
0.667192429022082,
1.0
]
}
}
},
{
"model_id": "gpt2",
"scores": {
"ioi": {
"edge_counts": [
28.17708396911621,
74.19965362548828,
317.2265625,
881.1848754882812,
2312.229248046875,
7085.25,
15016.50390625,
30456.0,
76492.484375,
152985.0
],
"faithfulness": [
0.0,
0.22449694731820746,
0.5048270710448762,
0.7968936313069007,
0.976166532167683,
1.2088630716953397,
1.232255211567972,
1.2651923955277595,
1.3237581777016123,
1.0
]
}
}
},
{
"model_id": "llama3",
"scores": {
"arithmetic_subtraction": {
"edge_counts": [
1101.0,
2521.0,
7330.0,
15268.0,
31128.0,
78568.0,
157165.0,
313324.0,
773309.0,
1592881.0
],
"faithfulness": [
0.21634615384615385,
0.3076923076923077,
0.4110576923076923,
0.4467147435897436,
0.49854767628205127,
0.5422676282051282,
0.5584935897435898,
0.5737179487179487,
0.577323717948718,
1.0
]
},
"arithmetic_addition": {
"edge_counts": [
1105.0,
2636.0,
7488.0,
15424.0,
31195.0,
78423.0,
156792.0,
312466.0,
776673.0,
1592881.0
],
"faithfulness": [
0.08,
0.13411764705882354,
0.18352941176470589,
0.22588235294117648,
0.2847058823529412,
0.3588235294117647,
0.37529411764705883,
0.3841176470588235,
0.3988235294117647,
1.0
]
},
"mcqa": {
"edge_counts": [
1204.0,
2743.0,
7462.0,
15300.0,
31172.0,
78615.0,
157496.0,
314048.0,
767228.0,
1592881.0
],
"faithfulness": [
0.014056224899598393,
0.04618473895582329,
0.1144578313253012,
0.19678714859437751,
0.19076305220883535,
0.3644578313253012,
0.5144327309236948,
0.7590361445783133,
0.7570281124497992,
1.0
]
},
"arc_challenge": {
"edge_counts": [
1080.0,
2633.0,
6870.0,
14850.0,
30338.0,
77983.0,
156741.0,
306677.0,
781379.0,
1592881.0
],
"faithfulness": [
-0.0024390243902439024,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.014634146341463415,
0.007317073170731708,
1.0
]
}
}
}
]
}