Aaron Mueller
updated filtering, add F= tab
1d8e193
{
"method_name": "NAP (CF)",
"results": [
{
"model_id": "gemma2",
"scores": {
"arc_easy": {
"edge_counts": [
28.17708396911621,
74.19965362548828,
317.2265625,
881.1848754882812,
2312.229248046875,
7085.25,
15016.50390625,
30456.0,
76492.484375,
152985.0
],
"faithfulness": [
0.004601226993865031,
0.006134969325153374,
0.013803680981595092,
0.02607361963190184,
0.04754601226993865,
0.12423312883435583,
0.24233128834355827,
0.5751533742331288,
1.6411042944785277,
1.0
]
},
"mcqa": {
"edge_counts": [
44.14409637451172,
142.76388549804688,
553.3840942382812,
1318.2525634765625,
2904.385498046875,
7578.75,
15298.49609375,
30597.0078125,
76492.4765625,
152985.0
],
"faithfulness": [
0.0,
-0.003401360544217687,
-0.003401360544217687,
-0.0017006802721088435,
0.0017006802721088435,
0.02040816326530612,
0.022108843537414966,
0.10204081632653061,
0.24489795918367346,
1.0
]
},
"ioi": {
"edge_counts": [
78.42621612548828,
195.36111450195312,
587.4566040039062,
1273.60595703125,
2791.59375,
7614.0,
15298.5,
30596.99609375,
76492.484375,
152985.0
],
"faithfulness": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.003215434083601286,
0.01929260450160772,
0.07395498392282958,
0.1607717041800643,
1.0
]
}
}
},
{
"model_id": "llama3",
"scores": {
"arithmetic_subtraction": {
"edge_counts": [
210.320068359375,
596.0830078125,
3139.23779296875,
11288.572265625,
34563.7265625,
117960.8671875,
271754.5625,
614539.5,
1639406.875,
3281985.0
],
"faithfulness": [
0.0,
0.0,
0.0,
0.0,
0.003205128205128205,
0.00641025641025641,
0.01282051282051282,
0.041666666666666664,
0.08012820512820513,
1.0
]
},
"arc_easy": {
"edge_counts": [
22.194580078125,
59.185546875,
248.390869140625,
507.3515625,
9259.2734375,
77213.6328125,
270803.25,
646249.5625,
1640992.125,
3281985.0
],
"faithfulness": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.02,
1.0
]
},
"arc_challenge": {
"edge_counts": [
31.70654296875,
90.89208984375,
232.5361328125,
792.73681640625,
9893.4697265625,
64371.125,
143329.125,
628492.0,
1640992.375,
3281985.0
],
"faithfulness": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.012195121951219513,
1.0
]
},
"ioi": {
"edge_counts": [
361.45458984375,
1048.4296875,
4175.08056640625,
11753.64453125,
31984.65625,
109082.09375,
260973.203125,
612637.0,
1640992.375,
3281985.0
],
"faithfulness": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.009230769230769232,
0.015384615384615385,
0.015384615384615385,
1.0
]
},
"arithmetic_addition": {
"edge_counts": [
161.703369140625,
450.23291015625,
2742.869384765625,
10559.25390625,
34246.625,
117802.3125,
275559.8125,
616442.0625,
1639407.0,
3281985.0
],
"faithfulness": [
0.0,
-0.002352941176470588,
0.0,
0.0,
0.0,
0.004705882352941176,
0.011764705882352941,
0.0,
0.0,
1.0
]
},
"mcqa": {
"edge_counts": [
700.714599609375,
2084.1767578125,
9259.166015625,
23591.84765625,
54921.44140625,
153634.5,
320587.96875,
655762.5625,
1640992.375,
3281985.0
],
"faithfulness": [
0.0,
0.0,
0.0019305019305019305,
0.0019305019305019305,
0.0,
-0.007722007722007722,
-0.02702702702702703,
-0.07722007722007722,
-0.05791505791505792,
1.0
]
}
}
},
{
"model_id": "qwen2.5",
"scores": {
"ioi": {
"edge_counts": [
32.8046875,
88.68080139160156,
348.3046875,
1010.625,
3039.563720703125,
11605.576171875,
29601.451171875,
70466.984375,
185914.984375,
372913.0
],
"faithfulness": [
0.0,
0.0,
0.0,
0.0,
0.006230529595015576,
0.024922118380062305,
0.04984423676012461,
0.06853582554517133,
0.08099688473520249,
1.0
]
},
"mcqa": {
"edge_counts": [
21.98995590209961,
100.21651458740234,
516.140625,
1844.390625,
5357.140625,
17507.634765625,
37182.30859375,
74582.3984375,
186456.53125,
372913.0
],
"faithfulness": [
-0.0031645569620253164,
-0.0031645569620253164,
-0.0031645569620253164,
-0.0031645569620253164,
-0.0031645569620253164,
0.012658227848101266,
0.012658227848101266,
0.03481012658227848,
0.056962025316455694,
1.0
]
}
}
},
{
"model_id": "gpt2",
"scores": {
"ioi": {
"edge_counts": [
3.28125,
10.35546875,
31.35416603088379,
83.15755462646484,
279.3997497558594,
1350.0208740234375,
4442.87890625,
12654.0947265625,
34932.49609375,
69865.0
],
"faithfulness": [
-3.380787646439662e-08,
0.0,
-1.5450199544229257e-05,
-6.119225640055789e-06,
0.00011721190770206309,
0.0020236718694058528,
0.015380555318712599,
0.03909438029925786,
0.04813088759942643,
1.0
]
}
}
}
]
}