Zekun Wu committed on
Commit 076d436 · 1 Parent(s): 91e8e74
Files changed (1)
  1. util/evaluation.py +214 -47
util/evaluation.py CHANGED
@@ -10,12 +10,66 @@ from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
 from statsmodels.stats.multicomp import pairwise_tukeyhsd
 from scipy.stats import ttest_1samp
 
+
+def test_statistic_variance_ratio(x, y):
+    return np.var(x, ddof=1) / np.var(y, ddof=1)
+
+
+def test_statistic_mean_difference(x, y):
+    return np.mean(x) - np.mean(y)
+
+
+def permutation_test_variance(x, y, num_permutations=10000):
+    T_obs = test_statistic_variance_ratio(x, y)
+    pooled_data = np.concatenate([x, y])
+    n_A = len(x)
+    n_B = len(y)
+
+    perm_test_stats = []
+    for _ in range(num_permutations):
+        np.random.shuffle(pooled_data)
+        perm_A = pooled_data[:n_A]
+        perm_B = pooled_data[n_A:]
+        perm_test_stats.append(test_statistic_variance_ratio(perm_A, perm_B))
+
+    perm_test_stats = np.array(perm_test_stats)
+    p_value = np.mean(np.abs(perm_test_stats) >= np.abs(T_obs))
+
+    return T_obs, p_value
+
+
+def permutation_test_mean(x, y, num_permutations=10000):
+    T_obs = test_statistic_mean_difference(x, y)
+    pooled_data = np.concatenate([x, y])
+    n_A = len(x)
+    n_B = len(y)
+
+    perm_test_stats = []
+    for _ in range(num_permutations):
+        np.random.shuffle(pooled_data)
+        perm_A = pooled_data[:n_A]
+        perm_B = pooled_data[n_A:]
+        perm_test_stats.append(test_statistic_mean_difference(perm_A, perm_B))
+
+    perm_test_stats = np.array(perm_test_stats)
+    p_value = np.mean(np.abs(perm_test_stats) >= np.abs(T_obs))
+
+    return T_obs, p_value
+
 def calculate_impact_ratio(selection_rates):
     """Calculate the impact ratio for each category."""
     most_selected_rate = max(selection_rates.values())
     impact_ratios = {category: rate / most_selected_rate for category, rate in selection_rates.items()}
     return impact_ratios
 
+def statistical_parity_difference(y_true, y_pred=None, reference_group='Privilege'):
+    selection_rates = y_pred if y_pred is not None else y_true
+    reference_rate = selection_rates[reference_group]
+    spd = {category: rate - reference_rate for category, rate in selection_rates.items()}
+    return spd
+
+
+
 def statistical_parity_difference(selection_rates):
     """Calculate statistical parity difference."""
     most_selected_rate = max(selection_rates.values())
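Aside (illustrative, not part of the commit): the permutation helpers added above can be smoke-tested on synthetic arrays once the module is imported; the seed, sample sizes, and location/scale values below are made up. One caveat worth noting: the variance ratio is strictly positive, so the np.abs() calls in permutation_test_variance change nothing and its p-value is effectively one-sided, detecting only the case where x is more variable than y.

import numpy as np

rng = np.random.default_rng(0)                # illustrative seed
x = rng.normal(loc=5.0, scale=1.0, size=30)   # stand-in for one group's ranks
y = rng.normal(loc=5.5, scale=2.0, size=30)   # stand-in for another group's ranks

# Calls the helpers defined in the hunk above.
T_var, p_var = permutation_test_variance(x, y, num_permutations=2000)
T_mean, p_mean = permutation_test_mean(x, y, num_permutations=2000)
print(f"variance ratio={T_var:.2f} (p={p_var:.3f}); mean diff={T_mean:.2f} (p={p_mean:.3f})")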
@@ -33,48 +87,34 @@ def statistical_tests(data):
     rank_suffix = '_Rank'
     score_suffix = '_Avg_Score'
 
-    # Calculate average ranks
+    # Calculate average ranks and scores
     rank_columns = [v + rank_suffix for v in variables]
     average_ranks = data[rank_columns].mean()
     average_scores = data[[v + score_suffix for v in variables]].mean()
 
-    # Statistical tests
+    # Statistical tests setup
     rank_data = [data[col] for col in rank_columns]
+    pairs = [('Privilege', 'Protect'), ('Protect', 'Neutral'), ('Privilege', 'Neutral')]
 
-    # Pairwise tests
-    pairs = [
-        ('Privilege', 'Protect'),
-        ('Protect', 'Neutral'),
-        ('Privilege', 'Neutral')
-    ]
+    pairwise_results = {'Wilcoxon Test': {}}
 
-    pairwise_results = {
-        'Wilcoxon Test': {}
-    }
-
-    for (var1, var2) in pairs:
-        pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
+    # Pairwise Wilcoxon Signed-Rank Test
+    for var1, var2 in pairs:
         pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
-
-        # Wilcoxon Signed-Rank Test
         if len(data) > 20:
             wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
         else:
             wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
         pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
 
-    # Levene's Test for Equality of Variances
-    levene_results = {}
-    levene_privilege_protect = levene(data['Privilege_Rank'], data['Protect_Rank'])
-    levene_privilege_neutral = levene(data['Privilege_Rank'], data['Neutral_Rank'])
-    levene_protect_neutral = levene(data['Protect_Rank'], data['Neutral_Rank'])
-
-    levene_results['Privilege vs Protect'] = {"Statistic": levene_privilege_protect.statistic,
-                                              "p-value": levene_privilege_protect.pvalue}
-    levene_results['Privilege vs Neutral'] = {"Statistic": levene_privilege_neutral.statistic,
-                                              "p-value": levene_privilege_neutral.pvalue}
-    levene_results['Protect vs Neutral'] = {"Statistic": levene_protect_neutral.statistic,
-                                            "p-value": levene_protect_neutral.pvalue}
+    # # Levene's Test for Equality of Variances
+    # levene_results = {
+    #     'Privilege vs Protect': levene(data['Privilege_Rank'], data['Protect_Rank']),
+    #     'Privilege vs Neutral': levene(data['Privilege_Rank'], data['Neutral_Rank']),
+    #     'Protect vs Neutral': levene(data['Protect_Rank'], data['Neutral_Rank'])
+    # }
+    #
+    # levene_results = {key: {"Statistic": res.statistic, "p-value": res.pvalue} for key, res in levene_results.items()}
 
     # Calculate variances for ranks
     variances = {col: data[col].var() for col in rank_columns}
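For reference (toy data, not from the repo): the loop above issues one paired wilcoxon call per group pair on the *_Rank columns; only the column names below come from the diff.

import numpy as np
import pandas as pd
from scipy.stats import wilcoxon

rng = np.random.default_rng(1)
ranks = np.array([rng.permutation([1, 2, 3]) for _ in range(25)])  # 25 prompts ranked 1-3
data = pd.DataFrame(ranks, columns=['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank'])

# The same paired call the loop makes for the ('Privilege', 'Protect') pair.
stat, p = wilcoxon(data['Privilege_Rank'], data['Protect_Rank'])
print(stat, p)

Since len(data) > 20 here, the real code would take the wilcoxon branch rather than the small-sample fallback, which stores a string in place of a p-value.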
@@ -84,36 +124,45 @@ def statistical_tests(data):
         'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
     }
 
-    selection_rates_Avg_Score = {
-        'Privilege': data['Privilege_Avg_Score'].mean(),
-        'Protect': data['Protect_Avg_Score'].mean(),
-        'Neutral': data['Neutral_Avg_Score'].mean()
-    }
+    # Bias metrics calculations
+    selection_rates_Avg_Score = {v: data[f'{v}{score_suffix}'].mean() for v in variables}
+    selection_rates_rank = {v: data[f'{v}{rank_suffix}'].mean() for v in variables}
+
     impact_ratios_Avg_Score = calculate_impact_ratio(selection_rates_Avg_Score)
     spd_result_Avg_Score = statistical_parity_difference(selection_rates_Avg_Score)
     adverse_impact_Avg_Score = calculate_four_fifths_rule(impact_ratios_Avg_Score)
 
-
-    # rank version of bias metrics
-    selection_rates_rank = {
-        'Privilege': data['Privilege_Rank'].mean(),
-        'Protect': data['Protect_Rank'].mean(),
-        'Neutral': data['Neutral_Rank'].mean()
-    }
     impact_ratios_rank = calculate_impact_ratio(selection_rates_rank)
     spd_result_rank = statistical_parity_difference(selection_rates_rank)
     adverse_impact_rank = calculate_four_fifths_rule(impact_ratios_rank)
 
-
     # Friedman test
     friedman_stat, friedman_p = friedmanchisquare(*rank_data)
-
-    rank_matrix = data[rank_columns].values
-    rank_matrix_transposed = np.transpose(rank_matrix)
+    rank_matrix_transposed = np.transpose(data[rank_columns].values)
     posthoc_results = posthoc_nemenyi(rank_matrix_transposed)
-    #posthoc_results = posthoc_friedman(data, variables, rank_suffix)
 
+    # Perform permutation tests for variances
+    T_priv_prot_var, p_priv_prot_var = permutation_test_variance(data['Privilege_Rank'], data['Protect_Rank'])
+    T_neut_prot_var, p_neut_prot_var = permutation_test_variance(data['Neutral_Rank'], data['Protect_Rank'])
+    T_neut_priv_var, p_neut_priv_var = permutation_test_variance(data['Neutral_Rank'], data['Privilege_Rank'])
+
+    # Perform permutation tests for means
+    T_priv_prot_mean, p_priv_prot_mean = permutation_test_mean(data['Privilege_Rank'], data['Protect_Rank'])
+    T_neut_prot_mean, p_neut_prot_mean = permutation_test_mean(data['Neutral_Rank'], data['Protect_Rank'])
+    T_neut_priv_mean, p_neut_priv_mean = permutation_test_mean(data['Neutral_Rank'], data['Privilege_Rank'])
 
+    permutation_results = {
+        "Permutation Tests for Variances": {
+            "Privilege vs. Protect": {"Statistic": T_priv_prot_var, "p-value": p_priv_prot_var},
+            "Neutral vs. Protect": {"Statistic": T_neut_prot_var, "p-value": p_neut_prot_var},
+            "Neutral vs. Privilege": {"Statistic": T_neut_priv_var, "p-value": p_neut_priv_var}
+        },
+        "Permutation Tests for Means": {
+            "Privilege vs. Protect": {"Statistic": T_priv_prot_mean, "p-value": p_priv_prot_mean},
+            "Neutral vs. Protect": {"Statistic": T_neut_prot_mean, "p-value": p_neut_prot_mean},
+            "Neutral vs. Privilege": {"Statistic": T_neut_priv_mean, "p-value": p_neut_priv_mean}
+        }
+    }
 
     results = {
         "Average Ranks": average_ranks.to_dict(),
@@ -124,7 +173,7 @@ def statistical_tests(data):
             "Post-hoc": posthoc_results
         },
         **pairwise_results,
-        "Levene's Test for Equality of Variances": levene_results,
+        #"Levene's Test for Equality of Variances": levene_results,
         "Pairwise Comparisons of Variances": pairwise_variances,
         "Statistical Parity Difference": {
             "Avg_Score": spd_result_Avg_Score,
@@ -137,12 +186,130 @@ def statistical_tests(data):
         "Four-Fifths Rule": {
             "Avg_Score": adverse_impact_Avg_Score,
             "Rank": adverse_impact_rank
-        }
+        },
+        **permutation_results
     }
 
     return results
 
 
+#
+# def statistical_tests(data):
+#     """Perform various statistical tests to evaluate potential biases."""
+#     variables = ['Privilege', 'Protect', 'Neutral']
+#     rank_suffix = '_Rank'
+#     score_suffix = '_Avg_Score'
+#
+#     # Calculate average ranks
+#     rank_columns = [v + rank_suffix for v in variables]
+#     average_ranks = data[rank_columns].mean()
+#     average_scores = data[[v + score_suffix for v in variables]].mean()
+#
+#     # Statistical tests
+#     rank_data = [data[col] for col in rank_columns]
+#
+#     # Pairwise tests
+#     pairs = [
+#         ('Privilege', 'Protect'),
+#         ('Protect', 'Neutral'),
+#         ('Privilege', 'Neutral')
+#     ]
+#
+#     pairwise_results = {
+#         'Wilcoxon Test': {}
+#     }
+#
+#     for (var1, var2) in pairs:
+#         pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
+#         pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
+#
+#         # Wilcoxon Signed-Rank Test
+#         if len(data) > 20:
+#             wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
+#         else:
+#             wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
+#         pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
+#
+#     # Levene's Test for Equality of Variances
+#     levene_results = {}
+#     levene_privilege_protect = levene(data['Privilege_Rank'], data['Protect_Rank'])
+#     levene_privilege_neutral = levene(data['Privilege_Rank'], data['Neutral_Rank'])
+#     levene_protect_neutral = levene(data['Protect_Rank'], data['Neutral_Rank'])
+#
+#     levene_results['Privilege vs Protect'] = {"Statistic": levene_privilege_protect.statistic,
+#                                               "p-value": levene_privilege_protect.pvalue}
+#     levene_results['Privilege vs Neutral'] = {"Statistic": levene_privilege_neutral.statistic,
+#                                               "p-value": levene_privilege_neutral.pvalue}
+#     levene_results['Protect vs Neutral'] = {"Statistic": levene_protect_neutral.statistic,
+#                                             "p-value": levene_protect_neutral.pvalue}
+#
+#     # Calculate variances for ranks
+#     variances = {col: data[col].var() for col in rank_columns}
+#     pairwise_variances = {
+#         'Privilege_Rank vs Protect_Rank': variances['Privilege_Rank'] > variances['Protect_Rank'],
+#         'Privilege_Rank vs Neutral_Rank': variances['Privilege_Rank'] > variances['Neutral_Rank'],
+#         'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
+#     }
+#
+#     selection_rates_Avg_Score = {
+#         'Privilege': data['Privilege_Avg_Score'].mean(),
+#         'Protect': data['Protect_Avg_Score'].mean(),
+#         'Neutral': data['Neutral_Avg_Score'].mean()
+#     }
+#     impact_ratios_Avg_Score = calculate_impact_ratio(selection_rates_Avg_Score)
+#     spd_result_Avg_Score = statistical_parity_difference(selection_rates_Avg_Score)
+#     adverse_impact_Avg_Score = calculate_four_fifths_rule(impact_ratios_Avg_Score)
+#
+#
+#     # rank version of bias metrics
+#     selection_rates_rank = {
+#         'Privilege': data['Privilege_Rank'].mean(),
+#         'Protect': data['Protect_Rank'].mean(),
+#         'Neutral': data['Neutral_Rank'].mean()
+#     }
+#     impact_ratios_rank = calculate_impact_ratio(selection_rates_rank)
+#     spd_result_rank = statistical_parity_difference(selection_rates_rank)
+#     adverse_impact_rank = calculate_four_fifths_rule(impact_ratios_rank)
+#
+#
+#     # Friedman test
+#     friedman_stat, friedman_p = friedmanchisquare(*rank_data)
+#
+#     rank_matrix = data[rank_columns].values
+#     rank_matrix_transposed = np.transpose(rank_matrix)
+#     posthoc_results = posthoc_nemenyi(rank_matrix_transposed)
+#     #posthoc_results = posthoc_friedman(data, variables, rank_suffix)
+#
+#
+#
+#     results = {
+#         "Average Ranks": average_ranks.to_dict(),
+#         "Average Scores": average_scores.to_dict(),
+#         "Friedman Test": {
+#             "Statistic": friedman_stat,
+#             "p-value": friedman_p,
+#             "Post-hoc": posthoc_results
+#         },
+#         **pairwise_results,
+#         "Levene's Test for Equality of Variances": levene_results,
+#         "Pairwise Comparisons of Variances": pairwise_variances,
+#         "Statistical Parity Difference": {
+#             "Avg_Score": spd_result_Avg_Score,
+#             "Rank": spd_result_rank
+#         },
+#         "Disparate Impact Ratios": {
+#             "Avg_Score": impact_ratios_Avg_Score,
+#             "Rank": impact_ratios_rank
+#         },
+#         "Four-Fifths Rule": {
+#             "Avg_Score": adverse_impact_Avg_Score,
+#             "Rank": adverse_impact_rank
+#         }
+#     }
+#
+#     return results
+
+
 def hellinger_distance(p, q):
     """Calculate the Hellinger distance between two probability distributions."""
     return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))
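Closing sketch (toy vectors, not from the repo): for two discrete probability distributions, hellinger_distance returns 0 when they are identical and 1 when their supports are disjoint.

import numpy as np

p = np.array([0.2, 0.5, 0.3])  # each vector sums to 1
q = np.array([0.1, 0.4, 0.5])
print(hellinger_distance(p, q))  # ≈ 0.155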