Spaces:
Sleeping
Sleeping
Zekun Wu
committed on
Commit
·
ae16dbc
1
Parent(s):
7a70a60
update
Browse files
- requirements.txt +2 -1
- util/evaluation.py +51 -42
requirements.txt
CHANGED
|
@@ -2,4 +2,5 @@ openai
|
|
| 2 |
pandas
|
| 3 |
tqdm
|
| 4 |
scipy
|
| 5 |
-
statsmodels
|
|
|
|
|
|
| 2 |
pandas
|
| 3 |
tqdm
|
| 4 |
scipy
|
| 5 |
+
statsmodels
|
| 6 |
+
scikit-posthocs
|
util/evaluation.py
CHANGED
|
@@ -8,22 +8,23 @@ from scipy.spatial.distance import jensenshannon
|
|
| 8 |
from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
|
| 9 |
from statsmodels.stats.multicomp import pairwise_tukeyhsd
|
| 10 |
from scipy.stats import ttest_1samp
|
|
|
|
| 11 |
|
| 12 |
-
def bootstrap_t_test(data1, data2, num_bootstrap=1000):
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
|
| 28 |
|
| 29 |
# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
|
|
@@ -48,30 +49,33 @@ def bootstrap_t_test(data1, data2, num_bootstrap=1000):
|
|
| 48 |
# p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
|
| 49 |
# return observed_t_stat, p_value
|
| 50 |
|
| 51 |
-
def posthoc_friedman(data, variables, rank_suffix='_Rank'):
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
|
| 54 |
-
|
| 55 |
-
num_conditions = ranked_data.shape[1]
|
| 56 |
-
comparisons = []
|
| 57 |
-
|
| 58 |
-
for i in range(num_conditions):
|
| 59 |
-
for j in range(i + 1, num_conditions):
|
| 60 |
-
diff = ranked_data[:, i] - ranked_data[:, j]
|
| 61 |
-
abs_diff = np.abs(diff)
|
| 62 |
-
avg_diff = np.mean(diff)
|
| 63 |
-
se_diff = np.std(diff, ddof=1) / np.sqrt(num_subjects)
|
| 64 |
-
z_value = avg_diff / se_diff
|
| 65 |
-
p_value = 2 * (1 - stats.norm.cdf(np.abs(z_value)))
|
| 66 |
-
comparisons.append({
|
| 67 |
-
"Group1": variables[i],
|
| 68 |
-
"Group2": variables[j],
|
| 69 |
-
"Z": z_value,
|
| 70 |
-
"p-value": p_value
|
| 71 |
-
})
|
| 72 |
-
|
| 73 |
-
return comparisons
|
| 74 |
-
|
| 75 |
def statistical_tests(data):
|
| 76 |
"""Perform various statistical tests to evaluate potential biases."""
|
| 77 |
variables = ['Privilege', 'Protect', 'Neutral']
|
|
@@ -96,17 +100,22 @@ def statistical_tests(data):
|
|
| 96 |
'T-Test': {}
|
| 97 |
}
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
for (var1, var2) in pairs:
|
| 100 |
pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
|
| 101 |
pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
|
| 102 |
|
| 103 |
-
#
|
| 104 |
-
|
| 105 |
-
pairwise_results['
|
|
|
|
| 106 |
|
| 107 |
# Friedman test
|
| 108 |
friedman_stat, friedman_p = friedmanchisquare(*rank_data)
|
| 109 |
-
posthoc_results =
|
| 110 |
|
| 111 |
results = {
|
| 112 |
"Average Ranks": average_ranks.to_dict(),
|
|
|
|
| 8 |
from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
|
| 9 |
from statsmodels.stats.multicomp import pairwise_tukeyhsd
|
| 10 |
from scipy.stats import ttest_1samp
|
| 11 |
+
from scikit_posthocs import posthoc_nemenyi
|
| 12 |
|
| 13 |
+
# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
|
| 14 |
+
# """Perform a bootstrapped t-test."""
|
| 15 |
+
# observed_t_stat, _ = ttest_ind(data1, data2)
|
| 16 |
+
# combined = np.concatenate([data1, data2])
|
| 17 |
+
# t_stats = []
|
| 18 |
+
#
|
| 19 |
+
# for _ in range(num_bootstrap):
|
| 20 |
+
# np.random.shuffle(combined)
|
| 21 |
+
# new_data1 = combined[:len(data1)]
|
| 22 |
+
# new_data2 = combined[len(data1):]
|
| 23 |
+
# t_stat, _ = ttest_ind(new_data1, new_data2)
|
| 24 |
+
# t_stats.append(t_stat)
|
| 25 |
+
#
|
| 26 |
+
# p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
|
| 27 |
+
# return observed_t_stat, p_value
|
| 28 |
|
| 29 |
|
| 30 |
# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
|
|
|
|
| 49 |
# p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
|
| 50 |
# return observed_t_stat, p_value
|
| 51 |
|
| 52 |
+
# def posthoc_friedman(data, variables, rank_suffix='_Rank'):
|
| 53 |
+
# """Perform a post-hoc analysis for the Friedman test using pairwise comparisons."""
|
| 54 |
+
# ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
|
| 55 |
+
# num_subjects = ranked_data.shape[0]
|
| 56 |
+
# num_conditions = ranked_data.shape[1]
|
| 57 |
+
# comparisons = []
|
| 58 |
+
#
|
| 59 |
+
# for i in range(num_conditions):
|
| 60 |
+
# for j in range(i + 1, num_conditions):
|
| 61 |
+
# diff = ranked_data[:, i] - ranked_data[:, j]
|
| 62 |
+
# abs_diff = np.abs(diff)
|
| 63 |
+
# avg_diff = np.mean(diff)
|
| 64 |
+
# se_diff = np.std(diff, ddof=1) / np.sqrt(num_subjects)
|
| 65 |
+
# z_value = avg_diff / se_diff
|
| 66 |
+
# p_value = 2 * (1 - stats.norm.cdf(np.abs(z_value)))
|
| 67 |
+
# comparisons.append({
|
| 68 |
+
# "Group1": variables[i],
|
| 69 |
+
# "Group2": variables[j],
|
| 70 |
+
# "Z": z_value,
|
| 71 |
+
# "p-value": p_value
|
| 72 |
+
# })
|
| 73 |
+
#
|
| 74 |
+
# return comparisons
|
| 75 |
+
def posthoc_friedman_nemenyi(data, variables, rank_suffix='_Rank'):
|
| 76 |
+
"""Perform post-hoc Nemenyi test for the Friedman test."""
|
| 77 |
ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
|
| 78 |
+
return posthoc_nemenyi(ranked_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
def statistical_tests(data):
|
| 80 |
"""Perform various statistical tests to evaluate potential biases."""
|
| 81 |
variables = ['Privilege', 'Protect', 'Neutral']
|
|
|
|
| 100 |
'T-Test': {}
|
| 101 |
}
|
| 102 |
|
| 103 |
+
pairwise_results = {
|
| 104 |
+
'Wilcoxon Signed-Rank Test': {}
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
for (var1, var2) in pairs:
|
| 108 |
pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
|
| 109 |
pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
|
| 110 |
|
| 111 |
+
# Wilcoxon signed-rank test for pairwise comparisons
|
| 112 |
+
wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
|
| 113 |
+
pairwise_results['Wilcoxon Signed-Rank Test'][pair_name_score] = {"Statistic": wilcoxon_stat,
|
| 114 |
+
"p-value": wilcoxon_p}
|
| 115 |
|
| 116 |
# Friedman test
|
| 117 |
friedman_stat, friedman_p = friedmanchisquare(*rank_data)
|
| 118 |
+
posthoc_results = posthoc_friedman_nemenyi(data, variables, rank_suffix)
|
| 119 |
|
| 120 |
results = {
|
| 121 |
"Average Ranks": average_ranks.to_dict(),
|