Spaces:
Build error
Build error
meg-huggingface
committed on
Commit
·
96acdf5
1
Parent(s):
4f4c0c4
Adding docstrings to run_data_measurements CLI
Browse files - run_data_measurements.py +22 -21
run_data_measurements.py
CHANGED
|
@@ -12,13 +12,14 @@ from data_measurements import dataset_utils
|
|
| 12 |
def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
| 13 |
"""
|
| 14 |
Loader specifically for the widgets used in the app.
|
|
|
|
| 15 |
Args:
|
| 16 |
-
ds_args:
|
| 17 |
-
show_embeddings:
|
| 18 |
-
use_cache:
|
| 19 |
|
| 20 |
Returns:
|
| 21 |
-
|
| 22 |
"""
|
| 23 |
|
| 24 |
if not isdir(ds_args["cache_dir"]):
|
|
@@ -58,7 +59,16 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
|
| 58 |
dstats.load_or_prepare_zipf()
|
| 59 |
|
| 60 |
|
| 61 |
-
def load_or_prepare(dataset_args,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
all = False
|
| 63 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
|
| 64 |
print("Loading dataset.")
|
|
@@ -86,8 +96,8 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
|
|
| 86 |
|
| 87 |
if all or dataset_args["calculation"] == "labels":
|
| 88 |
if not dstats.label_field:
|
| 89 |
-
print("Warning: You asked for label calculation, but didn't
|
| 90 |
-
"the labels field name. Assuming it is 'label'...")
|
| 91 |
dstats.set_label_field("label")
|
| 92 |
else:
|
| 93 |
print("\n* Calculating label distribution.")
|
|
@@ -106,7 +116,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
|
|
| 106 |
npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
|
| 107 |
dstats, use_cache=use_cache
|
| 108 |
)
|
| 109 |
-
do_npmi(npmi_stats
|
| 110 |
print("Done!")
|
| 111 |
print(
|
| 112 |
"nPMI results now available in %s for all identity terms that "
|
|
@@ -137,7 +147,7 @@ def load_or_prepare(dataset_args, do_html=False, use_cache=False):
|
|
| 137 |
dstats.load_or_prepare_embeddings()
|
| 138 |
|
| 139 |
|
| 140 |
-
def do_npmi(npmi_stats
|
| 141 |
available_terms = npmi_stats.load_or_prepare_npmi_terms()
|
| 142 |
completed_pairs = {}
|
| 143 |
print("Iterating through terms for joint npmi.")
|
|
@@ -160,7 +170,6 @@ def get_text_label_df(
|
|
| 160 |
label_field,
|
| 161 |
calculation,
|
| 162 |
out_dir,
|
| 163 |
-
do_html=False,
|
| 164 |
use_cache=True,
|
| 165 |
):
|
| 166 |
if not use_cache:
|
|
@@ -268,17 +277,9 @@ def main():
|
|
| 268 |
print("Proceeding with the following arguments:")
|
| 269 |
print(args)
|
| 270 |
# run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
|
| 271 |
-
get_text_label_df(
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
args.split,
|
| 275 |
-
args.feature,
|
| 276 |
-
args.label_field,
|
| 277 |
-
args.calculation,
|
| 278 |
-
args.out_dir,
|
| 279 |
-
do_html=args.do_html,
|
| 280 |
-
use_cache=args.cached,
|
| 281 |
-
)
|
| 282 |
print()
|
| 283 |
|
| 284 |
|
|
|
|
| 12 |
def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
| 13 |
"""
|
| 14 |
Loader specifically for the widgets used in the app.
|
| 15 |
+
Does not take specifications from user.
|
| 16 |
Args:
|
| 17 |
+
ds_args: Dataset configuration settings (config name, split, etc)
|
| 18 |
+
show_embeddings: Whether to compute embeddings (slow)
|
| 19 |
+
use_cache: Whether to grab files that have already been computed
|
| 20 |
|
| 21 |
Returns:
|
| 22 |
+
Saves files to disk in cache_dir, if user has not specified another dir.
|
| 23 |
"""
|
| 24 |
|
| 25 |
if not isdir(ds_args["cache_dir"]):
|
|
|
|
| 59 |
dstats.load_or_prepare_zipf()
|
| 60 |
|
| 61 |
|
| 62 |
+
def load_or_prepare(dataset_args, use_cache=False):
|
| 63 |
+
"""
|
| 64 |
+
Users can specify which aspects of the dataset they would like to compute.
|
| 65 |
+
Args:
|
| 66 |
+
dataset_args: Dataset configuration settings (config name, split, etc)
|
| 67 |
+
use_cache: Whether to grab files that have already been computed
|
| 68 |
+
|
| 69 |
+
Returns:
|
| 70 |
+
Saves files to disk in cache_dir, if user has not specified another dir.
|
| 71 |
+
"""
|
| 72 |
all = False
|
| 73 |
dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
|
| 74 |
print("Loading dataset.")
|
|
|
|
| 96 |
|
| 97 |
if all or dataset_args["calculation"] == "labels":
|
| 98 |
if not dstats.label_field:
|
| 99 |
+
print("Warning: You asked for label calculation, but didn't "
|
| 100 |
+
"provide the labels field name. Assuming it is 'label'...")
|
| 101 |
dstats.set_label_field("label")
|
| 102 |
else:
|
| 103 |
print("\n* Calculating label distribution.")
|
|
|
|
| 116 |
npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
|
| 117 |
dstats, use_cache=use_cache
|
| 118 |
)
|
| 119 |
+
do_npmi(npmi_stats)
|
| 120 |
print("Done!")
|
| 121 |
print(
|
| 122 |
"nPMI results now available in %s for all identity terms that "
|
|
|
|
| 147 |
dstats.load_or_prepare_embeddings()
|
| 148 |
|
| 149 |
|
| 150 |
+
def do_npmi(npmi_stats):
|
| 151 |
available_terms = npmi_stats.load_or_prepare_npmi_terms()
|
| 152 |
completed_pairs = {}
|
| 153 |
print("Iterating through terms for joint npmi.")
|
|
|
|
| 170 |
label_field,
|
| 171 |
calculation,
|
| 172 |
out_dir,
|
|
|
|
| 173 |
use_cache=True,
|
| 174 |
):
|
| 175 |
if not use_cache:
|
|
|
|
| 277 |
print("Proceeding with the following arguments:")
|
| 278 |
print(args)
|
| 279 |
# run_data_measurements.py -d hate_speech18 -c default -s train -f text -w npmi
|
| 280 |
+
get_text_label_df(args.dataset, args.config, args.split, args.feature,
|
| 281 |
+
args.label_field, args.calculation, args.out_dir,
|
| 282 |
+
use_cache=args.cached)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
print()
|
| 284 |
|
| 285 |
|