meg-huggingface committed · Commit 335424f · Parent(s): 85cf91c

Some additional modularizing and caching of the text lengths widget
data_measurements/dataset_statistics.py
CHANGED
```diff
@@ -219,6 +219,7 @@ class DatasetStatisticsCacheClass:
         self.avg_length = None
         self.std_length = None
         self.general_stats_dict = None
+        self.num_uniq_lengths = 0
         # clustering text by embeddings
         # the hierarchical clustering tree is represented as a list of nodes,
         # the first is the root
@@ -351,6 +352,7 @@ class DatasetStatisticsCacheClass:
                 self.length_stats_dict = json.load(f)
             self.avg_length = self.length_stats_dict["avg length"]
             self.std_length = self.length_stats_dict["std length"]
+            self.num_uniq_lengths = self.length_stats_dict["num lengths"]
         else:
             self.prepare_text_length_stats()
             if save:
@@ -367,14 +369,16 @@ class DatasetStatisticsCacheClass:
             )

     def prepare_text_length_stats(self):
-        if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
+        if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns or self.length_df is None:
             self.prepare_length_df()
         avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
         self.avg_length = round(avg_length, 1)
         std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
         self.std_length = round(std_length, 1)
+        self.num_uniq_lengths = len(self.length_df["length"].unique())
         self.length_stats_dict = {"avg length": self.avg_length,
-                                  "std length": self.std_length}
+                                  "std length": self.std_length,
+                                  "num lengths": self.num_uniq_lengths}

     def prepare_fig_text_lengths(self):
         if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
```
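Net effect of this file's hunks: a new cached statistic, `num_uniq_lengths` (the count of distinct text lengths), is initialized in the constructor, restored from the stats JSON on a cache hit, and computed alongside the average and standard deviation otherwise. A minimal standalone sketch of that load-or-prepare round trip, with an illustrative function name and cache path that are not from the repo:

```python
import json
import statistics
from os.path import exists

def load_or_compute_length_stats(lengths, cache_fid="length_stats.json", use_cache=True):
    """Sketch of the load-or-prepare pattern used above (names are illustrative)."""
    if use_cache and exists(cache_fid):
        # Cache hit: read all three stats back, including the new "num lengths".
        with open(cache_fid, "r") as f:
            return json.load(f)
    # Cache miss: compute the stats and persist them for the next run.
    stats = {
        "avg length": round(sum(lengths) / len(lengths), 1),
        "std length": round(statistics.stdev(lengths), 1),
        "num lengths": len(set(lengths)),  # count of distinct length values
    }
    with open(cache_fid, "w") as f:
        json.dump(stats, f)
    return stats

print(load_or_compute_length_stats([3, 5, 5, 8, 13], use_cache=False))
# {'avg length': 6.8, 'std length': 3.9, 'num lengths': 4}
```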
data_measurements/streamlit_utils.py
CHANGED
```diff
@@ -147,9 +147,7 @@ def expander_label_distribution(fig_labels, column_id):
         st.markdown("No labels were found in the dataset")


-def expander_text_lengths(dstats,
-                          column_id,
-                          ):
+def expander_text_lengths(dstats, column_id):
     _TEXT_LENGTH_CAPTION = (
         "Use this widget to identify outliers, particularly suspiciously long outliers."
     )
@@ -176,7 +174,7 @@ def expander_text_lengths(dstats,
         start_id_show_lengths = st.slider(
             f"Show the shortest sentences{column_id} starting at:",
             0,
-
+            dstats.num_uniq_lengths,
             value=0,
             step=1,
         )
```