meg-huggingface committed · Commit 335424f · Parent(s): 85cf91c

Some additional modularizing and caching of the text lengths widget
data_measurements/dataset_statistics.py
CHANGED
```diff
@@ -219,6 +219,7 @@ class DatasetStatisticsCacheClass:
         self.avg_length = None
         self.std_length = None
         self.general_stats_dict = None
+        self.num_uniq_lengths = 0
         # clustering text by embeddings
         # the hierarchical clustering tree is represented as a list of nodes,
         # the first is the root
@@ -351,6 +352,7 @@ class DatasetStatisticsCacheClass:
                 self.length_stats_dict = json.load(f)
             self.avg_length = self.length_stats_dict["avg length"]
             self.std_length = self.length_stats_dict["std length"]
+            self.num_uniq_lengths = self.length_stats_dict["num lengths"]
         else:
             self.prepare_text_length_stats()
             if save:
@@ -367,14 +369,16 @@ class DatasetStatisticsCacheClass:
             )

     def prepare_text_length_stats(self):
-        if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
+        if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns or self.length_df is None:
             self.prepare_length_df()
         avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
         self.avg_length = round(avg_length, 1)
         std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
         self.std_length = round(std_length, 1)
+        self.num_uniq_lengths = len(self.length_df["length"].unique())
         self.length_stats_dict = {"avg length": self.avg_length,
-                                  "std length": self.std_length}
+                                  "std length": self.std_length,
+                                  "num lengths": self.num_uniq_lengths}

     def prepare_fig_text_lengths(self):
         if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
```
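Net effect of this file's hunks: a new cached statistic, `num_uniq_lengths` (the count of distinct text lengths), is initialized in the constructor, restored from the stats JSON on a cache hit, and computed alongside the average and standard deviation otherwise. A minimal standalone sketch of that load-or-prepare round trip, with an illustrative function name and cache path that are not from the repo:

```python
import json
import statistics
from os.path import exists

def load_or_compute_length_stats(lengths, cache_fid="length_stats.json", use_cache=True):
    """Sketch of the load-or-prepare pattern used above (names are illustrative)."""
    if use_cache and exists(cache_fid):
        # Cache hit: read all three stats back, including the new "num lengths".
        with open(cache_fid, "r") as f:
            return json.load(f)
    # Cache miss: compute the stats and persist them for the next run.
    stats = {
        "avg length": round(sum(lengths) / len(lengths), 1),
        "std length": round(statistics.stdev(lengths), 1),
        "num lengths": len(set(lengths)),  # count of distinct length values
    }
    with open(cache_fid, "w") as f:
        json.dump(stats, f)
    return stats

print(load_or_compute_length_stats([3, 5, 5, 8, 13], use_cache=False))
# {'avg length': 6.8, 'std length': 3.9, 'num lengths': 4}
```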
data_measurements/streamlit_utils.py
CHANGED
```diff
@@ -147,9 +147,7 @@ def expander_label_distribution(fig_labels, column_id):
         st.markdown("No labels were found in the dataset")


-def expander_text_lengths(dstats,
-                          column_id,
-                          ):
+def expander_text_lengths(dstats, column_id):
     _TEXT_LENGTH_CAPTION = (
         "Use this widget to identify outliers, particularly suspiciously long outliers."
     )
@@ -176,7 +174,7 @@ def expander_text_lengths(dstats,
         start_id_show_lengths = st.slider(
             f"Show the shortest sentences{column_id} starting at:",
             0,
-
+            dstats.num_uniq_lengths,
             value=0,
             step=1,
         )
```