Commit dc4ad9e · Pietro Lesci committed
1 Parent(s): 13c6837

formatting
Files changed:
- Makefile              +9  -1
- src/components.py    +46 -14
- src/preprocessing.py  +0 -26
- src/utils.py          +0 -80
Makefile
CHANGED
@@ -1,6 +1,7 @@
 # Docker image build info
 PROJECT:=wordify
 BUILD_TAG?=v2.0
+sources = src
 
 ########################################################
 ## Local development
@@ -21,4 +22,11 @@ run:
 	docker run -d --name $(PROJECT)-${BUILD_TAG}-container -it --rm -p 4321:8501 $(PROJECT):${BUILD_TAG}
 
 stop:
-	docker stop $(PROJECT)-${BUILD_TAG}-container
+	docker stop $(PROJECT)-${BUILD_TAG}-container
+
+format:
+	isort $(sources)
+	black $(sources)
+
+lint:
+	flake8 $(sources)
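With these targets, `make format` rewrites code in place by running isort and then black over the new `sources` variable (here `src`), while `make lint` runs flake8 over the same tree and only reports problems, leaving files untouched.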
src/components.py
CHANGED
@@ -65,12 +65,16 @@ def form(df):
     pre_steps = st.multiselect(
         "Select pre-lemmatization processing steps (ordered)",
         options=steps_options,
-        default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
+        default=[
+            steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
+        ],
         format_func=lambda x: x.replace("_", " ").title(),
         help="Select the processing steps to apply before the text is lemmatized",
     )
 
-    lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
+    lammatization_options = list(
+        PreprocessingPipeline.lemmatization_component().keys()
+    )
     lemmatization_step = st.selectbox(
         "Select lemmatization",
         options=lammatization_options,
@@ -81,7 +85,10 @@ def form(df):
     post_steps = st.multiselect(
         "Select post-lemmatization processing steps (ordered)",
         options=steps_options,
-        default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
+        default=[
+            steps_options[i]
+            for i in PreprocessingConfigs.DEFAULT_POST.value
+        ],
         format_func=lambda x: x.replace("_", " ").title(),
         help="Select the processing steps to apply after the text is lemmatized",
     )
@@ -93,21 +100,31 @@ def form(df):
         start_time = time.time()
 
         # warnings about inputs
-        language_specific_warnings(pre_steps, post_steps, lemmatization_step, language)
+        language_specific_warnings(
+            pre_steps, post_steps, lemmatization_step, language
+        )
 
         # preprocess
         if not disable_preprocessing:
            with st.spinner("Step 1/4: Preprocessing text"):
-                pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
+                pipe = PreprocessingPipeline(
+                    language, pre_steps, lemmatization_step, post_steps
+                )
                 df = pipe.vaex_process(df, text_column)
         else:
-            with st.spinner("Step 1/4: Preprocessing has been disabled - doing nothing"):
-                df = df.rename(columns={text_column: ColumnNames.PROCESSED_TEXT.value})
+            with st.spinner(
+                "Step 1/4: Preprocessing has been disabled - doing nothing"
+            ):
+                df = df.rename(
+                    columns={text_column: ColumnNames.PROCESSED_TEXT.value}
+                )
                 time.sleep(1.2)
 
         # prepare input
         with st.spinner("Step 2/4: Preparing inputs"):
-            input_dict = input_transform(df[ColumnNames.PROCESSED_TEXT.value], df[label_column])
+            input_dict = input_transform(
+                df[ColumnNames.PROCESSED_TEXT.value], df[label_column]
+            )
 
         # wordify
         with st.spinner("Step 3/4: Wordifying"):
@@ -217,7 +234,13 @@ def how_it_works():
                 "Wine light cherry",
                 "Chardonnay wine oak buttery",
             ],
-            "Label": ["Italy", "United States", "United States", "Italy", "United States"],
+            "Label": [
+                "Italy",
+                "United States",
+                "United States",
+                "Italy",
+                "United States",
+            ],
         }
     )
 
@@ -268,7 +291,9 @@ def how_it_works():
        vectors of coefficients reported in table 3 (indicators that are not present in a run are listed as 0 here):
        """
    )
-    st.caption("Table 3: Coefficients for frequency of indicators in each of the four runs for US wines.")
+    st.caption(
+        "Table 3: Coefficients for frequency of indicators in each of the four runs for US wines."
+    )
    st.table(table3)
 
    st.markdown(
@@ -278,7 +303,9 @@ def how_it_works():
        that are positively and negatively correlated with the US wines.
        """
    )
-    st.caption(
+    st.caption(
+        "Table 4: Final set of indicators that are positively versus negatively correlated with US wines."
+    )
    st.table(table4)
    st.markdown(
        """
@@ -459,11 +486,15 @@ def analysis(outputs):
    )
 
    with st.expander("Vocabulary"):
-        st.markdown("The table below shows all candidate n-grams that Wordify considered")
+        st.markdown(
+            "The table below shows all candidate n-grams that Wordify considered"
+        )
        st.write(meta_data["vocabulary"])
 
    with st.expander("Labels"):
-        st.markdown("The table below summarizes the labels that your file contained")
+        st.markdown(
+            "The table below summarizes the labels that your file contained"
+        )
        st.write(meta_data["labels"])
 
    return subset_df
@@ -493,5 +524,6 @@ def language_specific_warnings(pre_steps, post_steps, lemmatization_step, language):
            "Chinese",
        ):
            st.info(
-                msg + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
+                msg
+                + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
            )
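The reflowed `default=` arguments make the index-based default pattern easier to read: the configured defaults are integer indices into `steps_options`. A minimal runnable sketch of that pattern, with hypothetical option names and index values standing in for the real contents of `src/configs.py` (which this diff does not show):

```python
from enum import Enum

# Hypothetical stand-ins for the real PreprocessingConfigs and steps_options
# defined in src/configs.py; the actual values are not part of this diff.
class PreprocessingConfigs(Enum):
    DEFAULT_PRE = [0, 2]
    DEFAULT_POST = [1]

steps_options = ["normalize_unicode", "remove_punctuation", "lowercase"]

# The same comprehension the form uses to turn config indices into defaults.
default_pre = [steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value]
default_post = [steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value]

print(default_pre)   # ['normalize_unicode', 'lowercase']
print(default_post)  # ['remove_punctuation']
```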
src/preprocessing.py
CHANGED
@@ -1,5 +1,3 @@
-import multiprocessing as mp
-import os
 import re
 import string
 from collections import OrderedDict
@@ -8,7 +6,6 @@ from typing import Callable, List, Optional, Union
 import spacy
 import vaex
 from pandas.core.frame import DataFrame
-from pandas.core.series import Series
 from textacy.preprocessing import make_pipeline, normalize, remove, replace
 
 from .configs import Languages
@@ -119,29 +116,6 @@ class PreprocessingPipeline:
 
         return df
 
-    # def __call__(self, series: Series) -> Series:
-    #     if self.pre:
-    #         series = series.map(self.pre)
-
-    #     if self.lemma:
-    #         total_steps = len(series) // 100
-    #         res = []
-    #         pbar = st.progress(0)
-    #         for i, doc in enumerate(
-    #             self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
-    #         ):
-    #             res.append(self.lemma(doc))
-
-    #             if i % total_steps == 0:
-    #                 pbar.progress(1)
-
-    #         series = pd.Series(res)
-
-    #     if self.post:
-    #         series = series.map(self.post)
-
-    #     return series
-
     @classmethod
     def make_pipe_component(cls, steps: Optional[List[str]], language: str) -> Callable:
         if not steps:
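The surviving `textacy.preprocessing` import composes the `normalize`/`remove`/`replace` helpers into a single text-to-text callable via `make_pipeline`. A rough sketch of that usage, assuming textacy's 0.11-era API (the concrete steps Wordify actually chains are built in `make_pipe_component`, of which this hunk shows only the signature):

```python
from textacy.preprocessing import make_pipeline, normalize, remove, replace

# Compose a few textacy preprocessing helpers into one callable.
pipe = make_pipeline(
    normalize.unicode,     # normalize unicode to a canonical form
    replace.urls,          # swap URLs for a placeholder token
    remove.punctuation,    # drop punctuation characters
    normalize.whitespace,  # collapse repeated whitespace
)

print(pipe("Café  menu:  https://example.com !!"))
```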
src/utils.py
CHANGED
@@ -7,8 +7,6 @@ from PIL import Image
 
 from .configs import ColumnNames, SupportedFiles
 
-# import altair as alt
-
 
 def get_col_indices(cols: List) -> Tuple[int, int]:
     """Ugly but works"""
@@ -52,81 +50,3 @@ def download_button(dataframe: DataFrame, name: str) -> None:
     b64 = base64.b64encode(csv.encode()).decode()
     href = f'<a href="data:file/csv;base64,{b64}" download="{name}.csv">Download</a>'
     st.write(href, unsafe_allow_html=True)
-
-
-# def plot_labels_prop(data: DataFrame, label_column: str):
-
-#     unique_value_limit = 100
-
-#     if data[label_column].nunique() > unique_value_limit:
-
-#         st.warning(
-#             f"""
-#             The column you selected has more than {unique_value_limit}.
-#             Are you sure it's the right column? If it is, please note that
-#             this will impact __Wordify__ performance.
-#             """
-#         )
-
-#         return
-
-#     source = (
-#         data[label_column]
-#         .value_counts()
-#         .reset_index()
-#         .rename(columns={"index": "Labels", label_column: "Counts"})
-#     )
-#     source["Props"] = source["Counts"] / source["Counts"].sum()
-#     source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
-
-#     bars = (
-#         alt.Chart(source)
-#         .mark_bar()
-#         .encode(
-#             x=alt.X("Labels:O", sort="-y"),
-#             y="Counts:Q",
-#         )
-#     )
-
-#     text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
-#         text="Proportions:O"
-#     )
-
-#     return (bars + text).properties(height=300)
-
-
-# def plot_nchars(data: DataFrame, text_column: str):
-#     source = data[text_column].str.len().to_frame()
-
-#     plot = (
-#         alt.Chart(source)
-#         .mark_bar()
-#         .encode(
-#             alt.X(
-#                 f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
-#             ),
-#             alt.Y("count()", axis=alt.Axis(title="")),
-#         )
-#     )
-
-#     return plot.properties(height=300)
-
-
-# def plot_score(data: DataFrame, label_col: str, label: str):
-
-#     source = (
-#         data.loc[data[label_col] == label]
-#         .sort_values("score", ascending=False)
-#         .head(100)
-#     )
-
-#     plot = (
-#         alt.Chart(source)
-#         .mark_bar()
-#         .encode(
-#             y=alt.Y("word:O", sort="-x"),
-#             x="score:Q",
-#         )
-#     )
-
-#     return plot.properties(height=max(30 * source.shape[0], 50))