Spaces:

MilaNLProc
/

wordify

Build error

App Files Files Community

Pietro Lesci committed on May 31, 2021

Commit

c700823

1 Parent(s): 51cab9d

enhanced UI

Browse files

Files changed (2) hide show

src/pages/home.py +64 -61
src/preprocessing.py +74 -63

src/pages/home.py CHANGED Viewed

@@ -1,13 +1,7 @@
 from src.configs import Languages
-from src.utils import (
-    encode,
-    download_button,
-    TextPreprocessor,
-    plot_labels_prop,
-    plot_nchars,
-    plot_score,
-    read_file,
-)
 from src.wordifier import wordifier
 import streamlit as st
@@ -36,7 +30,7 @@ def write(session, uploaded_file):
     elif uploaded_file:
-        # 1. READ FILE
         with st.spinner("Reading file"):
             # TODO: write parser function that automatically understands format
             data = read_file(uploaded_file)
@@ -47,15 +41,13 @@ def write(session, uploaded_file):
             language = st.selectbox("Select language", [i.name for i in Languages])
             with st.beta_expander("Description"):
                 st.markdown(
-                    f"Select a language of text amongst those supported: {', '.join([f'`{i.name}`' for i in Languages])}"
                 )
         with col2:
             cols_options = [""] + data.columns.tolist()
-            label_column = st.selectbox(
-                "Select label column name", cols_options, index=0
-            )
             with st.beta_expander("Description"):
-                st.markdown("Select the column containing the label")
             if label_column:
                 plot = plot_labels_prop(data, label_column)
@@ -65,90 +57,103 @@ def write(session, uploaded_file):
         with col3:
             text_column = st.selectbox("Select text column name", cols_options, index=0)
             with st.beta_expander("Description"):
-                st.markdown("Select the column containing the text")
             if text_column:
-                st.altair_chart(
-                    plot_nchars(data, text_column), use_container_width=True
-                )
         with st.beta_expander("Advanced options"):
-            # Lemmatization option
             col1, col2 = st.beta_columns([1, 3])
             with col1:
-                lemmatization_when_elem = st.empty()
             with col2:
-                st.markdown("Choose lemmatization option")
-            # stopwords option
             col1, col2 = st.beta_columns([1, 3])
             with col1:
-                remove_stopwords_elem = st.empty()
             with col2:
-                st.markdown("Choose stopword option")
-            # cleaning steps
             col1, col2 = st.beta_columns([1, 3])
             with col1:
-                cleaning_steps_elem = st.empty()
-                reset_button = st.empty()
             with col2:
-                st.markdown("Choose cleaning steps")
             # implement reset logic
             if reset_button.button("Reset steps"):
                 session.run_id += 1
-            steps_options = list(TextPreprocessor._cleaning_options().keys())
-            cleaning_steps = cleaning_steps_elem.multiselect(
-                "Select text processing steps (ordered)",
                 options=steps_options,
-                default=steps_options,
                 format_func=lambda x: x.replace("_", " ").title(),
                 key=session.run_id,
             )
-            lemmatization_options = list(
-                TextPreprocessor._lemmatization_options().keys()
-            )
-            lemmatization_when = lemmatization_when_elem.selectbox(
-                "Select when lemmatization happens",
-                options=lemmatization_options,
-                index=0,
                 key=session.run_id,
             )
             remove_stopwords = remove_stopwords_elem.checkbox(
-                "Remove stopwords", value=True, key=session.run_id
             )
-        # Show sample checkbox
         col1, col2 = st.beta_columns([1, 2])
         with col1:
             show_sample = st.checkbox("Show sample of preprocessed text")
         # initialize text preprocessor
-        preprocessor = TextPreprocessor(
-            language=language,
-            cleaning_steps=cleaning_steps,
-            lemmatizer_when=lemmatization_when,
-            remove_stop=remove_stopwords,
         )
-        # 3. PROVIDE FEEDBACK ON OPTIONS
         if show_sample and not (label_column and text_column):
             st.warning("Please select `label` and `text` columns")
         elif show_sample and (label_column and text_column):
-            sample_data = data.sample(10)
-            sample_data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(
                 sample_data[text_column]
             ).values
-            st.table(
-                sample_data.loc[
-                    :, [label_column, text_column, f"preprocessed_{text_column}"]
-                ]
-            )
-        # 4. RUN
         run_button = st.button("Wordify!")
         if run_button and not (label_column and text_column):
             st.warning("Please select `label` and `text` columns")
@@ -157,7 +162,7 @@ def write(session, uploaded_file):
             with st.spinner("Process started"):
                 # data = data.head()
-                data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(
                     data[text_column]
                 ).values
@@ -168,7 +173,7 @@ def write(session, uploaded_file):
             # session.posdf, session.negdf = process(data, text_column, label_column)
             session.process = True
-        # 5. RESULTS
         if session.process and (label_column and text_column):
             st.markdown("")
             st.markdown("")
@@ -178,9 +183,7 @@ def write(session, uploaded_file):
             col1, col2, col3 = st.beta_columns([2, 3, 3])
             with col1:
-                label = st.selectbox(
-                    "Select label", data[label_column].unique().tolist()
-                )
                 # # with col2:
                 # thres = st.slider(
                 #     "Select threshold",

 from src.configs import Languages
+from src.utils import read_file, download_button
+from src.plotting import plot_labels_prop, plot_nchars, plot_score
+from src.preprocessing import Lemmatizer, PreprocessingPipeline, encode
 from src.wordifier import wordifier
 import streamlit as st
     elif uploaded_file:
+        # ==== 1. READ FILE ==== #
         with st.spinner("Reading file"):
             # TODO: write parser function that automatically understands format
             data = read_file(uploaded_file)
             language = st.selectbox("Select language", [i.name for i in Languages])
             with st.beta_expander("Description"):
                 st.markdown(
+                    f"Select a language amongst those supported: {', '.join([f'`{i.name}`' for i in Languages])}. This will be used to lemmatize and remove stopwords."
                 )
         with col2:
             cols_options = [""] + data.columns.tolist()
+            label_column = st.selectbox("Select label column name", cols_options, index=0)
             with st.beta_expander("Description"):
+                st.markdown("Select the column containing the labels.")
             if label_column:
                 plot = plot_labels_prop(data, label_column)
         with col3:
             text_column = st.selectbox("Select text column name", cols_options, index=0)
             with st.beta_expander("Description"):
+                st.markdown("Select the column containing the texts.")
             if text_column:
+                st.altair_chart(plot_nchars(data, text_column), use_container_width=True)
+        # ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
         with st.beta_expander("Advanced options"):
+            steps_options = list(PreprocessingPipeline.pipeline_components().keys())
+            # stopwords option and
             col1, col2 = st.beta_columns([1, 3])
             with col1:
+                st.markdown("Remove stopwords (uses Spacy vocabulary)")
             with col2:
+                remove_stopwords_elem = st.empty()
+            # lemmatization option
             col1, col2 = st.beta_columns([1, 3])
             with col1:
+                st.markdown("Lemmatizes text (uses Spacy)")
             with col2:
+                lemmatization_elem = st.empty()
+            # pre-lemmatization cleaning steps and
+            # post-lemmatization cleaning steps
             col1, col2 = st.beta_columns([1, 3])
             with col1:
+                st.markdown(
+                    f"""
+                    Define a pipeline of cleaning steps that is applied before and/or after lemmatization.
+                    The available cleaning steps are:\n
+                    {", ".join([f"`{x.replace('_', ' ').title()}`" for x in steps_options])}
+                    """
+                )
             with col2:
+                pre_steps_elem = st.empty()
+                post_steps_elem = st.empty()
+                reset_button = st.empty()
             # implement reset logic
             if reset_button.button("Reset steps"):
                 session.run_id += 1
+            pre_steps = pre_steps_elem.multiselect(
+                "Select pre-lemmatization preprocessing steps (ordered)",
                 options=steps_options,
+                default=steps_options[1:],
                 format_func=lambda x: x.replace("_", " ").title(),
                 key=session.run_id,
             )
+            post_steps = post_steps_elem.multiselect(
+                "Select post-lemmatization processing steps (ordered)",
+                options=steps_options,
+                default=steps_options[-4:],
+                format_func=lambda x: x.replace("_", " ").title(),
                 key=session.run_id,
             )
             remove_stopwords = remove_stopwords_elem.checkbox(
+                "Remove stopwords",
+                value=True,
+                key=session.run_id,
+            )
+            lemmatization = lemmatization_elem.checkbox(
+                "Lemmatize text",
+                value=True,
+                key=session.run_id,
             )
+        # show sample checkbox
         col1, col2 = st.beta_columns([1, 2])
         with col1:
             show_sample = st.checkbox("Show sample of preprocessed text")
         # initialize text preprocessor
+        preprocessing_pipeline = PreprocessingPipeline(
+            pre_steps=pre_steps,
+            lemmatizer=Lemmatizer(
+                language=language,
+                remove_stop=remove_stopwords,
+                lemmatization=lemmatization,
+            ),
+            post_steps=post_steps,
         )
+        # ==== 3. PROVIDE FEEDBACK ON OPTIONS ==== #
         if show_sample and not (label_column and text_column):
             st.warning("Please select `label` and `text` columns")
         elif show_sample and (label_column and text_column):
+            sample_data = data.sample(5)
+            sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
                 sample_data[text_column]
             ).values
+            st.table(sample_data.loc[:, [label_column, text_column, f"preprocessed_{text_column}"]])
+        # ==== 4. RUN ==== #
         run_button = st.button("Wordify!")
         if run_button and not (label_column and text_column):
             st.warning("Please select `label` and `text` columns")
             with st.spinner("Process started"):
                 # data = data.head()
+                data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
                     data[text_column]
                 ).values
             # session.posdf, session.negdf = process(data, text_column, label_column)
             session.process = True
+        # ==== 5. RESULTS ==== #
         if session.process and (label_column and text_column):
             st.markdown("")
             st.markdown("")
             col1, col2, col3 = st.beta_columns([2, 3, 3])
             with col1:
+                label = st.selectbox("Select label", data[label_column].unique().tolist())
                 # # with col2:
                 # thres = st.slider(
                 #     "Select threshold",

src/preprocessing.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import re
 import string
 from collections import OrderedDict
-from typing import Callable, Dict, List
 import numpy as np
 import pandas as pd
@@ -86,75 +86,102 @@ def normalize_repeating_words(t):
     return _re_wrep.sub(_replace_wrep, t)
 # fmt: on
-class TextPreprocessor:
-    def __init__(
-        self,
-        language: str,
-        cleaning_steps: List[str],
-        lemmatizer_when: str = "last",
-        remove_stop: bool = True,
-    ) -> None:
-        # prepare lemmatizer
         self.language = language
         self.nlp = spacy.load(
             Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
         )
-        self.lemmatizer_when = self._lemmatization_options().get(lemmatizer_when, None)
-        self.remove_stop = remove_stop
-        self._lemmatize = self._get_lemmatizer()
-        # prepare cleaning
-        self.cleaning_steps = [
-            self._cleaning_options()[step]
-            for step in cleaning_steps
-            if step in self._cleaning_options()
-        ]
-        self.cleaning_pipeline = (
-            make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x
-        )
-    def _get_lemmatizer(self) -> Callable:
         """Return the correct spacy Doc-level lemmatizer"""
-        if self.remove_stop:
-            def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
-                """Lemmatizes spacy Doc and removes stopwords"""
-                return " ".join(
-                    [t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop]
-                )
-        else:
-            def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
-                """Lemmatizes spacy Doc"""
                 return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
-        return lemmatizer
-    @staticmethod
-    def _lemmatization_options() -> Dict[str, str]:
-        return {
-            "Before preprocessing": "first",
-            "After preprocessing": "last",
-            "Never! Let's do it quick and dirty": None,
-        }
-    def lemmatizer(self, series: pd.Series) -> pd.Series:
         """
         Apply spacy pipeline to transform string to spacy Doc and applies lemmatization
         """
         res = []
-        pbar = stqdm(total=len(series))
         for doc in self.nlp.pipe(series, batch_size=500):
-            res.append(self._lemmatize(doc))
             pbar.update(1)
         pbar.close()
         return pd.Series(res)
     @staticmethod
-    def _cleaning_options():
         """Returns available cleaning steps in order"""
         return OrderedDict(
             [
@@ -184,19 +211,3 @@ class TextPreprocessor:
                 ("strip", lambda x: x.strip()),
             ]
         )
-    def fit_transform(self, series: pd.Series) -> Series:
-        """Applies text preprocessing"""
-        if self.lemmatizer_when == "first":
-            with st.spinner("Lemmatizing"):
-                series = self.lemmatizer(series)
-        with st.spinner("Cleaning"):
-            series = series.progress_map(self.cleaning_pipeline)
-        if self.lemmatizer_when == "last":
-            with st.spinner("Lemmatizing"):
-                series = self.lemmatizer(series)
-        return series

 import re
 import string
 from collections import OrderedDict
+from typing import Callable, List, Optional, Tuple
 import numpy as np
 import pandas as pd
     return _re_wrep.sub(_replace_wrep, t)
 # fmt: on
class Lemmatizer:
    """Lemmatize text and/or remove stop words using a spaCy pipeline."""

    def __init__(self, language: str, remove_stop: bool = True, lemmatization: bool = True) -> None:
        """Load the spaCy model for ``language`` and pick the Doc-level transform.

        Args:
            language: Name of a ``Languages`` member; its value is passed to ``spacy.load``.
            remove_stop: Drop tokens whose ``is_stop`` flag is set.
            lemmatization: Replace each token with its ``lemma_``.
        """
        self.language = language
        self.nlp = spacy.load(
            Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
        )
        # Always define ``status``; ``_get_lemmatization_fn`` flips it to False
        # when both options are off and there is nothing to do.
        self.status = True
        self._lemmatizer_fn = self._get_lemmatization_fn(remove_stop, lemmatization)
        self.lemmatization = lemmatization

    def _get_lemmatization_fn(self, remove_stop: bool, lemmatization: bool) -> Optional[Callable]:
        """Return the Doc-level transform matching the requested options.

        Args:
            remove_stop: Whether tokens flagged ``is_stop`` are dropped.
            lemmatization: Whether tokens are replaced by their ``lemma_``.

        Returns:
            A callable mapping an iterable of tokens to a space-joined string,
            or ``None`` (and ``self.status`` set to False) when both options
            are off.
        """
        if not (remove_stop or lemmatization):
            self.status = False
            return None

        def lemmatizer_fn(doc: "spacy.tokens.doc.Doc") -> str:
            kept = (t for t in doc if not (remove_stop and t.is_stop))
            if lemmatization:
                # "-PRON-" lemmas are filtered out whenever lemmatizing.
                return " ".join(t.lemma_ for t in kept if t.lemma_ != "-PRON-")
            # Stop-word removal only: join the token *text*; joining the
            # token objects themselves raises TypeError in str.join.
            return " ".join(t.text for t in kept)

        return lemmatizer_fn

    def __call__(self, series: pd.Series) -> pd.Series:
        """Run the spaCy pipeline over ``series`` and apply the selected transform.

        Args:
            series: Texts to process.

        Returns:
            A new ``pd.Series`` of processed strings with a default index, or
            ``series`` itself, unchanged, when no transform is configured.
        """
        if self._lemmatizer_fn is None:
            # Both options are off: nothing to do, and there is no callable to apply.
            return series

        res = []
        pbar = stqdm(total=len(series), desc="Lemmatizing")
        try:
            for doc in self.nlp.pipe(series, batch_size=500):
                res.append(self._lemmatizer_fn(doc))
                pbar.update(1)
        finally:
            # Close the progress bar even if processing fails midway.
            pbar.close()
        return pd.Series(res)
class PreprocessingPipeline:
    """Three-stage text pipeline: pre-cleaning, lemmatization, post-cleaning."""

    def __init__(self, pre_steps: List[str], lemmatizer: "Lemmatizer", post_steps: List[str]):
        """Build the three stages from step names and a lemmatizer.

        Args:
            pre_steps: Names of cleaning steps applied before lemmatization.
            lemmatizer: Callable lemmatizer exposing a ``lemmatization`` flag.
            post_steps: Names of cleaning steps applied after lemmatization.
        """
        # build pipeline
        self.pre_pipeline, self.lemmatizer, self.post_pipeline = self.make_pipeline(
            pre_steps, lemmatizer, post_steps
        )

    def __call__(self, series: pd.Series) -> pd.Series:
        """Apply the three stages in order, each on the previous stage's output.

        Args:
            series: Texts to process; must provide ``progress_map``.

        Returns:
            The fully processed series.
        """
        with st.spinner("Pre-lemmatization cleaning"):
            res = series.progress_map(self.pre_pipeline)

        with st.spinner("Lemmatizing"):
            # Feed the cleaned text forward; passing the raw ``series`` here
            # would discard the pre-lemmatization cleaning.
            res = self.lemmatizer(res)

        with st.spinner("Post-lemmatization cleaning"):
            # Same chaining: clean the lemmatized output, not the raw input.
            res = res.progress_map(self.post_pipeline)

        return res

    def make_pipeline(
        self, pre_steps: List[str], lemmatizer: "Lemmatizer", post_steps: List[str]
    ) -> Tuple[Callable, Callable, Callable]:
        """Turn step names and a lemmatizer into three callables.

        Step names missing from ``pipeline_components()`` are ignored. An
        empty step list yields an identity function.

        Returns:
            ``(pre_pipeline, lemmatizer, post_pipeline)``.
        """
        # Build the registry once instead of once per lookup.
        components = self.pipeline_components()

        # pre-lemmatization steps
        pre_fns = [components[step] for step in pre_steps if step in components]
        pre_pipeline = make_pipeline(*pre_fns) if pre_fns else lambda x: x

        # lemmatization
        # NOTE(review): the lemmatizer is bypassed whenever ``lemmatization``
        # is false, so a stop-word-only configuration is skipped as well -
        # confirm whether that is intended.
        lemmatizer = lemmatizer if lemmatizer.lemmatization else lambda x: x

        # post lemmatization steps
        post_fns = [components[step] for step in post_steps if step in components]
        post_pipeline = make_pipeline(*post_fns) if post_fns else lambda x: x

        return pre_pipeline, lemmatizer, post_pipeline

    @staticmethod
    def pipeline_components() -> "OrderedDict[str, Callable]":
        """Returns available cleaning steps in order"""
        # NOTE(review): only the final entry is visible in this view of the
        # file; any entries that precede "strip" must be kept as they are.
        return OrderedDict(
            [
                ("strip", lambda x: x.strip()),
            ]
        )