Spaces: Build error

Pietro Lesci committed · Commit b3ecaa7
Parent(s): b482a79

add support for chinese
Browse files
- .streamlit/config.toml  +1 -1
- app.py  +3 -2
- data/test_chinese.xlsx  +0 -0
- requirements.txt  +3 -0
- src/components.py  +52 -5
- src/configs.py  +4 -3
- src/preprocessing.py  +54 -14
- src/utils.py  +4 -3
- tests/notebook.ipynb  +66 -191
.streamlit/config.toml CHANGED

@@ -1,7 +1,7 @@
 [server]
 # Max size, in megabytes, for files uploaded with the file_uploader.
 # Default: 200
-maxUploadSize =
+maxUploadSize = 10

 [browser]
 gatherUsageStats = false
app.py CHANGED

@@ -1,6 +1,7 @@
 import streamlit as st

-from src.components import faq, footer, form, presentation
+from src.components import analysis, docs, faq, footer, form, presentation
+from src.configs import SupportedFiles
 from src.utils import convert_df, get_logo, read_file

 # app configs
@@ -25,7 +26,7 @@ st.title("Wordify")
 # file uploader
 uploaded_fl = st.sidebar.file_uploader(
     label="Choose a file",
-    type=[
+    type=[i.name for i in SupportedFiles],
    accept_multiple_files=False,
    help="""
    Supported formats:
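The uploader's whitelist is now derived from the `SupportedFiles` enum instead of a hard-coded list, so adding a format in `src/configs.py` automatically updates the UI. A minimal sketch of the mechanism, with toy values standing in for the real reader callables:

    from enum import Enum


    class SupportedFiles(Enum):  # toy stand-in for the enum in src/configs.py
        xlsx = ("xlsx reader",)
        csv = ("csv reader",)


    # Enum member names are the file extensions themselves:
    print([i.name for i in SupportedFiles])  # ['xlsx', 'csv']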
data/test_chinese.xlsx ADDED

Binary file (580 kB)
requirements.txt CHANGED

@@ -37,3 +37,6 @@ https://github.com/explosion/spacy-models/releases/download/ro_core_news_sm-3.2.
 https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0.tar.gz#egg=ru_core_news_sm
 # multi-language
 https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.2.0/xx_ent_wiki_sm-3.2.0.tar.gz#egg=xx_ent_wiki_sm
+# chinese
+https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.2.0/zh_core_web_sm-3.2.0.tar.gz#egg=zh_core_web_sm
+
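Pinning the spaCy model as a direct release URL with an `#egg=` fragment keeps `pip install -r requirements.txt` reproducible and makes the model loadable by its package name. A minimal sketch of loading it, assuming the requirements have been installed; the `disable` list mirrors what `src/preprocessing.py` passes to `spacy.load`:

    import spacy

    # Load the pinned Chinese model; parser and NER are not needed here.
    nlp = spacy.load("zh_core_web_sm", disable=["parser", "ner"])
    doc = nlp("这是一个测试")  # the model handles Chinese word segmentation
    print([token.text for token in doc])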
src/components.py CHANGED

@@ -1,11 +1,12 @@
-import streamlit as st
 import time
+
 import pandas as pd
+import streamlit as st

-from src.configs import Languages, PreprocessingConfigs, SupportedFiles
+from src.configs import ColumnNames, Languages, PreprocessingConfigs, SupportedFiles
 from src.preprocessing import PreprocessingPipeline
-from src.wordifier import input_transform, output_transform, wordifier
 from src.utils import get_col_indices
+from src.wordifier import input_transform, output_transform, wordifier


 def docs():
@@ -78,7 +79,7 @@ def form(df):
         "Select lemmatization",
         options=lammatization_options,
         index=PreprocessingConfigs.DEFAULT_LEMMA.value,
-        help="Select lemmatization procedure",
+        help="Select lemmatization procedure. This is automatically disabled when the selected language is Chinese or MultiLanguage.",
     )

     post_steps = st.multiselect(
@@ -98,6 +99,11 @@ def form(df):

         start_time = time.time()

+        # warnings about inputs
+        language_specific_warnings(
+            pre_steps, post_steps, lemmatization_step, language
+        )
+
         # preprocess
         if not disable_preprocessing:
             with st.spinner("Step 1/4: Preprocessing text"):
@@ -109,7 +115,10 @@ def form(df):
             with st.spinner(
                 "Step 1/4: Preprocessing has been disabled - doing nothing"
             ):
-
+                df = df.rename(
+                    columns={text_column: ColumnNames.PROCESSED_TEXT.value}
+                )
+                time.sleep(1.2)

         # prepare input
         with st.spinner("Step 2/4: Preparing inputs"):
@@ -260,6 +269,15 @@ def presentation():
        you provide a file following this naming convention, Wordify will automatically select the
        correct columns. However, if you wish to use a different nomenclature, you will be asked to
        provide the column names in the interactive UI.
+
+        - Maintain a stable connection with the Wordify page until you download the data. If you refresh the page,
+        a new Wordify session is created and your progress is lost.
+
+        - Wordify performance depends on the length of the individual texts in your file. The longer the texts, the higher
+        the chance that Wordify considers many n-grams. More n-grams means more data to analyse in each run.
+        We tailored Wordify performance for files of approximately 5'000 lines or 50k n-grams. In such cases we expect a runtime
+        between 90 seconds and 10 minutes. If your file is big, try to apply a stricter preprocessing of the text in the `Advanced Options` section.
+        If this is not enough, please do feel free to reach out to us directly so we can help.
        """
    )
@@ -377,3 +395,32 @@ def analysis(outputs):
     st.write(meta_data["labels"])

     return subset_df
+
+
+# warning for Chinese and MultiLanguage
+def language_specific_warnings(pre_steps, post_steps, lemmatization_step, language):
+
+    if language in ("MultiLanguage", "Chinese") and (
+        "remove_non_words" in pre_steps or "remove_non_words" in post_steps
+    ):
+        msg = """
+        NOTE: for Chinese and MultiLanguage we automatically substitute `remove_non_words` with
+        `remove_numbers` and `remove_punctuation` to avoid wrong results.
+        """
+        st.info(msg)
+
+    msg = "NOTE: for Chinese and MultiLanguage we turn off lemmatization automatically."
+    if lemmatization_step == "Spacy lemmatizer (keep stopwords)" and language in (
+        "MultiLanguage",
+        "Chinese",
+    ):
+        st.info(msg)
+
+    elif lemmatization_step == "Spacy lemmatizer (remove stopwords)" and language in (
+        "MultiLanguage",
+        "Chinese",
+    ):
+        st.info(
+            msg
+            + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
+        )
src/configs.py CHANGED

@@ -25,7 +25,7 @@ class InputTransformConfigs(Enum):


 class PreprocessingConfigs(Enum):
-    DEFAULT_PRE = [1, 14, 2, 3, 4,
+    DEFAULT_PRE = [1, 14, 2, 3, 4, 5, 23, 22, 21, 24]
     DEFAULT_LEMMA = 1
     DEFAULT_POST = [0, 17, 15, 19, 23, 22, 21, 24]

@@ -39,7 +39,6 @@ class Languages(Enum):
     Dutch = "nl_core_news_sm"
     Portuguese = "pt_core_news_sm"
     French = "fr_core_news_sm"
-    # Chinese = "zh_core_news_sm"
     Danish = "da_core_news_sm"
     # Japanese = "ja_core_news_sm"
     Lithuanian = "lt_core_news_sm"
@@ -48,9 +47,11 @@ class Languages(Enum):
     Romanian = "ro_core_news_sm"
     Russian = "ru_core_news_sm"
     MultiLanguage = "xx_ent_wiki_sm"
+    Chinese = "zh_core_web_sm"


 class SupportedFiles(Enum):
     xlsx = (lambda x: pd.read_excel(x, dtype=str),)
-
+    tsv = (lambda x: pd.read_csv(x, dtype=str, sep="\t"),)
+    csv = (lambda x: pd.read_csv(x, dtype=str, sep=","),)
     parquet = (lambda x: pd.read_parquet(x),)
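Each `SupportedFiles` value is a one-element tuple wrapping the reader callable (the trailing comma matters), so the enum doubles as a dispatch table keyed by file extension. The body of `read_file` in `src/utils.py` is not shown in this diff; the following is an assumed usage pattern, not the actual implementation:

    from enum import Enum

    import pandas as pd


    class SupportedFiles(Enum):  # as defined in src/configs.py above
        xlsx = (lambda x: pd.read_excel(x, dtype=str),)
        tsv = (lambda x: pd.read_csv(x, dtype=str, sep="\t"),)
        csv = (lambda x: pd.read_csv(x, dtype=str, sep=","),)
        parquet = (lambda x: pd.read_parquet(x),)


    def read_file(uploaded_file):
        # Hypothetical dispatch: look the extension up in the enum and
        # unpack the 1-tuple to obtain the reader callable.
        extension = uploaded_file.name.split(".")[-1]
        read_fn = SupportedFiles[extension].value[0]
        return read_fn(uploaded_file)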
src/preprocessing.py CHANGED

@@ -3,11 +3,9 @@ import os
 import re
 import string
 from collections import OrderedDict
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, Union

-import pandas as pd
 import spacy
-import streamlit as st
 import vaex
 from pandas.core.frame import DataFrame
 from pandas.core.series import Series
@@ -99,14 +97,10 @@ class PreprocessingPipeline:
         self.lemmatization_step = lemmatization_step
         self.post_steps = post_steps

-        self.
-
-
-
-        )
-        self.pre = self.make_pipe_component(self.pre_steps)
-        self.post = self.make_pipe_component(self.post_steps)
-        self.lemma = self.lemmatization_component().get(self.lemmatization_step)
+        self.pre = self.make_pipe_component(self.pre_steps, self.language)
+        self.post = self.make_pipe_component(self.post_steps, self.language)
+        self.nlp = self.make_nlp(self.lemmatization_step, self.language)
+        self.lemma = self.make_lemma(self.lemmatization_step, self.language)

     # def apply_multiproc(fn, series):
     #     with mp.Pool(mp.cpu_count()) as pool:
@@ -148,13 +142,59 @@ class PreprocessingPipeline:

     #     return series

-
+    @classmethod
+    def make_pipe_component(cls, steps: Optional[List[str]], language: str) -> Callable:
         if not steps:
             return identity
-
+
+        elif language in ("MultiLanguage", "Chinese") and "remove_non_words" in steps:
+            idx = steps.index("remove_non_words")
+            steps = (
+                steps[:idx]
+                + ["remove_numbers", "remove_punctuation"]
+                + steps[idx + 1 :]
+            )
+
+        components = [cls.pipeline_components()[step] for step in steps]

         return make_pipeline(*components)

+    @staticmethod
+    def make_nlp(
+        lemmatization_step: Optional[str], language: str
+    ) -> Union[spacy.language.Language, Callable]:
+        if (
+            lemmatization_step is None
+            or lemmatization_step == "Disable lemmatizer"
+            or (
+                lemmatization_step == "Spacy lemmatizer (keep stopwords)"
+                and language in ("MultiLanguage", "Chinese")
+            )
+        ):
+            return identity
+        return spacy.load(Languages[language].value, disable=["parser", "ner"])
+
+    @classmethod
+    def make_lemma(cls, lemmatization_step: Optional[str], language: str) -> Callable:
+
+        if (
+            lemmatization_step is None
+            or lemmatization_step == "Disable lemmatizer"
+            or (
+                lemmatization_step == "Spacy lemmatizer (keep stopwords)"
+                and language in ("MultiLanguage", "Chinese")
+            )
+        ):
+            return identity
+
+        elif (
+            lemmatization_step == "Spacy lemmatizer (remove stopwords)"
+            and language in ("MultiLanguage", "Chinese")
+        ):
+            return cls.lemmatization_component().get("Remove stopwords")
+
+        return cls.lemmatization_component().get(lemmatization_step)
+
     @staticmethod
     def pipeline_components() -> "OrderedDict[str, Callable]":
         """Returns available cleaning steps in order"""
@@ -193,7 +233,7 @@ class PreprocessingPipeline:
         return OrderedDict(
             [
                 ("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
-                ("Spacy lemmatizer (
+                ("Spacy lemmatizer (remove stopwords)", lemmatize_remove_stopwords),
                 ("Disable lemmatizer", identity),
                 ("Remove stopwords", remove_stopwords),
             ]
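The key Chinese-specific behaviour in `make_pipe_component` is the in-place splice of `remove_non_words`, presumably because a Latin-oriented non-word regex would strip CJK characters wholesale. A standalone illustration of the splice:

    # Same list surgery that make_pipe_component performs for Chinese/MultiLanguage:
    # the offending step is cut out and two safer steps take its position.
    steps = ["normalize_useless_spaces", "remove_non_words", "lowercase"]

    idx = steps.index("remove_non_words")
    steps = steps[:idx] + ["remove_numbers", "remove_punctuation"] + steps[idx + 1 :]

    print(steps)
    # ['normalize_useless_spaces', 'remove_numbers', 'remove_punctuation', 'lowercase']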
src/utils.py CHANGED

@@ -1,12 +1,13 @@
 import base64
 from typing import List, Tuple
-
+
 import streamlit as st
+from pandas.core.frame import DataFrame
 from PIL import Image

-
+from .configs import ColumnNames, SupportedFiles

-
+# import altair as alt


 def get_col_indices(cols: List) -> Tuple[int, int]:
tests/notebook.ipynb CHANGED

@@ -21,7 +21,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.read_csv(\"../data/test_en.csv\")"
+    "# df = pd.read_csv(\"../data/test_en.csv\")\n",
+    "df = pd.read_excel(\"../data/test_chinese.xlsx\")"
    ]
   },
   {
@@ -36,10 +37,10 @@
    " \"normalize_bullet_points\",\n",
    " \"normalize_hyphenated_words\",\n",
    " \"normalize_quotation_marks\",\n",
-    " \"
+    " \"normalize_whitespaces\",\n",
    " \"normalize_repeating_words\",\n",
    " \"normalize_repeating_chars\",\n",
-    " \"
+    " \"normalize_useless_spaces\",\n",
    " # \"replace_currency_symbols\",\n",
    " # \"replace_emails\",\n",
    " # \"replace_emojis\",\n",
@@ -60,7 +61,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -74,8 +75,8 @@
    " # \"replace_emojis\",\n",
    " # \"replace_phone_numbers\",\n",
    " # \"replace_numbers\",\n",
-    " \"remove_html_tags\",\n",
-    " \"remove_accents\",\n",
+    " # \"remove_html_tags\",\n",
+    " # \"remove_accents\",\n",
    " # \"remove_brackets\",\n",
    " \"remove_non_words\",\n",
    " # \"remove_numbers\",\n",
@@ -89,13 +90,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
    "pipe = PreprocessingPipeline(\n",
-    " language=\"
-    " lemmatization_step=\"Spacy lemmatizer (
+    " language=\"Chinese\",\n",
+    " lemmatization_step=\"Spacy lemmatizer (keep stopwords)\", # \"Disable lemmatizer\",\n",
    " pre_steps=pre_steps,\n",
    " post_steps=post_steps,\n",
    ")"
@@ -103,218 +104,125 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "
+       "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
       ]
      },
-     "execution_count":
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "
+    "df.text[0]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'
+       "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
       ]
      },
-     "execution_count":
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pipe.
+    "pipe.pre(df.text[0])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'
+       "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
       ]
      },
-     "execution_count":
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pipe.
+    "pipe.lemma(pipe.nlp(pipe.pre(df.text[0])))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 16,
    "metadata": {},
-   "outputs": [
-    [...]
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 mp 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "
+    "pipe.post(pipe.lemma(pipe.nlp(pipe.pre(df.text[0]))))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       [removed: HTML table markup previewing the processed English DataFrame, 5000 rows × 3 columns (label, text, processed_text)]
-      ],
       "text/plain": [
-       [removed: plain-text preview of the same 5000 rows × 3 columns DataFrame]
+       "Compose(<function strip at 0x7ff4894750e0>, <function normalize_useless_spaces at 0x7ff48946eef0>, <function normalize_repeating_chars at 0x7ff48946ef80>, <function normalize_repeating_words at 0x7ff4871a7170>, <function punctuation at 0x7ff48946e4d0>, <function remove_numbers at 0x7ff4894754d0>, <function lowercase at 0x7ff489475050>)"
       ]
      },
-     "execution_count":
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
+   "source": [
+    "pipe.post"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "odf = pipe.vaex_process(df, \"text\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
    "odf"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -325,40 +233,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[1, 14, 2, 3, 4, 21, 23, 22, 5, 24]"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
    "default_pre_steps_idx"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[0, 17, 15, 19, 23, 22, 21, 24]"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
    "default_post_steps_idx"
    ]
@@ -383,7 +269,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -392,7 +278,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -401,20 +287,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Mimmo '"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
    "_re_non_words.sub(\" \", \"Mimmo23\")"
    ]