Spaces: Build error

Pietro Lesci committed · Commit b3ecaa7
Parent(s): b482a79

add support for chinese
Browse files
- .streamlit/config.toml  +1 -1
- app.py  +3 -2
- data/test_chinese.xlsx  +0 -0
- requirements.txt  +3 -0
- src/components.py  +52 -5
- src/configs.py  +4 -3
- src/preprocessing.py  +54 -14
- src/utils.py  +4 -3
- tests/notebook.ipynb  +66 -191
.streamlit/config.toml CHANGED

@@ -1,7 +1,7 @@
 [server]
 # Max size, in megabytes, for files uploaded with the file_uploader.
 # Default: 200
-maxUploadSize =
+maxUploadSize = 10

 [browser]
 gatherUsageStats = false
app.py CHANGED

@@ -1,6 +1,7 @@
 import streamlit as st

-from src.components import faq, footer, form, presentation
+from src.components import analysis, docs, faq, footer, form, presentation
+from src.configs import SupportedFiles
 from src.utils import convert_df, get_logo, read_file

 # app configs
@@ -25,7 +26,7 @@ st.title("Wordify")
 # file uploader
 uploaded_fl = st.sidebar.file_uploader(
     label="Choose a file",
-    type=[
+    type=[i.name for i in SupportedFiles],
    accept_multiple_files=False,
    help="""
    Supported formats:
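The uploader's whitelist is now derived from the `SupportedFiles` enum instead of a hard-coded list, so adding a format in `src/configs.py` automatically updates the UI. A minimal sketch of the mechanism, with toy values standing in for the real reader callables:

    from enum import Enum


    class SupportedFiles(Enum):  # toy stand-in for the enum in src/configs.py
        xlsx = ("xlsx reader",)
        csv = ("csv reader",)


    # Enum member names are the file extensions themselves:
    print([i.name for i in SupportedFiles])  # ['xlsx', 'csv']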
data/test_chinese.xlsx ADDED

Binary file (580 kB)
requirements.txt CHANGED

@@ -37,3 +37,6 @@ https://github.com/explosion/spacy-models/releases/download/ro_core_news_sm-3.2.
 https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0.tar.gz#egg=ru_core_news_sm
 # multi-language
 https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.2.0/xx_ent_wiki_sm-3.2.0.tar.gz#egg=xx_ent_wiki_sm
+# chinese
+https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.2.0/zh_core_web_sm-3.2.0.tar.gz#egg=zh_core_web_sm
+
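Pinning the spaCy model as a direct release URL with an `#egg=` fragment keeps `pip install -r requirements.txt` reproducible and makes the model loadable by its package name. A minimal sketch of loading it, assuming the requirements have been installed; the `disable` list mirrors what `src/preprocessing.py` passes to `spacy.load`:

    import spacy

    # Load the pinned Chinese model; parser and NER are not needed here.
    nlp = spacy.load("zh_core_web_sm", disable=["parser", "ner"])
    doc = nlp("这是一个测试")  # the model handles Chinese word segmentation
    print([token.text for token in doc])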
src/components.py CHANGED

@@ -1,11 +1,12 @@
-import streamlit as st
 import time
+
 import pandas as pd
+import streamlit as st

-from src.configs import Languages, PreprocessingConfigs, SupportedFiles
+from src.configs import ColumnNames, Languages, PreprocessingConfigs, SupportedFiles
 from src.preprocessing import PreprocessingPipeline
-from src.wordifier import input_transform, output_transform, wordifier
 from src.utils import get_col_indices
+from src.wordifier import input_transform, output_transform, wordifier


 def docs():
@@ -78,7 +79,7 @@ def form(df):
         "Select lemmatization",
         options=lammatization_options,
         index=PreprocessingConfigs.DEFAULT_LEMMA.value,
-        help="Select lemmatization procedure",
+        help="Select lemmatization procedure. This is automatically disabled when the selected language is Chinese or MultiLanguage.",
     )

     post_steps = st.multiselect(
@@ -98,6 +99,11 @@ def form(df):

         start_time = time.time()

+        # warnings about inputs
+        language_specific_warnings(
+            pre_steps, post_steps, lemmatization_step, language
+        )
+
         # preprocess
         if not disable_preprocessing:
             with st.spinner("Step 1/4: Preprocessing text"):
@@ -109,7 +115,10 @@ def form(df):
             with st.spinner(
                 "Step 1/4: Preprocessing has been disabled - doing nothing"
             ):
-
+                df = df.rename(
+                    columns={text_column: ColumnNames.PROCESSED_TEXT.value}
+                )
+                time.sleep(1.2)

         # prepare input
         with st.spinner("Step 2/4: Preparing inputs"):
@@ -260,6 +269,15 @@ def presentation():
        you provide a file following this naming convention, Wordify will automatically select the
        correct columns. However, if you wish to use a different nomenclature, you will be asked to
        provide the column names in the interactive UI.
+
+        - Maintain a stable connection with the Wordify page until you download the data. If you refresh the page,
+        a new Wordify session is created and your progress is lost.
+
+        - Wordify performance depends on the length of the individual texts in your file. The longer the texts, the higher
+        the chance that Wordify considers many n-grams. More n-grams means more data to analyse in each run.
+        We tailored Wordify performance for files of approximately 5'000 lines or 50k n-grams. In such cases we expect a runtime
+        between 90 seconds and 10 minutes. If your file is big, try to apply a stricter preprocessing of the text in the `Advanced Options` section.
+        If this is not enough, please do feel free to reach out to us directly so we can help.
        """
    )
@@ -377,3 +395,32 @@ def analysis(outputs):
     st.write(meta_data["labels"])

     return subset_df
+
+
+# warning for Chinese and MultiLanguage
+def language_specific_warnings(pre_steps, post_steps, lemmatization_step, language):
+
+    if language in ("MultiLanguage", "Chinese") and (
+        "remove_non_words" in pre_steps or "remove_non_words" in post_steps
+    ):
+        msg = """
+        NOTE: for Chinese and MultiLanguage we automatically substitute `remove_non_words` with
+        `remove_numbers` and `remove_punctuation` to avoid wrong results.
+        """
+        st.info(msg)
+
+    msg = "NOTE: for Chinese and MultiLanguage we turn off lemmatization automatically."
+    if lemmatization_step == "Spacy lemmatizer (keep stopwords)" and language in (
+        "MultiLanguage",
+        "Chinese",
+    ):
+        st.info(msg)
+
+    elif lemmatization_step == "Spacy lemmatizer (remove stopwords)" and language in (
+        "MultiLanguage",
+        "Chinese",
+    ):
+        st.info(
+            msg
+            + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
+        )
src/configs.py CHANGED

@@ -25,7 +25,7 @@ class InputTransformConfigs(Enum):


 class PreprocessingConfigs(Enum):
-    DEFAULT_PRE = [1, 14, 2, 3, 4,
+    DEFAULT_PRE = [1, 14, 2, 3, 4, 5, 23, 22, 21, 24]
     DEFAULT_LEMMA = 1
     DEFAULT_POST = [0, 17, 15, 19, 23, 22, 21, 24]

@@ -39,7 +39,6 @@ class Languages(Enum):
     Dutch = "nl_core_news_sm"
     Portuguese = "pt_core_news_sm"
     French = "fr_core_news_sm"
-    # Chinese = "zh_core_news_sm"
     Danish = "da_core_news_sm"
     # Japanese = "ja_core_news_sm"
     Lithuanian = "lt_core_news_sm"
@@ -48,9 +47,11 @@ class Languages(Enum):
     Romanian = "ro_core_news_sm"
     Russian = "ru_core_news_sm"
     MultiLanguage = "xx_ent_wiki_sm"
+    Chinese = "zh_core_web_sm"


 class SupportedFiles(Enum):
     xlsx = (lambda x: pd.read_excel(x, dtype=str),)
-
+    tsv = (lambda x: pd.read_csv(x, dtype=str, sep="\t"),)
+    csv = (lambda x: pd.read_csv(x, dtype=str, sep=","),)
     parquet = (lambda x: pd.read_parquet(x),)
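Each `SupportedFiles` value is a one-element tuple wrapping the reader callable (the trailing comma matters), so the enum doubles as a dispatch table keyed by file extension. The body of `read_file` in `src/utils.py` is not shown in this diff; the following is an assumed usage pattern, not the actual implementation:

    from enum import Enum

    import pandas as pd


    class SupportedFiles(Enum):  # as defined in src/configs.py above
        xlsx = (lambda x: pd.read_excel(x, dtype=str),)
        tsv = (lambda x: pd.read_csv(x, dtype=str, sep="\t"),)
        csv = (lambda x: pd.read_csv(x, dtype=str, sep=","),)
        parquet = (lambda x: pd.read_parquet(x),)


    def read_file(uploaded_file):
        # Hypothetical dispatch: look the extension up in the enum and
        # unpack the 1-tuple to obtain the reader callable.
        extension = uploaded_file.name.split(".")[-1]
        read_fn = SupportedFiles[extension].value[0]
        return read_fn(uploaded_file)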
src/preprocessing.py CHANGED

@@ -3,11 +3,9 @@ import os
 import re
 import string
 from collections import OrderedDict
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, Union

-import pandas as pd
 import spacy
-import streamlit as st
 import vaex
 from pandas.core.frame import DataFrame
 from pandas.core.series import Series
@@ -99,14 +97,10 @@ class PreprocessingPipeline:
         self.lemmatization_step = lemmatization_step
         self.post_steps = post_steps

-        self.
-
-
-
-        )
-        self.pre = self.make_pipe_component(self.pre_steps)
-        self.post = self.make_pipe_component(self.post_steps)
-        self.lemma = self.lemmatization_component().get(self.lemmatization_step)
+        self.pre = self.make_pipe_component(self.pre_steps, self.language)
+        self.post = self.make_pipe_component(self.post_steps, self.language)
+        self.nlp = self.make_nlp(self.lemmatization_step, self.language)
+        self.lemma = self.make_lemma(self.lemmatization_step, self.language)

     # def apply_multiproc(fn, series):
     #     with mp.Pool(mp.cpu_count()) as pool:
@@ -148,13 +142,59 @@ class PreprocessingPipeline:

     #     return series

-
+    @classmethod
+    def make_pipe_component(cls, steps: Optional[List[str]], language: str) -> Callable:
         if not steps:
             return identity
-
+
+        elif language in ("MultiLanguage", "Chinese") and "remove_non_words" in steps:
+            idx = steps.index("remove_non_words")
+            steps = (
+                steps[:idx]
+                + ["remove_numbers", "remove_punctuation"]
+                + steps[idx + 1 :]
+            )
+
+        components = [cls.pipeline_components()[step] for step in steps]

         return make_pipeline(*components)

+    @staticmethod
+    def make_nlp(
+        lemmatization_step: Optional[str], language: str
+    ) -> Union[spacy.language.Language, Callable]:
+        if (
+            lemmatization_step is None
+            or lemmatization_step == "Disable lemmatizer"
+            or (
+                lemmatization_step == "Spacy lemmatizer (keep stopwords)"
+                and language in ("MultiLanguage", "Chinese")
+            )
+        ):
+            return identity
+        return spacy.load(Languages[language].value, disable=["parser", "ner"])
+
+    @classmethod
+    def make_lemma(cls, lemmatization_step: Optional[str], language: str) -> Callable:
+
+        if (
+            lemmatization_step is None
+            or lemmatization_step == "Disable lemmatizer"
+            or (
+                lemmatization_step == "Spacy lemmatizer (keep stopwords)"
+                and language in ("MultiLanguage", "Chinese")
+            )
+        ):
+            return identity
+
+        elif (
+            lemmatization_step == "Spacy lemmatizer (remove stopwords)"
+            and language in ("MultiLanguage", "Chinese")
+        ):
+            return cls.lemmatization_component().get("Remove stopwords")
+
+        return cls.lemmatization_component().get(lemmatization_step)
+
     @staticmethod
     def pipeline_components() -> "OrderedDict[str, Callable]":
         """Returns available cleaning steps in order"""
@@ -193,7 +233,7 @@ class PreprocessingPipeline:
         return OrderedDict(
             [
                 ("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
-                ("Spacy lemmatizer (
+                ("Spacy lemmatizer (remove stopwords)", lemmatize_remove_stopwords),
                 ("Disable lemmatizer", identity),
                 ("Remove stopwords", remove_stopwords),
             ]
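The key Chinese-specific behaviour in `make_pipe_component` is the in-place splice of `remove_non_words`, presumably because a Latin-oriented non-word regex would strip CJK characters wholesale. A standalone illustration of the splice:

    # Same list surgery that make_pipe_component performs for Chinese/MultiLanguage:
    # the offending step is cut out and two safer steps take its position.
    steps = ["normalize_useless_spaces", "remove_non_words", "lowercase"]

    idx = steps.index("remove_non_words")
    steps = steps[:idx] + ["remove_numbers", "remove_punctuation"] + steps[idx + 1 :]

    print(steps)
    # ['normalize_useless_spaces', 'remove_numbers', 'remove_punctuation', 'lowercase']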
src/utils.py CHANGED

@@ -1,12 +1,13 @@
 import base64
 from typing import List, Tuple
-
+
 import streamlit as st
+from pandas.core.frame import DataFrame
 from PIL import Image

-
+from .configs import ColumnNames, SupportedFiles

-
+# import altair as alt


 def get_col_indices(cols: List) -> Tuple[int, int]:
tests/notebook.ipynb CHANGED

@@ -21,7 +21,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.read_csv(\"../data/test_en.csv\")"
+    "# df = pd.read_csv(\"../data/test_en.csv\")\n",
+    "df = pd.read_excel(\"../data/test_chinese.xlsx\")"
    ]
   },
   {
@@ -36,10 +37,10 @@
    " \"normalize_bullet_points\",\n",
    " \"normalize_hyphenated_words\",\n",
    " \"normalize_quotation_marks\",\n",
-    " \"
+    " \"normalize_whitespaces\",\n",
    " \"normalize_repeating_words\",\n",
    " \"normalize_repeating_chars\",\n",
-    " \"
+    " \"normalize_useless_spaces\",\n",
    " # \"replace_currency_symbols\",\n",
    " # \"replace_emails\",\n",
    " # \"replace_emojis\",\n",
@@ -60,7 +61,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -74,8 +75,8 @@
    " # \"replace_emojis\",\n",
    " # \"replace_phone_numbers\",\n",
    " # \"replace_numbers\",\n",
-    " \"remove_html_tags\",\n",
-    " \"remove_accents\",\n",
+    " # \"remove_html_tags\",\n",
+    " # \"remove_accents\",\n",
    " # \"remove_brackets\",\n",
    " \"remove_non_words\",\n",
    " # \"remove_numbers\",\n",
@@ -89,13 +90,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
    "pipe = PreprocessingPipeline(\n",
-    " language=\"
-    " lemmatization_step=\"Spacy lemmatizer (
+    " language=\"Chinese\",\n",
+    " lemmatization_step=\"Spacy lemmatizer (keep stopwords)\", # \"Disable lemmatizer\",\n",
    " pre_steps=pre_steps,\n",
    " post_steps=post_steps,\n",
    ")"
@@ -103,218 +104,125 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "
+       "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
       ]
      },
-     "execution_count":
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "
+    "df.text[0]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'
+       "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
       ]
      },
-     "execution_count":
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pipe.
+    "pipe.pre(df.text[0])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'
+       "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
       ]
      },
-     "execution_count":
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pipe.
+    "pipe.lemma(pipe.nlp(pipe.pre(df.text[0])))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 16,
    "metadata": {},
-   "outputs": [
-    [...]
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 mp 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "
+    "pipe.post(pipe.lemma(pipe.nlp(pipe.pre(df.text[0]))))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       [removed: HTML table markup previewing the processed English DataFrame, 5000 rows × 3 columns (label, text, processed_text)]
-      ],
       "text/plain": [
-       [removed: plain-text preview of the same 5000 rows × 3 columns DataFrame]
+       "Compose(<function strip at 0x7ff4894750e0>, <function normalize_useless_spaces at 0x7ff48946eef0>, <function normalize_repeating_chars at 0x7ff48946ef80>, <function normalize_repeating_words at 0x7ff4871a7170>, <function punctuation at 0x7ff48946e4d0>, <function remove_numbers at 0x7ff4894754d0>, <function lowercase at 0x7ff489475050>)"
       ]
      },
-     "execution_count":
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
+   "source": [
+    "pipe.post"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "odf = pipe.vaex_process(df, \"text\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
    "odf"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -325,40 +233,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[1, 14, 2, 3, 4, 21, 23, 22, 5, 24]"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
    "default_pre_steps_idx"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[0, 17, 15, 19, 23, 22, 21, 24]"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
    "default_post_steps_idx"
    ]
@@ -383,7 +269,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -392,7 +278,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -401,20 +287,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Mimmo '"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
    "_re_non_words.sub(\" \", \"Mimmo23\")"
    ]