Commit dc4ad9e · Pietro Lesci committed
1 Parent(s): 13c6837

formatting
Files changed:
- Makefile              +9  -1
- src/components.py    +46 -14
- src/preprocessing.py  +0 -26
- src/utils.py          +0 -80
Makefile
CHANGED
@@ -1,6 +1,7 @@
 # Docker image build info
 PROJECT:=wordify
 BUILD_TAG?=v2.0
+sources = src
 
 ########################################################
 ## Local development
@@ -21,4 +22,11 @@ run:
 	docker run -d --name $(PROJECT)-${BUILD_TAG}-container -it --rm -p 4321:8501 $(PROJECT):${BUILD_TAG}
 
 stop:
-	docker stop $(PROJECT)-${BUILD_TAG}-container
+	docker stop $(PROJECT)-${BUILD_TAG}-container
+
+format:
+	isort $(sources)
+	black $(sources)
+
+lint:
+	flake8 $(sources)
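With these targets, `make format` rewrites code in place by running isort and then black over the new `sources` variable (here `src`), while `make lint` runs flake8 over the same tree and only reports problems, leaving files untouched.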
src/components.py
CHANGED
@@ -65,12 +65,16 @@ def form(df):
     pre_steps = st.multiselect(
         "Select pre-lemmatization processing steps (ordered)",
         options=steps_options,
-        default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
+        default=[
+            steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
+        ],
         format_func=lambda x: x.replace("_", " ").title(),
         help="Select the processing steps to apply before the text is lemmatized",
     )
 
-    lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
+    lammatization_options = list(
+        PreprocessingPipeline.lemmatization_component().keys()
+    )
     lemmatization_step = st.selectbox(
         "Select lemmatization",
         options=lammatization_options,
@@ -81,7 +85,10 @@ def form(df):
     post_steps = st.multiselect(
         "Select post-lemmatization processing steps (ordered)",
         options=steps_options,
-        default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
+        default=[
+            steps_options[i]
+            for i in PreprocessingConfigs.DEFAULT_POST.value
+        ],
         format_func=lambda x: x.replace("_", " ").title(),
         help="Select the processing steps to apply after the text is lemmatized",
     )
@@ -93,21 +100,31 @@ def form(df):
         start_time = time.time()
 
         # warnings about inputs
-        language_specific_warnings(pre_steps, post_steps, lemmatization_step, language)
+        language_specific_warnings(
+            pre_steps, post_steps, lemmatization_step, language
+        )
 
         # preprocess
         if not disable_preprocessing:
            with st.spinner("Step 1/4: Preprocessing text"):
-                pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
+                pipe = PreprocessingPipeline(
+                    language, pre_steps, lemmatization_step, post_steps
+                )
                 df = pipe.vaex_process(df, text_column)
         else:
-            with st.spinner("Step 1/4: Preprocessing has been disabled - doing nothing"):
-                df = df.rename(columns={text_column: ColumnNames.PROCESSED_TEXT.value})
+            with st.spinner(
+                "Step 1/4: Preprocessing has been disabled - doing nothing"
+            ):
+                df = df.rename(
+                    columns={text_column: ColumnNames.PROCESSED_TEXT.value}
+                )
                 time.sleep(1.2)
 
         # prepare input
         with st.spinner("Step 2/4: Preparing inputs"):
-            input_dict = input_transform(df[ColumnNames.PROCESSED_TEXT.value], df[label_column])
+            input_dict = input_transform(
+                df[ColumnNames.PROCESSED_TEXT.value], df[label_column]
+            )
 
         # wordify
         with st.spinner("Step 3/4: Wordifying"):
@@ -217,7 +234,13 @@ def how_it_works():
                 "Wine light cherry",
                 "Chardonnay wine oak buttery",
             ],
-            "Label": ["Italy", "United States", "United States", "Italy", "United States"],
+            "Label": [
+                "Italy",
+                "United States",
+                "United States",
+                "Italy",
+                "United States",
+            ],
         }
     )
 
@@ -268,7 +291,9 @@ def how_it_works():
        vectors of coefficients reported in table 3 (indicators that are not present in a run are listed as 0 here):
        """
    )
-    st.caption("Table 3: Coefficients for frequency of indicators in each of the four runs for US wines.")
+    st.caption(
+        "Table 3: Coefficients for frequency of indicators in each of the four runs for US wines."
+    )
    st.table(table3)
 
    st.markdown(
@@ -278,7 +303,9 @@ def how_it_works():
        that are positively and negatively correlated with the US wines.
        """
    )
-    st.caption(
+    st.caption(
+        "Table 4: Final set of indicators that are positively versus negatively correlated with US wines."
+    )
    st.table(table4)
    st.markdown(
        """
@@ -459,11 +486,15 @@ def analysis(outputs):
    )
 
    with st.expander("Vocabulary"):
-        st.markdown("The table below shows all candidate n-grams that Wordify considered")
+        st.markdown(
+            "The table below shows all candidate n-grams that Wordify considered"
+        )
        st.write(meta_data["vocabulary"])
 
    with st.expander("Labels"):
-        st.markdown("The table below summarizes the labels that your file contained")
+        st.markdown(
+            "The table below summarizes the labels that your file contained"
+        )
        st.write(meta_data["labels"])
 
    return subset_df
@@ -493,5 +524,6 @@ def language_specific_warnings(pre_steps, post_steps, lemmatization_step, language):
            "Chinese",
        ):
            st.info(
-                msg + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
+                msg
+                + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
            )
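The reflowed `default=` arguments make the index-based default pattern easier to read: the configured defaults are integer indices into `steps_options`. A minimal runnable sketch of that pattern, with hypothetical option names and index values standing in for the real contents of `src/configs.py` (which this diff does not show):

```python
from enum import Enum

# Hypothetical stand-ins for the real PreprocessingConfigs and steps_options
# defined in src/configs.py; the actual values are not part of this diff.
class PreprocessingConfigs(Enum):
    DEFAULT_PRE = [0, 2]
    DEFAULT_POST = [1]

steps_options = ["normalize_unicode", "remove_punctuation", "lowercase"]

# The same comprehension the form uses to turn config indices into defaults.
default_pre = [steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value]
default_post = [steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value]

print(default_pre)   # ['normalize_unicode', 'lowercase']
print(default_post)  # ['remove_punctuation']
```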
src/preprocessing.py
CHANGED
@@ -1,5 +1,3 @@
-import multiprocessing as mp
-import os
 import re
 import string
 from collections import OrderedDict
@@ -8,7 +6,6 @@ from typing import Callable, List, Optional, Union
 import spacy
 import vaex
 from pandas.core.frame import DataFrame
-from pandas.core.series import Series
 from textacy.preprocessing import make_pipeline, normalize, remove, replace
 
 from .configs import Languages
@@ -119,29 +116,6 @@ class PreprocessingPipeline:
 
         return df
 
-    # def __call__(self, series: Series) -> Series:
-    #     if self.pre:
-    #         series = series.map(self.pre)
-
-    #     if self.lemma:
-    #         total_steps = len(series) // 100
-    #         res = []
-    #         pbar = st.progress(0)
-    #         for i, doc in enumerate(
-    #             self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
-    #         ):
-    #             res.append(self.lemma(doc))
-
-    #             if i % total_steps == 0:
-    #                 pbar.progress(1)
-
-    #         series = pd.Series(res)
-
-    #     if self.post:
-    #         series = series.map(self.post)
-
-    #     return series
-
     @classmethod
     def make_pipe_component(cls, steps: Optional[List[str]], language: str) -> Callable:
         if not steps:
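The surviving `textacy.preprocessing` import composes the `normalize`/`remove`/`replace` helpers into a single text-to-text callable via `make_pipeline`. A rough sketch of that usage, assuming textacy's 0.11-era API (the concrete steps Wordify actually chains are built in `make_pipe_component`, of which this hunk shows only the signature):

```python
from textacy.preprocessing import make_pipeline, normalize, remove, replace

# Compose a few textacy preprocessing helpers into one callable.
pipe = make_pipeline(
    normalize.unicode,     # normalize unicode to a canonical form
    replace.urls,          # swap URLs for a placeholder token
    remove.punctuation,    # drop punctuation characters
    normalize.whitespace,  # collapse repeated whitespace
)

print(pipe("Café  menu:  https://example.com !!"))
```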
src/utils.py
CHANGED
@@ -7,8 +7,6 @@ from PIL import Image
 
 from .configs import ColumnNames, SupportedFiles
 
-# import altair as alt
-
 
 def get_col_indices(cols: List) -> Tuple[int, int]:
     """Ugly but works"""
@@ -52,81 +50,3 @@ def download_button(dataframe: DataFrame, name: str) -> None:
     b64 = base64.b64encode(csv.encode()).decode()
     href = f'<a href="data:file/csv;base64,{b64}" download="{name}.csv">Download</a>'
     st.write(href, unsafe_allow_html=True)
-
-
-# def plot_labels_prop(data: DataFrame, label_column: str):
-
-#     unique_value_limit = 100
-
-#     if data[label_column].nunique() > unique_value_limit:
-
-#         st.warning(
-#             f"""
-#             The column you selected has more than {unique_value_limit}.
-#             Are you sure it's the right column? If it is, please note that
-#             this will impact __Wordify__ performance.
-#             """
-#         )
-
-#         return
-
-#     source = (
-#         data[label_column]
-#         .value_counts()
-#         .reset_index()
-#         .rename(columns={"index": "Labels", label_column: "Counts"})
-#     )
-#     source["Props"] = source["Counts"] / source["Counts"].sum()
-#     source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
-
-#     bars = (
-#         alt.Chart(source)
-#         .mark_bar()
-#         .encode(
-#             x=alt.X("Labels:O", sort="-y"),
-#             y="Counts:Q",
-#         )
-#     )
-
-#     text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
-#         text="Proportions:O"
-#     )
-
-#     return (bars + text).properties(height=300)
-
-
-# def plot_nchars(data: DataFrame, text_column: str):
-#     source = data[text_column].str.len().to_frame()
-
-#     plot = (
-#         alt.Chart(source)
-#         .mark_bar()
-#         .encode(
-#             alt.X(
-#                 f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
-#             ),
-#             alt.Y("count()", axis=alt.Axis(title="")),
-#         )
-#     )
-
-#     return plot.properties(height=300)
-
-
-# def plot_score(data: DataFrame, label_col: str, label: str):
-
-#     source = (
-#         data.loc[data[label_col] == label]
-#         .sort_values("score", ascending=False)
-#         .head(100)
-#     )
-
-#     plot = (
-#         alt.Chart(source)
-#         .mark_bar()
-#         .encode(
-#             y=alt.Y("word:O", sort="-x"),
-#             x="score:Q",
-#         )
-#     )
-
-#     return plot.properties(height=max(30 * source.shape[0], 50))