Bor Hodošček committed · commit c8648fb · parent 2e51472

feat: counts for ca; stopword support
app.py CHANGED
@@ -14,6 +14,7 @@
 # "scattertext==0.2.2",
 # "scikit-learn==1.7.0",
 # "scipy==1.13.1",
+# "seaborn==0.13.2",
 # "spacy==3.8.7",
 # "umap",
 # ]
@@ -143,20 +144,22 @@ def function_export():
     for idx in range(n_chunks):
         seg = " ".join(tokens[idx * chunk_size : (idx + 1) * chunk_size])
         label_idx = idx + 1 if idx + 1 < n_chunks else "last"
-        records.append(
+        records.append(
+            {
+                "text": seg,
+                "category": row["category"],
+                "speech_type": row["speech_type"],
+                "filename": row["filename"],
+                "author": row["author"],
+                "work": row["work"],
+                "chunk_label": format_chunk_label(
+                    row["filename"],
+                    row["category"],
+                    row["speech_type"],
+                    label_idx,
+                ),
+            }
+        )
     return pd.DataFrame(records)

 @mo.cache
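Each record built here is one fixed-size slice of a document's token stream, and the final slice is labelled "last" so a short trailing chunk stays identifiable. A standalone toy illustration of that slicing and labelling (the ceiling-division line for `n_chunks` is an assumption for the sketch, not taken from the app):

```python
tokens = list("abcdefgh")  # stand-in for a tokenized document
chunk_size = 3
n_chunks = -(-len(tokens) // chunk_size)  # ceiling division -> 3 chunks (assumed here)

for idx in range(n_chunks):
    seg = " ".join(tokens[idx * chunk_size : (idx + 1) * chunk_size])
    label_idx = idx + 1 if idx + 1 < n_chunks else "last"
    print(label_idx, seg)
# 1 a b c
# 2 d e f
# last g h
```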
@@ -167,6 +170,7 @@ def function_export():
     min_df: float = 0.25,
     max_df: float = 0.8,
     max_features: int = 200,
+    stop_words: list[str] | None = None,
 ) -> tuple[
     st.Corpus,
     scipy.sparse.spmatrix,
@@ -174,10 +178,17 @@ def function_export():
     list[str],
     list[str],
 ]:
-    """Fit TF-IDF + CountVectorizer & build a st.Corpus on already‐chunked data."""
+    """Fit TF-IDF + CountVectorizer & build a st.Corpus on already‐chunked data.
+    stop_words: list of tokens to filter out or None.
+    """

     # texts, categories, filenames are assumed already chunked upstream
-    tfv = TfidfVectorizer(
+    tfv = TfidfVectorizer(
+        min_df=min_df,
+        max_df=max_df,
+        max_features=max_features,
+        stop_words=stop_words,
+    )
     X_tfidf = tfv.fit_transform(texts)
     y_codes = pd.Categorical(
         categories, categories=pd.Categorical(categories).categories
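The new `stop_words` argument is passed straight through to scikit-learn's `TfidfVectorizer` alongside the existing pruning options. A minimal sketch, with made-up toy texts rather than the app's pre-chunked data, of how those settings interact:

```python
# Illustration only, not part of the commit: how min_df/max_df/max_features/stop_words
# jointly shape the learned vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ["the cat sat", "the dog sat", "the cat ran"]
tfv = TfidfVectorizer(
    min_df=1,            # keep terms appearing in at least 1 document
    max_df=0.8,          # drop terms in more than 80% of documents ("the")
    max_features=200,    # cap the vocabulary size
    stop_words=["sat"],  # explicit stop-word list, like the new stop_words argument
)
X = tfv.fit_transform(texts)
print(tfv.get_feature_names_out())  # ['cat' 'dog' 'ran']
```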
@@ -232,7 +243,6 @@ def function_export():
     Ingest uploaded vs. default files into a DataFrame with columns:
     ['filename','raw_text','category' (if split),'author','work'].
     """
-    import pandas as pd

     names, raws = _load_files(uploaded, defaults)
     records: list[dict] = []
@@ -263,6 +273,7 @@ def function_export():
     )

     df_p = pd.DataFrame(records)
+
     # infer author & work from the file's true stem (no extension, no "_advanced")
     def _extract_auth_work(fn: str) -> tuple[str, str]:
         base = Path(fn).stem.replace("_advanced", "")
@@ -602,7 +613,7 @@ def _(html):
     download_button = mo.download(
         data=html.encode(),
         filename="scattertext_analysis.html",
-        label="可視化結果をダウンロード",
+        label="ScatterText可視化結果をダウンロード",
     )

     mo.md(f"{download_button}")
@@ -664,9 +675,74 @@ def _():
     return max_df_setting, max_features_setting, min_df_setting


+@app.cell
+def stopword_settings():
+    stop_filter = mo.ui.switch(label="Enable stop-word filtering?", value=False)
+    stop_filter
+    return (stop_filter,)
+
+
+@app.cell
+def _(stop_filter):
+    mo.stop(not stop_filter.value)
+
+    sw_source = mo.ui.dropdown(
+        options=["spaCy", "Custom", "Both"],
+        value="spaCy",
+        label="Stop-word source",
+    )
+
+    empty_df = pd.DataFrame({"stopword": []}, dtype=pd.StringDtype())
+    editor = mo.ui.data_editor(empty_df).form(
+        label="Your custom stop-words (+ Add Row)", bordered=True
+    )
+    sw_source
+    return editor, sw_source
+
+
+@app.cell
+def _(editor, sw_source):
+    mo.stop(sw_source.value == "spaCy")
+
+    editor
+    return
+
+
+@app.cell
+def make_stopword_list(editor, sw_source):
+    mo.stop(
+        editor.value is None
+    )
+
+    sw = set()
+    if sw_source.value in ("spaCy", "Both"):
+        from spacy.lang.en.stop_words import STOP_WORDS
+
+        sw.update(STOP_WORDS)
+    if sw_source.value in ("Custom", "Both"):
+        # editor.value is a pandas DataFrame
+        for w in editor.value["stopword"].dropna().astype(str):
+            token = w.strip()
+            if token:
+                sw.add(token)
+    if sw:
+        sw = list(sw)
+    return (sw,)
+
+
+@app.cell
+def _(editor, stop_filter, sw):
+    final_stopwords = None
+    if editor.value is not None:
+        print(stop_filter.value)
+        final_stopwords = sw
+    return (final_stopwords,)
+
+
 @app.cell
 def _(
     cats,
+    final_stopwords,
     fnames,
     max_df_setting,
     max_features_setting,
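Stripped of the marimo UI plumbing, the stop-word assembly above amounts to merging spaCy's built-in English list with user-supplied tokens. A rough standalone sketch (the helper name and inputs are illustrative, not from the app):

```python
# Illustrative helper, not part of the commit: merge spaCy's English stop words
# with custom tokens, returning a list suitable for TfidfVectorizer(stop_words=...).
from spacy.lang.en.stop_words import STOP_WORDS


def build_stopwords(custom: list[str], use_spacy: bool = True) -> list[str] | None:
    sw: set[str] = set(STOP_WORDS) if use_spacy else set()
    sw.update(t.strip() for t in custom if t.strip())
    return list(sw) if sw else None


print(len(build_stopwords(["ebook", "gutenberg"])))  # spaCy list plus two custom tokens
```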
@@ -681,6 +757,7 @@ def _(
         min_df=min_df_setting.value,
         max_df=max_df_setting.value,
         max_features=max_features_setting.value,
+        stop_words=final_stopwords,
     )
     return chunk_cats, chunk_fnames, tfidf_X, vectorizer
778 |
|
779 |
|
780 |
@app.cell
|
781 |
+
def _(X_train, chunk_fnames, texts, vectorizer):
|
782 |
tf_idf_formula = r"$\mathrm{tfidf}(t,d,D)=\mathrm{tf} (t,d)\cdot \mathrm{idf}(t,D)$"
|
783 |
D_formula = r"|\{d:d\in D{\text{ and }}t\in d\}|"
|
784 |
idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
|
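The formulas this cell renders are the textbook definitions, $\mathrm{tfidf}(t,d,D)=\mathrm{tf}(t,d)\cdot\mathrm{idf}(t,D)$ with $\mathrm{idf}(t,D)=\log\frac{N}{|\{d:d\in D\text{ and }t\in d\}|}$. Note that scikit-learn's `TfidfVectorizer` defaults to the smoothed variant $\ln\frac{1+N}{1+|\{d:t\in d\}|}+1$ followed by L2 row normalization, so the displayed matrix will not match a hand computation of the plain formula. A small sketch contrasting the two (toy counts only):

```python
import numpy as np

N = 4     # total number of documents
df_t = 2  # documents containing term t

plain_idf = np.log(N / df_t)                     # textbook idf shown in the notebook
sklearn_idf = np.log((1 + N) / (1 + df_t)) + 1   # scikit-learn default (smooth_idf=True)
print(plain_idf, sklearn_idf)  # 0.693... 1.510...
```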
@@ -732,7 +809,19 @@ def _(X_train, chunk_fnames, vectorizer):

     {mo.ui.table(X_df, selection=None)}
     """)
+
+    # build raw‐counts table on identical vocab
+    from sklearn.feature_extraction.text import CountVectorizer
+
+    cv = CountVectorizer(vocabulary=vectorizer.vocabulary_)
+    count_mat = cv.fit_transform(texts)
+    count_df = pd.DataFrame(
+        count_mat.toarray(),
+        index=chunk_fnames,
+        columns=vectorizer.get_feature_names_out(),
+    )
+
+    return X_df, count_df

 @app.cell
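Fixing `vocabulary=vectorizer.vocabulary_` guarantees the raw-count matrix has exactly the same columns, in the same order, as the TF-IDF matrix, so the two tables line up row for row and column for column. A toy illustration (example texts only, not the app's data):

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

texts = ["cat cat dog", "dog bird"]
tfv = TfidfVectorizer().fit(texts)

# Reuse the fitted vocabulary so count columns align with the TF-IDF features.
cv = CountVectorizer(vocabulary=tfv.vocabulary_)
counts = cv.fit_transform(texts).toarray()
print(tfv.get_feature_names_out())  # ['bird' 'cat' 'dog']
print(counts)                       # [[0 2 1]
                                    #  [1 0 1]]
```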
@@ -792,7 +881,7 @@ def _():
     ## Correspondence Analysis / 対応分析

     対応分析(CA)のbiplotでは、主成分分析のbiplotと似ているような分析として、サンプルと素性の関係が観察できますが、いくつかの違いがあります。
-    対応分析を行うには、$\mathrm{tfidf}
+    対応分析を行うには、$\mathrm{tfidf}$行列ではなく粗頻度行列をカテゴリカルな形式の分割表(contingency table)に変換する必要があります。次に、そのデータを連関表として解析します。この手法により、

     - 会話文と地の文カテゴリと特定単語出現パターンとの関連性を検討
     - サンプルのカテゴリと単語特徴量との離散的な関連として関係性を示すバイプロットを作成
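The added sentence makes the point that CA operates on a raw-frequency contingency table rather than the tf-idf matrix: rows are categories (or combinations of author, work, and speech type) and cells hold summed word counts. A hedged sketch of one way to build such a table with pandas (toy frame; the column names only mirror the ones used in the app):

```python
import pandas as pd

# Toy chunk-by-term count table plus a grouping column, mimicking count_df + "category".
df = pd.DataFrame(
    {"word_a": [3, 1, 0, 2], "word_b": [0, 2, 4, 1],
     "category": ["speech", "speech", "narrative", "narrative"]}
)

# Collapse chunks into a category x term contingency table for correspondence analysis.
contingency = df.groupby("category")[["word_a", "word_b"]].sum()
print(contingency)
#            word_a  word_b
# category
# narrative       2       5
# speech          4       2
```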
@@ -816,7 +905,55 @@ def _(X_df, authors, chunk_cats, speech_types, works):
     df_chk["work"] = works
     df_chk["speech_type"] = speech_types

+    # filter out collinear dimensions by Cramér’s V
+    from scipy.stats import chi2_contingency
+
+    def cramers_v(m: np.ndarray) -> float:
+        """Compute Cramér’s V from a contingency‐matrix."""
+        chi2 = chi2_contingency(m, correction=False)[0]
+        n = m.sum()
+        k = min(m.shape) - 1
+        return np.sqrt(chi2 / (n * k))
+
+    cols = ["author", "category", "work", "speech_type"]
+    vmat = pd.DataFrame(index=cols, columns=cols, dtype=float)
+    for i in cols:
+        for j in cols:
+            if i == j:
+                vmat.loc[i, j] = 1.0
+            else:
+                m = pd.crosstab(df_chk[i], df_chk[j]).values
+                vmat.loc[i, j] = cramers_v(m)
+
+    print(vmat)
+
+    # drop any dimension that is nearly collinear with another (V > .95)
+    high_thresh = 0.95
+    # only drop the later dimension in each tuple
+    drop = {
+        j for i, j in itertools.combinations(cols, 2) if vmat.loc[i, j] > high_thresh
+    }
+    # special‐case: in pure speech vs non-speech mode (category == speech_type),
+    # keep speech_type (the more descriptive) and drop category instead
+    if vmat.loc["category", "speech_type"] > high_thresh and chunk_cats == speech_types:
+        drop.discard("speech_type")
+        drop.add("category")
+    filtered_dims = [d for d in cols if d not in drop]
+    print(drop, filtered_dims)
+
+    # warn on moderate association .3 ≤ V ≤ .6
+    collinear_warns = []
+    for i in cols:
+        for j in cols:
+            if i < j and 0.3 <= vmat.loc[i, j] <= 0.6:
+                collinear_warns.append(
+                    f"⚠️ `{i}` vs `{j}` moderate association (V={vmat.loc[i, j]:.2f})"
+                )
+    collinear_message = mo.md("## Warning\n" + "\n".join(collinear_warns)).callout(
+        kind="warning"
+    )
+
+    dims_all = filtered_dims  # start with our filtered labels
     options: list[str] = []
     # Enumerate all non-empty combinations; keep those yielding >2 groups
     for r in range(1, len(dims_all) + 1):
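Cramér's V here is $\sqrt{\chi^2 / (n\,(\min(r,c)-1))}$ for an r×c contingency table, so a pair of labels that are perfectly redundant gets V = 1 while unrelated labels get V near 0. A quick check of the same function on two tiny made-up tables:

```python
import numpy as np
from scipy.stats import chi2_contingency


def cramers_v(m: np.ndarray) -> float:
    chi2 = chi2_contingency(m, correction=False)[0]
    return np.sqrt(chi2 / (m.sum() * (min(m.shape) - 1)))


perfect = np.array([[10, 0], [0, 10]])    # the two labels always co-occur -> V = 1.0
independent = np.array([[5, 5], [5, 5]])  # no association at all -> V = 0.0
print(cramers_v(perfect), cramers_v(independent))  # 1.0 0.0
```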
@@ -826,7 +963,9 @@ def _(X_df, authors, chunk_cats, speech_types, works):

     mo.stop(
         not options,
+        mo.md(
+            f"No category combination yielding more than two rows, so cannot perform CA.\n{collinear_message}"
+        ),
     )

     ca_group_by = mo.ui.dropdown(
@@ -840,8 +979,8 @@ def _(X_df, authors, chunk_cats, speech_types, works):


 @app.cell
-def _(
-    df =
+def _(authors, ca_group_by, chunk_cats, count_df, speech_types, works):
+    df = count_df.copy()
     df["author"] = authors
     df["category"] = chunk_cats
     df["work"] = works
@@ -950,7 +1089,11 @@ def _(X, chunk_fnames, dendrogram_height, distance_metrics, linkage_methods):
         distfun=distfun,
         linkagefun=linkagefun,
     )
-    fig.update_layout(
+    fig.update_layout(
+        width=800,
+        height=dendrogram_height.value,
+        title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on samples",
+    )

     mo.ui.plotly(fig)
     return distfun, ff, linkagefun
@@ -974,7 +1117,11 @@ def _(
         distfun=distfun,
         linkagefun=linkagefun,
     )
-    fig_T.update_layout(
+    fig_T.update_layout(
+        width=800,
+        height=dendrogram_height.value,
+        title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on features",
+    )

     mo.ui.plotly(fig_T)
     return