Bor Hodošček committed on
Commit c8648fb · 1 parent: 2e51472

feat: counts for ca; stopword support

Files changed (1): app.py (+174, −27)
app.py CHANGED
@@ -14,6 +14,7 @@
  # "scattertext==0.2.2",
  # "scikit-learn==1.7.0",
  # "scipy==1.13.1",
+ # "seaborn==0.13.2",
  # "spacy==3.8.7",
  # "umap",
  # ]
@@ -143,20 +144,22 @@ def function_export():
  for idx in range(n_chunks):
  seg = " ".join(tokens[idx * chunk_size : (idx + 1) * chunk_size])
  label_idx = idx + 1 if idx + 1 < n_chunks else "last"
- records.append({
- "text": seg,
- "category": row["category"],
- "speech_type": row["speech_type"],
- "filename": row["filename"],
- "author": row["author"],
- "work": row["work"],
- "chunk_label": format_chunk_label(
- row["filename"],
- row["category"],
- row["speech_type"],
- label_idx,
- ),
- })
+ records.append(
+ {
+ "text": seg,
+ "category": row["category"],
+ "speech_type": row["speech_type"],
+ "filename": row["filename"],
+ "author": row["author"],
+ "work": row["work"],
+ "chunk_label": format_chunk_label(
+ row["filename"],
+ row["category"],
+ row["speech_type"],
+ label_idx,
+ ),
+ }
+ )
  return pd.DataFrame(records)

  @mo.cache
@@ -167,6 +170,7 @@ def function_export():
  min_df: float = 0.25,
  max_df: float = 0.8,
  max_features: int = 200,
+ stop_words: list[str] | None = None,
  ) -> tuple[
  st.Corpus,
  scipy.sparse.spmatrix,
@@ -174,10 +178,17 @@
  list[str],
  list[str],
  ]:
- """Fit TF-IDF + CountVectorizer & build a st.Corpus on already‐chunked data."""
+ """Fit TF-IDF + CountVectorizer & build a st.Corpus on already‐chunked data.
+ stop_words: list of tokens to filter out or None.
+ """

  # texts, categories, filenames are assumed already chunked upstream
- tfv = TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=max_features)
+ tfv = TfidfVectorizer(
+ min_df=min_df,
+ max_df=max_df,
+ max_features=max_features,
+ stop_words=stop_words,
+ )
  X_tfidf = tfv.fit_transform(texts)
  y_codes = pd.Categorical(
  categories, categories=pd.Categorical(categories).categories
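
Note: the new `stop_words` parameter is forwarded unchanged to scikit-learn's `TfidfVectorizer`, which accepts a list of tokens or `None` to disable filtering. A minimal sketch of that behaviour, using a toy corpus rather than the app's data:

    from sklearn.feature_extraction.text import TfidfVectorizer

    texts = ["the cat sat", "the dog sat", "the cat ran"]  # toy documents, not app data
    tfv = TfidfVectorizer(min_df=1, max_df=1.0, max_features=200, stop_words=["the"])
    X_tfidf = tfv.fit_transform(texts)
    print(tfv.get_feature_names_out())  # "the" is no longer in the vocabulary
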
@@ -232,7 +243,6 @@
  Ingest uploaded vs. default files into a DataFrame with columns:
  ['filename','raw_text','category' (if split),'author','work'].
  """
- import pandas as pd

  names, raws = _load_files(uploaded, defaults)
  records: list[dict] = []
@@ -263,6 +273,7 @@ def function_export():
  )

  df_p = pd.DataFrame(records)
+
  # infer author & work from the file's true stem (no extension, no "_advanced")
  def _extract_auth_work(fn: str) -> tuple[str, str]:
  base = Path(fn).stem.replace("_advanced", "")
@@ -602,7 +613,7 @@ def _(html):
  download_button = mo.download(
  data=html.encode(),
  filename="scattertext_analysis.html",
- label="可視化結果をダウンロード",
+ label="ScatterText可視化結果をダウンロード",
  )

  mo.md(f"{download_button}")
@@ -664,9 +675,74 @@ def _():
  return max_df_setting, max_features_setting, min_df_setting


+ @app.cell
+ def stopword_settings():
+ stop_filter = mo.ui.switch(label="Enable stop-word filtering?", value=False)
+ stop_filter
+ return (stop_filter,)
+
+
+ @app.cell
+ def _(stop_filter):
+ mo.stop(not stop_filter.value)
+
+ sw_source = mo.ui.dropdown(
+ options=["spaCy", "Custom", "Both"],
+ value="spaCy",
+ label="Stop-word source",
+ )
+
+ empty_df = pd.DataFrame({"stopword": []}, dtype=pd.StringDtype())
+ editor = mo.ui.data_editor(empty_df).form(
+ label="Your custom stop-words (+ Add Row)", bordered=True
+ )
+ sw_source
+ return editor, sw_source
+
+
+ @app.cell
+ def _(editor, sw_source):
+ mo.stop(sw_source.value == "spaCy")
+
+ editor
+ return
+
+
+ @app.cell
+ def make_stopword_list(editor, sw_source):
+ mo.stop(
+ editor.value is None
+ )
+
+ sw = set()
+ if sw_source.value in ("spaCy", "Both"):
+ from spacy.lang.en.stop_words import STOP_WORDS
+
+ sw.update(STOP_WORDS)
+ if sw_source.value in ("Custom", "Both"):
+ # editor.value is a pandas DataFrame
+ for w in editor.value["stopword"].dropna().astype(str):
+ token = w.strip()
+ if token:
+ sw.add(token)
+ if sw:
+ sw = list(sw)
+ return (sw,)
+
+
+ @app.cell
+ def _(editor, stop_filter, sw):
+ final_stopwords = None
+ if editor.value is not None:
+ print(stop_filter.value)
+ final_stopwords = sw
+ return (final_stopwords,)
+
+
  @app.cell
  def _(
  cats,
+ final_stopwords,
  fnames,
  max_df_setting,
  max_features_setting,
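
Note: the `make_stopword_list` cell merges spaCy's built-in English stop words with whatever rows were typed into the data editor. Stripped of the marimo UI, the merge amounts to roughly the following sketch, where `custom_rows` is a stand-in for `editor.value`:

    import pandas as pd
    from spacy.lang.en.stop_words import STOP_WORDS  # spaCy's built-in English list

    custom_rows = pd.DataFrame({"stopword": ["hath", "thee", None]})  # stand-in for editor.value

    sw: set[str] = set(STOP_WORDS)
    for w in custom_rows["stopword"].dropna().astype(str):
        token = w.strip()
        if token:
            sw.add(token)
    final_stopwords = sorted(sw) if sw else None  # TfidfVectorizer expects a list or None
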
@@ -681,6 +757,7 @@ def _(
  min_df=min_df_setting.value,
  max_df=max_df_setting.value,
  max_features=max_features_setting.value,
+ stop_words=final_stopwords,
  )
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer

@@ -701,7 +778,7 @@ def _(chunk_cats, tfidf_X):


  @app.cell
- def _(X_train, chunk_fnames, vectorizer):
+ def _(X_train, chunk_fnames, texts, vectorizer):
  tf_idf_formula = r"$\mathrm{tfidf}(t,d,D)=\mathrm{tf} (t,d)\cdot \mathrm{idf}(t,D)$"
  D_formula = r"|\{d:d\in D{\text{ and }}t\in d\}|"
  idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
@@ -732,7 +809,19 @@ def _(X_train, chunk_fnames, vectorizer):

  {mo.ui.table(X_df, selection=None)}
  """)
- return (X_df,)
+
+ # build raw‐counts table on identical vocab
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ cv = CountVectorizer(vocabulary=vectorizer.vocabulary_)
+ count_mat = cv.fit_transform(texts)
+ count_df = pd.DataFrame(
+ count_mat.toarray(),
+ index=chunk_fnames,
+ columns=vectorizer.get_feature_names_out(),
+ )
+
+ return X_df, count_df


  @app.cell
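
Note: passing `vocabulary=vectorizer.vocabulary_` to `CountVectorizer` skips vocabulary learning, so the raw-count matrix shares its columns with the TF-IDF matrix. A small sketch of the idea with toy documents rather than the app's data:

    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    texts = ["cat dog dog", "cat cat fish", "dog fish fish"]  # toy documents
    tfv = TfidfVectorizer().fit(texts)                # vocabulary is learned once here
    cv = CountVectorizer(vocabulary=tfv.vocabulary_)  # reuse the same term-to-column mapping
    counts = cv.fit_transform(texts)                  # raw integer counts, columns align with TF-IDF
    assert (cv.get_feature_names_out() == tfv.get_feature_names_out()).all()
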
@@ -792,7 +881,7 @@ def _():
  ## Correspondence Analysis / 対応分析

  対応分析(CA)のbiplotでは、主成分分析のbiplotと似ているような分析として、サンプルと素性の関係が観察できますが、いくつかの違いがあります。
- 対応分析を行うには、$\mathrm{tfidf}$行列をカテゴリカルな形式の分割表(contingency table)に変換する必要があります。次に、そのデータを連関表として解析します。この手法により、
+ 対応分析を行うには、$\mathrm{tfidf}$行列ではなく粗頻度行列をカテゴリカルな形式の分割表(contingency table)に変換する必要があります。次に、そのデータを連関表として解析します。この手法により、

  - 会話文と地の文カテゴリと特定単語出現パターンとの関連性を検討
  - サンプルのカテゴリと単語特徴量との離散的な関連として関係性を示すバイプロットを作成
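
Note: running CA on raw frequencies, as the revised sentence describes, comes down to summing the per-chunk count rows within each selected group to obtain a group-by-term contingency table. A hedged illustration with made-up numbers (`count_df` mirrors the app's variable name):

    import pandas as pd

    # toy per-chunk raw counts; in the app this is count_df (one row per chunk)
    count_df = pd.DataFrame(
        {"word_a": [3, 1, 0, 2], "word_b": [0, 2, 4, 1]},
        index=["chunk1", "chunk2", "chunk3", "chunk4"],
    )
    groups = pd.Series(["speech", "speech", "narrative", "narrative"], index=count_df.index)

    # group-by-term contingency table: summed raw counts per group
    contingency = count_df.groupby(groups).sum()
    print(contingency)
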
@@ -816,7 +905,55 @@ def _(X_df, authors, chunk_cats, speech_types, works):
  df_chk["work"] = works
  df_chk["speech_type"] = speech_types

- dims_all = ["author", "category", "work", "speech_type"]
+ # filter out collinear dimensions by Cramér’s V
+ from scipy.stats import chi2_contingency
+
+ def cramers_v(m: np.ndarray) -> float:
+ """Compute Cramér’s V from a contingency‐matrix."""
+ chi2 = chi2_contingency(m, correction=False)[0]
+ n = m.sum()
+ k = min(m.shape) - 1
+ return np.sqrt(chi2 / (n * k))
+
+ cols = ["author", "category", "work", "speech_type"]
+ vmat = pd.DataFrame(index=cols, columns=cols, dtype=float)
+ for i in cols:
+ for j in cols:
+ if i == j:
+ vmat.loc[i, j] = 1.0
+ else:
+ m = pd.crosstab(df_chk[i], df_chk[j]).values
+ vmat.loc[i, j] = cramers_v(m)
+
+ print(vmat)
+
+ # drop any dimension that is nearly collinear with another (V > .95)
+ high_thresh = 0.95
+ # only drop the later dimension in each tuple
+ drop = {
+ j for i, j in itertools.combinations(cols, 2) if vmat.loc[i, j] > high_thresh
+ }
+ # special‐case: in pure speech vs non-speech mode (category == speech_type),
+ # keep speech_type (the more descriptive) and drop category instead
+ if vmat.loc["category", "speech_type"] > high_thresh and chunk_cats == speech_types:
+ drop.discard("speech_type")
+ drop.add("category")
+ filtered_dims = [d for d in cols if d not in drop]
+ print(drop, filtered_dims)
+
+ # warn on moderate association .3 ≤ V ≤ .6
+ collinear_warns = []
+ for i in cols:
+ for j in cols:
+ if i < j and 0.3 <= vmat.loc[i, j] <= 0.6:
+ collinear_warns.append(
+ f"⚠️ `{i}` vs `{j}` moderate association (V={vmat.loc[i, j]:.2f})"
+ )
+ collinear_message = mo.md("## Warning\n" + "\n".join(collinear_warns)).callout(
+ kind="warning"
+ )
+
+ dims_all = filtered_dims  # start with our filtered labels
  options: list[str] = []
  # Enumerate all non-empty combinations; keep those yielding >2 groups
  for r in range(1, len(dims_all) + 1):
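
Note: the `cramers_v` helper computes the standard bias-uncorrected Cramér's V, $V = \sqrt{\chi^{2} / (n\,(\min(r, c) - 1))}$, where $\chi^{2}$ is Pearson's chi-squared statistic of the contingency table, $n$ its grand total, and $r$, $c$ its numbers of rows and columns; this matches `chi2 / (n * k)` with `k = min(m.shape) - 1` in the added code.
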
@@ -826,7 +963,9 @@ def _(X_df, authors, chunk_cats, speech_types, works):

  mo.stop(
  not options,
- "No category combination yielding more than two rows, so cannot perform CA.",
+ mo.md(
+ f"No category combination yielding more than two rows, so cannot perform CA.\n{collinear_message}"
+ ),
  )

  ca_group_by = mo.ui.dropdown(
@@ -840,8 +979,8 @@


  @app.cell
- def _(X_df, authors, ca_group_by, chunk_cats, speech_types, works):
- df = X_df.copy()
+ def _(authors, ca_group_by, chunk_cats, count_df, speech_types, works):
+ df = count_df.copy()
  df["author"] = authors
  df["category"] = chunk_cats
  df["work"] = works
@@ -950,7 +1089,11 @@ def _(X, chunk_fnames, dendrogram_height, distance_metrics, linkage_methods):
  distfun=distfun,
  linkagefun=linkagefun,
  )
- fig.update_layout(width=800, height=dendrogram_height.value, title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on samples",)
+ fig.update_layout(
+ width=800,
+ height=dendrogram_height.value,
+ title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on samples",
+ )

  mo.ui.plotly(fig)
  return distfun, ff, linkagefun
@@ -974,7 +1117,11 @@ def _(
  distfun=distfun,
  linkagefun=linkagefun,
  )
- fig_T.update_layout(width=800, height=dendrogram_height.value, title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on features")
+ fig_T.update_layout(
+ width=800,
+ height=dendrogram_height.value,
+ title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on features",
+ )

  mo.ui.plotly(fig_T)
  return
 