Bor Hodošček committed on
Commit c8648fb · 1 parent: 2e51472

feat: counts for ca; stopword support

Files changed (1): app.py (+174, −27)
app.py CHANGED
@@ -14,6 +14,7 @@
  # "scattertext==0.2.2",
  # "scikit-learn==1.7.0",
  # "scipy==1.13.1",
+ # "seaborn==0.13.2",
  # "spacy==3.8.7",
  # "umap",
  # ]
@@ -143,20 +144,22 @@ def function_export():
  for idx in range(n_chunks):
  seg = " ".join(tokens[idx * chunk_size : (idx + 1) * chunk_size])
  label_idx = idx + 1 if idx + 1 < n_chunks else "last"
- records.append({
- "text": seg,
- "category": row["category"],
- "speech_type": row["speech_type"],
- "filename": row["filename"],
- "author": row["author"],
- "work": row["work"],
- "chunk_label": format_chunk_label(
- row["filename"],
- row["category"],
- row["speech_type"],
- label_idx,
- ),
- })
+ records.append(
+ {
+ "text": seg,
+ "category": row["category"],
+ "speech_type": row["speech_type"],
+ "filename": row["filename"],
+ "author": row["author"],
+ "work": row["work"],
+ "chunk_label": format_chunk_label(
+ row["filename"],
+ row["category"],
+ row["speech_type"],
+ label_idx,
+ ),
+ }
+ )
  return pd.DataFrame(records)

  @mo.cache
@@ -167,6 +170,7 @@ def function_export():
  min_df: float = 0.25,
  max_df: float = 0.8,
  max_features: int = 200,
+ stop_words: list[str] | None = None,
  ) -> tuple[
  st.Corpus,
  scipy.sparse.spmatrix,
@@ -174,10 +178,17 @@
  list[str],
  list[str],
  ]:
- """Fit TF-IDF + CountVectorizer & build a st.Corpus on already‐chunked data."""
+ """Fit TF-IDF + CountVectorizer & build a st.Corpus on already‐chunked data.
+ stop_words: list of tokens to filter out or None.
+ """

  # texts, categories, filenames are assumed already chunked upstream
- tfv = TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=max_features)
+ tfv = TfidfVectorizer(
+ min_df=min_df,
+ max_df=max_df,
+ max_features=max_features,
+ stop_words=stop_words,
+ )
  X_tfidf = tfv.fit_transform(texts)
  y_codes = pd.Categorical(
  categories, categories=pd.Categorical(categories).categories
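
Note: the new `stop_words` parameter is forwarded unchanged to scikit-learn's `TfidfVectorizer`, which accepts a list of tokens or `None` to disable filtering. A minimal sketch of that behaviour, using a toy corpus rather than the app's data:

    from sklearn.feature_extraction.text import TfidfVectorizer

    texts = ["the cat sat", "the dog sat", "the cat ran"]  # toy documents, not app data
    tfv = TfidfVectorizer(min_df=1, max_df=1.0, max_features=200, stop_words=["the"])
    X_tfidf = tfv.fit_transform(texts)
    print(tfv.get_feature_names_out())  # "the" is no longer in the vocabulary
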
@@ -232,7 +243,6 @@
  Ingest uploaded vs. default files into a DataFrame with columns:
  ['filename','raw_text','category' (if split),'author','work'].
  """
- import pandas as pd

  names, raws = _load_files(uploaded, defaults)
  records: list[dict] = []
@@ -263,6 +273,7 @@ def function_export():
  )

  df_p = pd.DataFrame(records)
+
  # infer author & work from the file's true stem (no extension, no "_advanced")
  def _extract_auth_work(fn: str) -> tuple[str, str]:
  base = Path(fn).stem.replace("_advanced", "")
@@ -602,7 +613,7 @@ def _(html):
  download_button = mo.download(
  data=html.encode(),
  filename="scattertext_analysis.html",
- label="可視化結果をダウンロード",
+ label="ScatterText可視化結果をダウンロード",
  )

  mo.md(f"{download_button}")
@@ -664,9 +675,74 @@ def _():
  return max_df_setting, max_features_setting, min_df_setting


+ @app.cell
+ def stopword_settings():
+ stop_filter = mo.ui.switch(label="Enable stop-word filtering?", value=False)
+ stop_filter
+ return (stop_filter,)
+
+
+ @app.cell
+ def _(stop_filter):
+ mo.stop(not stop_filter.value)
+
+ sw_source = mo.ui.dropdown(
+ options=["spaCy", "Custom", "Both"],
+ value="spaCy",
+ label="Stop-word source",
+ )
+
+ empty_df = pd.DataFrame({"stopword": []}, dtype=pd.StringDtype())
+ editor = mo.ui.data_editor(empty_df).form(
+ label="Your custom stop-words (+ Add Row)", bordered=True
+ )
+ sw_source
+ return editor, sw_source
+
+
+ @app.cell
+ def _(editor, sw_source):
+ mo.stop(sw_source.value == "spaCy")
+
+ editor
+ return
+
+
+ @app.cell
+ def make_stopword_list(editor, sw_source):
+ mo.stop(
+ editor.value is None
+ )
+
+ sw = set()
+ if sw_source.value in ("spaCy", "Both"):
+ from spacy.lang.en.stop_words import STOP_WORDS
+
+ sw.update(STOP_WORDS)
+ if sw_source.value in ("Custom", "Both"):
+ # editor.value is a pandas DataFrame
+ for w in editor.value["stopword"].dropna().astype(str):
+ token = w.strip()
+ if token:
+ sw.add(token)
+ if sw:
+ sw = list(sw)
+ return (sw,)
+
+
+ @app.cell
+ def _(editor, stop_filter, sw):
+ final_stopwords = None
+ if editor.value is not None:
+ print(stop_filter.value)
+ final_stopwords = sw
+ return (final_stopwords,)
+
+
  @app.cell
  def _(
  cats,
+ final_stopwords,
  fnames,
  max_df_setting,
  max_features_setting,
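
Note: the `make_stopword_list` cell merges spaCy's built-in English stop words with whatever rows were typed into the data editor. Stripped of the marimo UI, the merge amounts to roughly the following sketch, where `custom_rows` is a stand-in for `editor.value`:

    import pandas as pd
    from spacy.lang.en.stop_words import STOP_WORDS  # spaCy's built-in English list

    custom_rows = pd.DataFrame({"stopword": ["hath", "thee", None]})  # stand-in for editor.value

    sw: set[str] = set(STOP_WORDS)
    for w in custom_rows["stopword"].dropna().astype(str):
        token = w.strip()
        if token:
            sw.add(token)
    final_stopwords = sorted(sw) if sw else None  # TfidfVectorizer expects a list or None
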
@@ -681,6 +757,7 @@ def _(
  min_df=min_df_setting.value,
  max_df=max_df_setting.value,
  max_features=max_features_setting.value,
+ stop_words=final_stopwords,
  )
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer

@@ -701,7 +778,7 @@ def _(chunk_cats, tfidf_X):


  @app.cell
- def _(X_train, chunk_fnames, vectorizer):
+ def _(X_train, chunk_fnames, texts, vectorizer):
  tf_idf_formula = r"$\mathrm{tfidf}(t,d,D)=\mathrm{tf} (t,d)\cdot \mathrm{idf}(t,D)$"
  D_formula = r"|\{d:d\in D{\text{ and }}t\in d\}|"
  idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
@@ -732,7 +809,19 @@ def _(X_train, chunk_fnames, vectorizer):

  {mo.ui.table(X_df, selection=None)}
  """)
- return (X_df,)
+
+ # build raw‐counts table on identical vocab
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ cv = CountVectorizer(vocabulary=vectorizer.vocabulary_)
+ count_mat = cv.fit_transform(texts)
+ count_df = pd.DataFrame(
+ count_mat.toarray(),
+ index=chunk_fnames,
+ columns=vectorizer.get_feature_names_out(),
+ )
+
+ return X_df, count_df


  @app.cell
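
Note: passing `vocabulary=vectorizer.vocabulary_` to `CountVectorizer` skips vocabulary learning, so the raw-count matrix shares its columns with the TF-IDF matrix. A small sketch of the idea with toy documents rather than the app's data:

    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    texts = ["cat dog dog", "cat cat fish", "dog fish fish"]  # toy documents
    tfv = TfidfVectorizer().fit(texts)                # vocabulary is learned once here
    cv = CountVectorizer(vocabulary=tfv.vocabulary_)  # reuse the same term-to-column mapping
    counts = cv.fit_transform(texts)                  # raw integer counts, columns align with TF-IDF
    assert (cv.get_feature_names_out() == tfv.get_feature_names_out()).all()
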
@@ -792,7 +881,7 @@ def _():
  ## Correspondence Analysis / 対応分析

  対応分析(CA)のbiplotでは、主成分分析のbiplotと似ているような分析として、サンプルと素性の関係が観察できますが、いくつかの違いがあります。
- 対応分析を行うには、$\mathrm{tfidf}$行列をカテゴリカルな形式の分割表(contingency table)に変換する必要があります。次に、そのデータを連関表として解析します。この手法により、
+ 対応分析を行うには、$\mathrm{tfidf}$行列ではなく粗頻度行列をカテゴリカルな形式の分割表(contingency table)に変換する必要があります。次に、そのデータを連関表として解析します。この手法により、

  - 会話文と地の文カテゴリと特定単語出現パターンとの関連性を検討
  - サンプルのカテゴリと単語特徴量との離散的な関連として関係性を示すバイプロットを作成
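
Note: running CA on raw frequencies, as the revised sentence describes, comes down to summing the per-chunk count rows within each selected group to obtain a group-by-term contingency table. A hedged illustration with made-up numbers (`count_df` mirrors the app's variable name):

    import pandas as pd

    # toy per-chunk raw counts; in the app this is count_df (one row per chunk)
    count_df = pd.DataFrame(
        {"word_a": [3, 1, 0, 2], "word_b": [0, 2, 4, 1]},
        index=["chunk1", "chunk2", "chunk3", "chunk4"],
    )
    groups = pd.Series(["speech", "speech", "narrative", "narrative"], index=count_df.index)

    # group-by-term contingency table: summed raw counts per group
    contingency = count_df.groupby(groups).sum()
    print(contingency)
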
@@ -816,7 +905,55 @@ def _(X_df, authors, chunk_cats, speech_types, works):
  df_chk["work"] = works
  df_chk["speech_type"] = speech_types

- dims_all = ["author", "category", "work", "speech_type"]
+ # filter out collinear dimensions by Cramér’s V
+ from scipy.stats import chi2_contingency
+
+ def cramers_v(m: np.ndarray) -> float:
+ """Compute Cramér’s V from a contingency‐matrix."""
+ chi2 = chi2_contingency(m, correction=False)[0]
+ n = m.sum()
+ k = min(m.shape) - 1
+ return np.sqrt(chi2 / (n * k))
+
+ cols = ["author", "category", "work", "speech_type"]
+ vmat = pd.DataFrame(index=cols, columns=cols, dtype=float)
+ for i in cols:
+ for j in cols:
+ if i == j:
+ vmat.loc[i, j] = 1.0
+ else:
+ m = pd.crosstab(df_chk[i], df_chk[j]).values
+ vmat.loc[i, j] = cramers_v(m)
+
+ print(vmat)
+
+ # drop any dimension that is nearly collinear with another (V > .95)
+ high_thresh = 0.95
+ # only drop the later dimension in each tuple
+ drop = {
+ j for i, j in itertools.combinations(cols, 2) if vmat.loc[i, j] > high_thresh
+ }
+ # special‐case: in pure speech vs non-speech mode (category == speech_type),
+ # keep speech_type (the more descriptive) and drop category instead
+ if vmat.loc["category", "speech_type"] > high_thresh and chunk_cats == speech_types:
+ drop.discard("speech_type")
+ drop.add("category")
+ filtered_dims = [d for d in cols if d not in drop]
+ print(drop, filtered_dims)
+
+ # warn on moderate association .3 ≤ V ≤ .6
+ collinear_warns = []
+ for i in cols:
+ for j in cols:
+ if i < j and 0.3 <= vmat.loc[i, j] <= 0.6:
+ collinear_warns.append(
+ f"⚠️ `{i}` vs `{j}` moderate association (V={vmat.loc[i, j]:.2f})"
+ )
+ collinear_message = mo.md("## Warning\n" + "\n".join(collinear_warns)).callout(
+ kind="warning"
+ )
+
+ dims_all = filtered_dims  # start with our filtered labels
  options: list[str] = []
  # Enumerate all non-empty combinations; keep those yielding >2 groups
  for r in range(1, len(dims_all) + 1):
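
Note: the `cramers_v` helper computes the standard bias-uncorrected Cramér's V, $V = \sqrt{\chi^{2} / (n\,(\min(r, c) - 1))}$, where $\chi^{2}$ is Pearson's chi-squared statistic of the contingency table, $n$ its grand total, and $r$, $c$ its numbers of rows and columns; this matches `chi2 / (n * k)` with `k = min(m.shape) - 1` in the added code.
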
@@ -826,7 +963,9 @@ def _(X_df, authors, chunk_cats, speech_types, works):

  mo.stop(
  not options,
- "No category combination yielding more than two rows, so cannot perform CA.",
+ mo.md(
+ f"No category combination yielding more than two rows, so cannot perform CA.\n{collinear_message}"
+ ),
  )

  ca_group_by = mo.ui.dropdown(
@@ -840,8 +979,8 @@


  @app.cell
- def _(X_df, authors, ca_group_by, chunk_cats, speech_types, works):
- df = X_df.copy()
+ def _(authors, ca_group_by, chunk_cats, count_df, speech_types, works):
+ df = count_df.copy()
  df["author"] = authors
  df["category"] = chunk_cats
  df["work"] = works
@@ -950,7 +1089,11 @@ def _(X, chunk_fnames, dendrogram_height, distance_metrics, linkage_methods):
  distfun=distfun,
  linkagefun=linkagefun,
  )
- fig.update_layout(width=800, height=dendrogram_height.value, title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on samples",)
+ fig.update_layout(
+ width=800,
+ height=dendrogram_height.value,
+ title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on samples",
+ )

  mo.ui.plotly(fig)
  return distfun, ff, linkagefun
@@ -974,7 +1117,11 @@ def _(
  distfun=distfun,
  linkagefun=linkagefun,
  )
- fig_T.update_layout(width=800, height=dendrogram_height.value, title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on features")
+ fig_T.update_layout(
+ width=800,
+ height=dendrogram_height.value,
+ title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on features",
+ )

  mo.ui.plotly(fig_T)
  return
 