Spaces:

bor
/

scattertext-en-novels

Running

App Files Community

Bor Hodošček committed on Jul 1

Commit

d479194

unverified ·

1 Parent(s): e8b9f04

fix: actually use tfvectorizer; feat: improved plots

Browse files

Files changed (3) hide show

app.py +103 -90
pyproject.toml +1 -0
uv.lock +15 -0

app.py CHANGED Viewed

@@ -8,6 +8,7 @@
 #     "numpy==2.2.6",
 #     "pandas==2.3.0",
 #     "pca==2.10.0",
 #     "pyarrow",
 #     "scattertext==0.2.2",
 #     "scikit-learn==1.7.0",
@@ -34,13 +35,12 @@ with app.setup:
     import numpy as np
     import random
     import re
-    import altair as alt
     import scattertext as st
     from pca import pca
     import matplotlib.pyplot as plt
     from pathlib import Path
     from types import SimpleNamespace
-    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
     RANDOM_SEED = 42
     random.seed(RANDOM_SEED)
@@ -92,20 +92,28 @@ def function_export():
         chunk_size: int = 2000,
     ) -> tuple[list[str], list[str], list[str]]:
         """Chunk each text into segments of chunk_size tokens, preserving category and filename."""
-        chunked_texts = []
-        chunked_cats = []
-        chunked_fnames = []
         for text, cat, fname in zip(texts, categories, filenames):
             tokens = text.split()
             for i in range(0, len(tokens), chunk_size):
                 chunk = " ".join(tokens[i : i + chunk_size])
                 chunked_texts.append(chunk)
                 chunked_cats.append(cat)
-                chunked_fnames.append(f"{fname}#{i // chunk_size + 1}")
-            else:  # chunk_size is larger then the text
                 chunked_texts.append(chunk)
                 chunked_cats.append(cat)
-                chunked_fnames.append(f"{fname}#leftover")
         return chunked_texts, chunked_cats, chunked_fnames
     @mo.cache
@@ -113,7 +121,9 @@ def function_export():
         texts: list[str],
         categories: list[str],
         filenames: list[str],
-        max_features: int = 100,
     ) -> tuple[
         st.Corpus,
         scipy.sparse.spmatrix,
@@ -124,15 +134,14 @@ def function_export():
         """Fit TF-IDF + CountVectorizer & build a st.Corpus on already‐chunked data."""
         # texts, categories, filenames are assumed already chunked upstream
-        tfv = TfidfVectorizer()
         X_tfidf = tfv.fit_transform(texts)
-        cv = CountVectorizer(vocabulary=tfv.vocabulary_, max_features=max_features)
         y_codes = pd.Categorical(
             categories, categories=pd.Categorical(categories).categories
         ).codes
         scikit_corpus = st.CorpusFromScikit(
-            X=cv.fit_transform(texts),
             y=y_codes,
             feature_vocabulary=tfv.vocabulary_,
             category_names=list(pd.Categorical(categories).categories),
@@ -570,6 +579,9 @@ def _():
     # 探索的検証
     クラスター分析のデンドログラムと主成分分析（biplot）による探索的検証を行います。
     """
     )
     return
@@ -608,108 +620,109 @@ def pca_biplot(chunk_cats, tfidf_X, vectorizer):
         row_labels=chunk_cats,
     )
-    model.biplot(legend=True, figsize=(12, 8), PC=[0, 1])
     # labels=np.array(chunk_fnames)
     topfeat = results["topfeat"]
     mo.vstack(
         [
             mo.md(
-                "## [PCA](https://erdogant.github.io/pca/pages/html/index.html)によるbiplot"
             ),
             mo.mpl.interactive(plt.gcf()),
             topfeat,
         ]
     )
-    return (X,)
 @app.cell
-def _(X, chunk_fnames):
-    import scipy.cluster.hierarchy as sch
-    import scipy.spatial.distance as ssd
-    # 2. compute linkage on cosine distance
-    dists = ssd.pdist(X, metric="cosine")
-    Z = sch.linkage(dists, method="average")
-    # 3. get a truncated dendrogram (no_plot=True just to get data)
-    # Use our filenames for leaf labels
-    den = sch.dendrogram(
-        Z,
-        no_plot=True,
-        truncate_mode="level",
-        p=3,
-        labels=chunk_fnames,
-    )
-    # 4. helpers to reshape the SciPy output
-    def get_leaf_loc(den):
-        # leaves are spaced every 10 units in icoord
-        mn = int(np.min(den["icoord"]))
-        mx = int(np.max(den["icoord"]) + 1)
-        return list(range(mn, mx, 10))
-    def get_df_coord(den):
-        cols_x = ["xk1", "xk2", "xk3", "xk4"]
-        cols_y = ["yk1", "yk2", "yk3", "yk4"]
-        dfx = pd.DataFrame(den["icoord"], columns=cols_x)
-        dfy = pd.DataFrame(den["dcoord"], columns=cols_y)
-        return dfx.merge(dfy, left_index=True, right_index=True)
-    source = get_df_coord(den)
-    # 5. build the U‐shapes with three mark_rule layers
-    base = alt.Chart(source)
-    shoulder = base.mark_rule().encode(
-        alt.X("xk2:Q", title=""),
-        alt.X2("xk3:Q"),
-        alt.Y("yk2:Q", title=""),
-    )
-    arm1 = base.mark_rule().encode(
-        alt.X("xk1:Q"),
-        alt.Y("yk1:Q"),
-        alt.Y2("yk2:Q"),
     )
-    arm2 = base.mark_rule().encode(
-        alt.X("xk3:Q"),
-        alt.Y("yk3:Q"),
-        alt.Y2("yk4:Q"),
     )
-    chart_den = shoulder + arm1 + arm2
-    # 6. leaf labels
-    # den["ivl"] now contains the correct filenames for each displayed leaf
-    df_text = pd.DataFrame({
-     "labels": den["ivl"],
-     "x":      get_leaf_loc(den),
-    })
-    chart_text = (
-        alt.Chart(df_text)
-        .mark_text(dy=0, angle=0, align="center")
-        .encode(
-            x=alt.X("x:Q", axis={"grid": False, "title": "Leaf nodes"}),
-            text=alt.Text("labels:N"),
-        )
     )
-    # 7. combine and configure
-    final = (
-        (chart_den & chart_text)
-        .resolve_scale(x="shared")
-        .configure(padding={"top": 10, "left": 10})
-        .configure_concat(spacing=0)
-        .configure_axis(labels=False, ticks=False, grid=False)
-        .properties(title="Hierarchical Clustering Dendrogram")
     )
-    # 8. hand off to Marimo
-    mo.ui.altair_chart(final)
     return
 @app.cell
-def _():
     return

 #     "numpy==2.2.6",
 #     "pandas==2.3.0",
 #     "pca==2.10.0",
+#     "plotly==6.2.0",
 #     "pyarrow",
 #     "scattertext==0.2.2",
 #     "scikit-learn==1.7.0",
     import numpy as np
     import random
     import re
     import scattertext as st
     from pca import pca
     import matplotlib.pyplot as plt
     from pathlib import Path
     from types import SimpleNamespace
+    from sklearn.feature_extraction.text import TfidfVectorizer
     RANDOM_SEED = 42
     random.seed(RANDOM_SEED)
         chunk_size: int = 2000,
     ) -> tuple[list[str], list[str], list[str]]:
         """Chunk each text into segments of chunk_size tokens, preserving category and filename."""
+        chunked_texts: list[str] = []
+        chunked_cats: list[str] = []
+        chunked_fnames: list[str] = []
         for text, cat, fname in zip(texts, categories, filenames):
+            # compute a short “Initials‐Initials” label for author‐title
+            stem = Path(fname).stem.replace("_advanced", "")
+            author, title = stem.split("_", 1)
+            def _initials(s: str) -> str:
+                return "".join(tok[0].upper() for tok in s.split("-"))
+            short_label = f"{_initials(author)}-{_initials(title)}"
             tokens = text.split()
             for i in range(0, len(tokens), chunk_size):
                 chunk = " ".join(tokens[i : i + chunk_size])
                 chunked_texts.append(chunk)
                 chunked_cats.append(cat)
+                chunked_fnames.append(f"{short_label}({cat})#{i // chunk_size + 1}")
+            else:
                 chunked_texts.append(chunk)
                 chunked_cats.append(cat)
+                chunked_fnames.append(f"{short_label}({cat})#last")
         return chunked_texts, chunked_cats, chunked_fnames
     @mo.cache
         texts: list[str],
         categories: list[str],
         filenames: list[str],
+        min_df: float = 0.25,
+        max_df: float = 0.8,
+        max_features: int = 200,
     ) -> tuple[
         st.Corpus,
         scipy.sparse.spmatrix,
         """Fit TF-IDF + CountVectorizer & build a st.Corpus on already‐chunked data."""
         # texts, categories, filenames are assumed already chunked upstream
+        tfv = TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=max_features)
         X_tfidf = tfv.fit_transform(texts)
         y_codes = pd.Categorical(
             categories, categories=pd.Categorical(categories).categories
         ).codes
         scikit_corpus = st.CorpusFromScikit(
+            X=tfv.fit_transform(texts),
             y=y_codes,
             feature_vocabulary=tfv.vocabulary_,
             category_names=list(pd.Categorical(categories).categories),
     # 探索的検証
     クラスター分析のデンドログラムと主成分分析（biplot）による探索的検証を行います。
+    Biplotでは各テキストが丸点で、各素性が矢印で同じプロットで示されています。
+    矢印の色が赤の場合、その素性の負荷量絶対値が高く、色が青いの場合は、どの主成分で高くないという意味になります。
     """
     )
     return
         row_labels=chunk_cats,
     )
+    three_switch = mo.ui.switch(label="3D")
+    three_switch
+    return X, model, results, three_switch
+@app.cell
+def _(model, results, three_switch):
+    model.biplot(
+        legend=True,
+        figsize=(12, 8),
+        fontsize=12,
+        s=20,
+        PC=[0, 1, 2] if three_switch.value else [0, 1],
+    )
     # labels=np.array(chunk_fnames)
     topfeat = results["topfeat"]
     mo.vstack(
         [
             mo.md(
+                """## [PCA](https://erdogant.github.io/pca/pages/html/index.html)のbiplot
+                """
             ),
             mo.mpl.interactive(plt.gcf()),
             topfeat,
         ]
     )
+    return
 @app.cell
+def _():
+    linkage_methods = mo.ui.dropdown(
+        options=[
+            "ward",
+            "single",
+            "complete",
+            "average",
+        ],
+        value="ward",
+        label="Linkage Method",
     )
+    distance_metrics = mo.ui.dropdown(
+        options=["cosine", "euclidean", "cityblock", "hamming"],
+        value="cosine",
+        label="Distance Metric",
     )
+    dendrogram_height = mo.ui.number(
+        label="Dendrogram plot height (increase if hard to see labels)",
+        start=800,
+        value=1600,
     )
+    d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")
+    mo.md(f"""
+    ## 階層的クラスタリング
+    {d_stack}
+    {dendrogram_height}
+    """)
+    return dendrogram_height, distance_metrics, linkage_methods
+@app.cell
+def _(X, chunk_fnames, dendrogram_height, distance_metrics, linkage_methods):
+    import plotly.figure_factory as ff
+    import scipy.spatial.distance as ssd
+    import scipy.cluster.hierarchy as sch
+    distfun = lambda M: ssd.pdist(M, metric=distance_metrics.value)
+    linkagefun = lambda D: sch.linkage(D, method=linkage_methods.value)
+    fig = ff.create_dendrogram(
+        X,
+        orientation="left",
+        labels=list(chunk_fnames),
+        distfun=distfun,
+        linkagefun=linkagefun,
     )
+    fig.update_layout(width=800, height=dendrogram_height.value)
+    mo.ui.plotly(fig)
     return
 @app.cell
+def sample_selector(fnames):
+    text_selector = mo.ui.dropdown(
+        options=list(sorted(fnames)),
+        value=fnames[0] if fnames else None,
+        label="Select a sample to view",
+    )
+    text_selector
+    return (text_selector,)
+@app.cell
+def sample_viewer(fnames, text_selector, texts):
+    mo.stop(not text_selector.value, "No sample selected.")
+    selected_idx = fnames.index(text_selector.value)
+    mo.md(f"### {text_selector.value}\n\n{texts[selected_idx]}")
     return

pyproject.toml CHANGED Viewed

@@ -11,6 +11,7 @@ dependencies = [
     "numpy>=2.2.6",
     "pandas>=2.3.0",
     "pca>=2.10.0",
     "pyarrow>=20.0.0",
     "scattertext==0.2.2",
     "scikit-learn==1.7.0",

     "numpy>=2.2.6",
     "pandas>=2.3.0",
     "pca>=2.10.0",
+    "plotly>=6.2.0",
     "pyarrow>=20.0.0",
     "scattertext==0.2.2",
     "scikit-learn==1.7.0",

uv.lock CHANGED Viewed

@@ -914,6 +914,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/67/32/32dc030cfa91ca0fc52baebbba2e009bb001122a1daa8b6a79ad830b38d3/pillow-11.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681", size = 2417234, upload-time = "2025-04-12T17:49:08.399Z" },
 ]
 [[package]]
 name = "preshed"
 version = "3.0.10"
@@ -1262,6 +1275,7 @@ dependencies = [
     { name = "numpy" },
     { name = "pandas" },
     { name = "pca" },
     { name = "pyarrow" },
     { name = "scattertext" },
     { name = "scikit-learn" },
@@ -1278,6 +1292,7 @@ requires-dist = [
     { name = "numpy", specifier = ">=2.2.6" },
     { name = "pandas", specifier = ">=2.3.0" },
     { name = "pca", specifier = ">=2.10.0" },
     { name = "pyarrow", specifier = ">=20.0.0" },
     { name = "scattertext", specifier = "==0.2.2" },
     { name = "scikit-learn", specifier = "==1.7.0" },

     { url = "https://files.pythonhosted.org/packages/67/32/32dc030cfa91ca0fc52baebbba2e009bb001122a1daa8b6a79ad830b38d3/pillow-11.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681", size = 2417234, upload-time = "2025-04-12T17:49:08.399Z" },
 ]
+[[package]]
+name = "plotly"
+version = "6.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "narwhals" },
+    { name = "packaging" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6e/5c/0efc297df362b88b74957a230af61cd6929f531f72f48063e8408702ffba/plotly-6.2.0.tar.gz", hash = "sha256:9dfa23c328000f16c928beb68927444c1ab9eae837d1fe648dbcda5360c7953d", size = 6801941, upload-time = "2025-06-26T16:20:45.765Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ed/20/f2b7ac96a91cc5f70d81320adad24cc41bf52013508d649b1481db225780/plotly-6.2.0-py3-none-any.whl", hash = "sha256:32c444d4c940887219cb80738317040363deefdfee4f354498cc0b6dab8978bd", size = 9635469, upload-time = "2025-06-26T16:20:40.76Z" },
+]
 [[package]]
 name = "preshed"
 version = "3.0.10"
     { name = "numpy" },
     { name = "pandas" },
     { name = "pca" },
+    { name = "plotly" },
     { name = "pyarrow" },
     { name = "scattertext" },
     { name = "scikit-learn" },
     { name = "numpy", specifier = ">=2.2.6" },
     { name = "pandas", specifier = ">=2.3.0" },
     { name = "pca", specifier = ">=2.10.0" },
+    { name = "plotly", specifier = ">=6.2.0" },
     { name = "pyarrow", specifier = ">=20.0.0" },
     { name = "scattertext", specifier = "==0.2.2" },
     { name = "scikit-learn", specifier = "==1.7.0" },