Bor Hodošček committed
Commit e960361 · unverified · 1 Parent(s): 1a0a2cf

feat: inital commit of working demo
Dockerfile CHANGED
@@ -1,5 +1,5 @@
-FROM python:3.12
-COPY --from=ghcr.io/astral-sh/uv:0.4.20 /uv /bin/uv
+FROM python:3.12-slim
+COPY --from=ghcr.io/astral-sh/uv:0.7.13 /uv /bin/uv
 
 RUN useradd -m -u 1000 user
 ENV PATH="/home/user/.local/bin:$PATH"
@@ -7,13 +7,16 @@ ENV UV_SYSTEM_PYTHON=1
 
 WORKDIR /app
 
-COPY --chown=user ./requirements.txt requirements.txt
-RUN uv pip install -r requirements.txt
+RUN apt update && apt install -y curl unzip gcc g++
+RUN mkdir -p /app && chown -R user:user /app
+
+COPY --chown=user ./pyproject.toml ./uv.lock ./app.py ./*.txt /app
+
+RUN chmod -R u+w /app
 
-COPY --chown=user . /app
-RUN mkdir -p /app/__marimo__ && \
-    chown -R user:user /app && \
-    chmod -R 755 /app
 USER user
 
-CMD ["marimo", "run", "app.py", "--include-code", "--host", "0.0.0.0", "--port", "7860"]
+RUN curl -O https://clrd.ninjal.ac.jp/unidic_archive/2308/unidic-novel-v202308.zip && unzip -x unidic-novel-v202308.zip
+RUN uv sync
+
+CMD ["uv", "run", "marimo", "run", "app.py", "--no-sandbox", "--include-code", "--host", "0.0.0.0", "--port", "7860"]
Natsume_S_Bocchan.txt ADDED
The diff for this file is too large to render. See raw diff
 
Natsume_S_Kokoro.txt ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Scattertext Ja Novels
+title: scattertext-ja-novels
 emoji: 🍃
 colorFrom: indigo
 colorTo: purple
Unno_J_Chikyuuyousa.txt ADDED
The diff for this file is too large to render. See raw diff
 
Unno_J_Kaseiheidan.txt ADDED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,469 +1,656 @@
-import marimo
-
-__generated_with = "0.9.2"
-app = marimo.App()


-@app.cell
-def __():
     import marimo as mo

-    mo.md("# Welcome to marimo! 🌊🍃")
-    return (mo,)


 @app.cell
-def __(mo):
-    slider = mo.ui.slider(1, 22)
-    return (slider,)


 @app.cell
-def __(mo, slider):
     mo.md(
-        f"""
-        marimo is a **reactive** Python notebook.

-        This means that unlike traditional notebooks, marimo notebooks **run
-        automatically** when you modify them or
-        interact with UI elements, like this slider: {slider}.

-        {"##" + "🍃" * slider.value}
-        """
-    )
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: disabling automatic execution": mo.md(
-                rf"""
-                marimo lets you disable automatic execution: just go into the
-                notebook settings and set

-                "Runtime > On Cell Change" to "lazy".

-                When the runtime is lazy, after running a cell, marimo marks its
-                descendants as stale instead of automatically running them. The
-                lazy runtime puts you in control over when cells are run, while
-                still giving guarantees about the notebook state.
-                """
-            )
-        }
     )
     return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        Tip: This is a tutorial notebook. You can create your own notebooks
-        by entering `marimo edit` at the command line.
-        """
-    ).callout()
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 1. Reactive execution

-        A marimo notebook is made up of small blocks of Python code called
-        cells.

-        marimo reads your cells and models the dependencies among them: whenever
-        a cell that defines a global variable is run, marimo
-        **automatically runs** all cells that reference that variable.

-        Reactivity keeps your program state and outputs in sync with your code,
-        making for a dynamic programming environment that prevents bugs before they
-        happen.
-        """
-    )
-    return


-@app.cell(hide_code=True)
-def __(changed, mo):
-    (
-        mo.md(
-            f"""
-            **✨ Nice!** The value of `changed` is now {changed}.

-            When you updated the value of the variable `changed`, marimo
-            **reacted** by running this cell automatically, because this cell
-            references the global variable `changed`.

-            Reactivity ensures that your notebook state is always
-            consistent, which is crucial for doing good science; it's also what
-            enables marimo notebooks to double as tools and apps.
-            """
-        )
-        if changed
-        else mo.md(
-            """
-            **🌊 See it in action.** In the next cell, change the value of the
-            variable `changed` to `True`, then click the run button.
-            """
         )
     )
-    return


 @app.cell
-def __():
-    changed = False
-    return (changed,)


-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: execution order": (
-                """
-                The order of cells on the page has no bearing on
-                the order in which cells are executed: marimo knows that a cell
-                reading a variable must run after the cell that defines it. This
-                frees you to organize your code in the way that makes the most
-                sense for you.
-                """
-            )
-        }
-    )
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        **Global names must be unique.** To enable reactivity, marimo imposes a
-        constraint on how names appear in cells: no two cells may define the same
-        variable.
-        """
-    )
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: encapsulation": (
-                """
-                By encapsulating logic in functions, classes, or Python modules,
-                you can minimize the number of global variables in your notebook.
-                """
-            )
-        }
-    )
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.accordion(
-        {
-            "Tip: private variables": (
-                """
-                Variables prefixed with an underscore are "private" to a cell, so
-                they can be defined by multiple cells.
-                """
             )
-        }
-    )
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 2. UI elements

-        Cells can output interactive UI elements. Interacting with a UI
-        element **automatically triggers notebook execution**: when
-        you interact with a UI element, its value is sent back to Python, and
-        every cell that references that element is re-run.

-        marimo provides a library of UI elements to choose from under
-        `marimo.ui`.
-        """
-    )
-    return


-@app.cell
-def __(mo):
-    mo.md("""**🌊 Some UI elements.** Try interacting with the below elements.""")
-    return


 @app.cell
-def __(mo):
-    icon = mo.ui.dropdown(["🍃", "🌊", "✨"], value="🍃")
-    return (icon,)


 @app.cell
-def __(icon, mo):
-    repetitions = mo.ui.slider(1, 16, label=f"number of {icon.value}: ")
-    return (repetitions,)


 @app.cell
-def __(icon, repetitions):
-    icon, repetitions
     return


 @app.cell
-def __(icon, mo, repetitions):
-    mo.md("# " + icon.value * repetitions.value)
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 3. marimo is just Python

-        marimo cells parse Python (and only Python), and marimo notebooks are
-        stored as pure Python files — outputs are _not_ included. There's no
-        magical syntax.

-        The Python files generated by marimo are:

-        - easily versioned with git, yielding minimal diffs
-        - legible for both humans and machines
-        - formattable using your tool of choice,
-        - usable as Python scripts, with UI elements taking their default
-          values, and
-        - importable by other modules (more on that in the future).
-        """
     )
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 4. Running notebooks as apps

-        marimo notebooks can double as apps. Click the app window icon in the
-        bottom-right to see this notebook in "app view."

-        Serve a notebook as an app with `marimo run` at the command-line.
-        Of course, you can use marimo just to level-up your
-        notebooking, without ever making apps.
-        """
     )
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 5. The `marimo` command-line tool

-        **Creating and editing notebooks.** Use

-        ```
-        marimo edit
-        ```

-        in a terminal to start the marimo notebook server. From here
-        you can create a new notebook or edit existing ones.

-        **Running as apps.** Use

-        ```
-        marimo run notebook.py
-        ```

-        to start a webserver that serves your notebook as an app in read-only mode,
-        with code cells hidden.

-        **Convert a Jupyter notebook.** Convert a Jupyter notebook to a marimo
-        notebook using `marimo convert`:

-        ```
-        marimo convert your_notebook.ipynb > your_app.py
-        ```

-        **Tutorials.** marimo comes packaged with tutorials:

-        - `dataflow`: more on marimo's automatic execution
-        - `ui`: how to use UI elements
-        - `markdown`: how to write markdown, with interpolated values and
-          LaTeX
-        - `plots`: how plotting works in marimo
-        - `sql`: how to use SQL
-        - `layout`: layout elements in marimo
-        - `fileformat`: how marimo's file format works
-        - `markdown-format`: for using `.md` files in marimo
-        - `for-jupyter-users`: if you are coming from Jupyter

-        Start a tutorial with `marimo tutorial`; for example,

-        ```
-        marimo tutorial dataflow
-        ```

-        In addition to tutorials, we have examples in our
-        [our GitHub repo](https://www.github.com/marimo-team/marimo/tree/main/examples).
-        """
     )
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        ## 6. The marimo editor

-        Here are some tips to help you get started with the marimo editor.
-        """
     )
-    return


-@app.cell
-def __(mo, tips):
-    mo.accordion(tips)
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.md("""## Finally, a fun fact""")
-    return


-@app.cell(hide_code=True)
-def __(mo):
-    mo.md(
-        """
-        The name "marimo" is a reference to a type of algae that, under
-        the right conditions, clumps together to form a small sphere
-        called a "marimo moss ball". Made of just strands of algae, these
-        beloved assemblages are greater than the sum of their parts.
-        """
     )
-    return


-@app.cell(hide_code=True)
-def __():
-    tips = {
-        "Saving": (
-            """
-            **Saving**

-            - _Name_ your app using the box at the top of the screen, or
-              with `Ctrl/Cmd+s`. You can also create a named app at the
-              command line, e.g., `marimo edit app_name.py`.

-            - _Save_ by clicking the save icon on the bottom right, or by
-              inputting `Ctrl/Cmd+s`. By default marimo is configured
-              to autosave.
-            """
-        ),
-        "Running": (
-            """
-            1. _Run a cell_ by clicking the play ( ▷ ) button on the top
-               right of a cell, or by inputting `Ctrl/Cmd+Enter`.

-            2. _Run a stale cell_ by clicking the yellow run button on the
-               right of the cell, or by inputting `Ctrl/Cmd+Enter`. A cell is
-               stale when its code has been modified but not run.

-            3. _Run all stale cells_ by clicking the play ( ▷ ) button on
-               the bottom right of the screen, or input `Ctrl/Cmd+Shift+r`.
-            """
-        ),
-        "Console Output": (
-            """
-            Console output (e.g., `print()` statements) is shown below a
-            cell.
-            """
-        ),
-        "Creating, Moving, and Deleting Cells": (
-            """
-            1. _Create_ a new cell above or below a given one by clicking
-               the plus button to the left of the cell, which appears on
-               mouse hover.

-            2. _Move_ a cell up or down by dragging on the handle to the
-               right of the cell, which appears on mouse hover.

-            3. _Delete_ a cell by clicking the trash bin icon. Bring it
-               back by clicking the undo button on the bottom right of the
-               screen, or with `Ctrl/Cmd+Shift+z`.
-            """
-        ),
-        "Disabling Automatic Execution": (
-            """
-            Via the notebook settings (gear icon) or footer panel, you
-            can disable automatic execution. This is helpful when
-            working with expensive notebooks or notebooks that have
-            side-effects like database transactions.
-            """
-        ),
-        "Disabling Cells": (
-            """
-            You can disable a cell via the cell context menu.
-            marimo will never run a disabled cell or any cells that depend on it.
-            This can help prevent accidental execution of expensive computations
-            when editing a notebook.
-            """
-        ),
-        "Code Folding": (
-            """
-            You can collapse or fold the code in a cell by clicking the arrow
-            icons in the line number column to the left, or by using keyboard
-            shortcuts.

-            Use the command palette (`Ctrl/Cmd+k`) or a keyboard shortcut to
-            quickly fold or unfold all cells.
-            """
-        ),
-        "Code Formatting": (
-            """
-            If you have [ruff](https://github.com/astral-sh/ruff) installed,
-            you can format a cell with the keyboard shortcut `Ctrl/Cmd+b`.
-            """
-        ),
-        "Command Palette": (
-            """
-            Use `Ctrl/Cmd+k` to open the command palette.
-            """
-        ),
-        "Keyboard Shortcuts": (
-            """
-            Open the notebook menu (top-right) or input `Ctrl/Cmd+Shift+h` to
-            view a list of all keyboard shortcuts.
-            """
-        ),
-        "Configuration": (
-            """
-            Configure the editor by clicking the gears icon near the top-right
-            of the screen.
-            """
-        ),
-    }
-    return (tips,)


 if __name__ == "__main__":
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "altair==5.5.0",
+#     "fugashi-plus",
+#     "marimo",
+#     "numpy==2.2.6",
+#     "pandas==2.3.0",
+#     "pyarrow",
+#     "scattertext==0.2.2",
+#     "scikit-learn==1.7.0",
+#     "scipy==1.13.1",
+# ]
+# ///

+import marimo

+__generated_with = "0.13.15"
+app = marimo.App(width="full", app_title="Scattertext on Japanese novels")

+with app.setup:
     import marimo as mo
+    import itertools
+    import fugashi
+    import pandas as pd
+    import scipy
+    import numpy as np
+    import random
+    import scattertext as st
+    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

+    RANDOM_SEED = 42
+    random.seed(RANDOM_SEED)
+    np.random.seed(RANDOM_SEED)


 @app.cell
+def function_export():
+    @mo.cache
+    def parse_texts(texts: list[str]) -> list[str]:
+        """Tokenize a list of raw strings via fugashi (MeCab)."""
+
+        tagger = fugashi.Tagger("-Owakati -d ./unidic-novel -r ./unidic-novel/dicrc")
+        return [tagger.parse(txt).strip() for txt in texts]
+
+    @mo.cache
+    def build_corpus_cached(
+        texts: list[str],
+        categories: list[str],
+    ) -> st.Corpus:
+        """Build or reuse cached Scattertext corpus."""
+
+        df = pd.DataFrame({"text": texts, "category": categories})
+        return (
+            st.CorpusFromPandas(
+                df,
+                category_col="category",
+                text_col="text",
+                nlp=st.whitespace_nlp_with_sentences,
+            )
+            .build()
+            .get_unigram_corpus()
+            .compact(st.AssociationCompactor(2000))
+        )
+
+    @mo.cache
+    def chunk_texts(
+        texts: list[str],
+        categories: list[str],
+        filenames: list[str],
+        chunk_size: int = 2000,
+    ) -> tuple[list[str], list[str], list[str]]:
+        """Chunk each text into segments of chunk_size tokens, preserving category and filename."""
+        chunked_texts = []
+        chunked_cats = []
+        chunked_fnames = []
+        for text, cat, fname in zip(texts, categories, filenames):
+            tokens = text.split()
+            for i in range(0, len(tokens), chunk_size):
+                chunk = " ".join(tokens[i : i + chunk_size])
+                chunked_texts.append(chunk)
+                chunked_cats.append(cat)
+                chunked_fnames.append(f"{fname}#{i // chunk_size + 1}")
+        return chunked_texts, chunked_cats, chunked_fnames
+
+    @mo.cache
+    def train_scikit_cached(
+        texts: list[str], categories: list[str], filenames: list[str]
+    ) -> tuple[
+        st.Corpus,
+        scipy.sparse.spmatrix,
+        TfidfVectorizer,
+        list[str],
+        list[str],
+    ]:
+        """Fit TF-IDF + CountVectorizer & build a st.Corpus on chunked data."""
+
+        chunk_texts_out, chunk_cats, chunk_fnames = chunk_texts(
+            texts, categories, filenames
+        )
+        tfv = TfidfVectorizer()
+        X_tfidf = tfv.fit_transform(chunk_texts_out)
+        cv = CountVectorizer(vocabulary=tfv.vocabulary_, max_features=100)
+        y_codes = pd.Categorical(
+            chunk_cats, categories=pd.Categorical(chunk_cats).categories
+        ).codes
+
+        scikit_corpus = st.CorpusFromScikit(
+            X=cv.fit_transform(chunk_texts_out),
+            y=y_codes,
+            feature_vocabulary=tfv.vocabulary_,
+            category_names=list(pd.Categorical(chunk_cats).categories),
+            raw_texts=chunk_texts_out,
+        ).build()
+
+        return (
+            scikit_corpus,
+            X_tfidf,
+            tfv,
+            chunk_cats,
+            chunk_fnames,
+        )
+
+    return build_corpus_cached, chunk_texts, parse_texts, train_scikit_cached


 @app.cell
+def intro():
     mo.md(
+        r"""
+        # Scattertext on Japanese novels / 近代文学作品のScattertext可視化

+        ## 概要

+        2つの異なるカテゴリのテキストファイル群をアップロードし、その差異をScattertextで可視化します。
+        オプショナルで機械学習モデルで分類をし、モデルの分類精度とモデルが識別に用いるトークンも確認できます。

+        ## ワークフロー

+        1. テキストファイルをアップロード(デフォルトを使う場合はそのままSubmitしてください)
+        2. データ内容を確認・修正
+        3. チャンク&サンプリング設定
+        4. Scattertextによる可視化
+        5. (任意)分類モデルによる性能検証

+        > 単語分割には、[近現代口語小説UniDic](https://clrd.ninjal.ac.jp/unidic/download_all.html#unidic_novel)を使用しています。異なる時代やジャンルのテキストには不向きです。
+        """
     )
     return


+@app.cell
+def data_settings():
+    # 1) Create each widget
+    category_name = mo.ui.text(
+        label="カテゴリ名(例:著者名・時代区分など)",
+        placeholder="例:時代・性別・著者など",
+        value="著者",
+        full_width=True,
+    )
+    label_a = mo.ui.text(
+        label="Aのラベル", placeholder="例:夏目漱石", value="夏目漱石", full_width=True
+    )
+    files_a = mo.ui.file(
+        label="Aのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
+    )
+    label_b = mo.ui.text(
+        label="Bのラベル", placeholder="例:海野十三", value="海野十三", full_width=True
+    )
+    files_b = mo.ui.file(
+        label="Bのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
+    )

+    tpl = r"""
+    ## データと分析の設定

+    ※ 初期では夏目漱石と海野十三から各2作品をサンプルコーパスにしています。設定を変更せずSubmitすると、サンプルコーパスでの分析になります。ファイルをアップロードする場合は忘れずにカテゴリとラベルも変更してください。

+    ※ ファイルはプレインテキスト形式必須(.txt, UTF-8エンコーディング)

+    {category_name}

+    ### グループA
+    {label_a}
+    {files_a}

+    ### グループB
+    {label_b}
+    {files_b}
+    """

+    data_form = (
+        mo.md(tpl)
+        .batch(
+            # info_box=info_box,
+            category_name=category_name,
+            label_a=label_a,
+            files_a=files_a,
+            label_b=label_b,
+            files_b=files_b,
         )
+        .form(show_clear_button=True, bordered=True)
     )
+    data_form
+    return data_form, label_a, label_b


 @app.cell
+def data_check(data_form, parse_texts):
+    mo.stop(data_form.value is None)

+    from pathlib import Path

+    validation_messages: list[str] = []

+    if data_form.value["label_a"] == data_form.value["label_b"]:
+        print("a")
+        validation_messages.append(
+            "⚠️ **警告**: グループAとBのラベルが同じです。AとBは異なるラベルを設定してください。\n"
+        )

+    if not data_form.value["files_a"] and not data_form.value["files_b"]:
+        validation_messages.append(
+            "ℹ️ ファイルが未指定のため、デフォルトサンプルを使用しています。\n"
+        )

+    try:
+        # Group A: either uploaded files or default (坊っちゃん + こころ)
+        if data_form.value["files_a"]:
+            category_a_texts = (
+                f.contents.decode("utf-8") for f in data_form.value["files_a"]
             )
+            category_a_names = (f.name for f in data_form.value["files_a"])
+        else:
+            natsume_1 = Path("Natsume_S_Bocchan.txt").read_text(encoding="utf-8")
+            natsume_2 = Path("Natsume_S_Kokoro.txt").read_text(encoding="utf-8")
+            category_a_texts = [natsume_1, natsume_2]
+            category_a_names = ["Natsume_S_Bocchan.txt", "Natsume_S_Kokoro.txt"]
+
+        # Group B: either uploaded files or default (地球要塞 + 火星兵団)
+        if data_form.value["files_b"]:
+            category_b_texts = (
+                f.contents.decode("utf-8") for f in data_form.value["files_b"]
+            )
+            category_b_names = (f.name for f in data_form.value["files_b"])
+        else:
+            unno_1 = Path("Unno_J_Chikyuuyousa.txt").read_text(encoding="utf-8")
+            unno_2 = Path("Unno_J_Kaseiheidan.txt").read_text(encoding="utf-8")
+
+            category_b_texts = [unno_1, unno_2]
+            category_b_names = ["Unno_J_Chikyuuyousa.txt", "Unno_J_Kaseiheidan.txt"]
+
+        data = pd.DataFrame(
+            {
+                "category": (
+                    [data_form.value["label_a"]]
+                    * (
+                        len(data_form.value["files_a"])
+                        if data_form.value["files_a"]
+                        else 2
+                    )
+                )
+                + (
+                    [data_form.value["label_b"]]
+                    * (
+                        len(data_form.value["files_b"])
+                        if data_form.value["files_b"]
+                        else 2
+                    )
+                ),
+                "filename": itertools.chain(category_a_names, category_b_names),
+                "text": itertools.chain(category_a_texts, category_b_texts),
+            }
+        )

+        with mo.status.spinner("コーパスを形態素解析中..."):
+            data["text"] = parse_texts(list(data["text"]))

+    except Exception as e:
+        data = None
+        validation_messages.append(
+            f"❌ **エラー**: ファイルの読み込みに失敗しました: {str(e)}\n"
+        )

+    # We need the maximum number of tokens for the slider
+    max_tokens = data["text"].map(lambda s: len(s.split())).max()

+    mo.md(f"""
+    ## データ確認

+    {"**警告**:\n" if validation_messages else ""}
+    {"\n".join(map(lambda x: f"- {x}", validation_messages))}

+    解析済テキスト一覧:
+    {mo.ui.table(data, selection="multi", format_mapping={"text": lambda s: s[:20] + "..."})}
+    """)
+    return (data,)


 @app.cell
+def sampling_controls_setup():
+    chunk_size = mo.ui.slider(
+        start=500,
+        stop=50_000,
+        value=2000,
+        step=500,
+        label="1チャンクあたり最大トークン数",
+        full_width=True,
+    )
+    sample_frac = mo.ui.slider(
+        start=0.1,
+        stop=1.0,
+        value=0.2,
+        step=0.05,
+        label="使用割合(1.0で全データ)",
+        full_width=True,
+    )
+    sampling_form = (
+        mo.md("{chunk_size}\n{sample_frac}")
+        .batch(chunk_size=chunk_size, sample_frac=sample_frac)
+        .form(show_clear_button=True, bordered=False)
+    )
+    sampling_form
+    return chunk_size, sample_frac, sampling_form


 @app.cell
+def _(build_corpus_cached, chunk_texts, data, sample_frac, sampling_form):
+    mo.stop(sampling_form.value is None)
+
+    with mo.status.spinner("コーパスをサンプリング中…"):
+        texts, cats, fnames = chunk_texts(
+            list(data.text),
+            list(data.category),
+            list(data.filename),
+            sampling_form.value["chunk_size"],
+        )
+
+        if sample_frac.value < 1.0:
+            N = len(texts)
+            k = int(N * sampling_form.value["sample_frac"])
+            idx = random.sample(range(N), k)
+            texts = [texts[i] for i in idx]
+            cats = [cats[i] for i in idx]
+            fnames = [fnames[i] for i in idx]
+
+        corpus = build_corpus_cached(
+            texts,
+            cats,
+        )
+    return cats, corpus, fnames, texts


 @app.cell
+def sampling_controls(chunk_size):
+    mo.md("トークン数を増やすと処理時間が長くなります").callout(
+        kind="info"
+    ) if chunk_size.value > 30_000 else None
     return


 @app.cell
+def plot_main_scatterplot(corpus, data_form, fnames):
+    cat_name = data_form.value["category_name"]
+    with mo.status.spinner("Scatterplot作成中…"):
+        html = st.produce_scattertext_explorer(
+            corpus,
+            category=data_form.value["label_a"],
+            category_name=f"{cat_name}: {data_form.value['label_a']}",
+            not_category_name=f"{cat_name}: {data_form.value['label_b']}",
+            width_in_pixels=1000,
+            metadata=fnames,
+        )

+    mo.vstack(
+        [
+            mo.md(f"""
+            # Scattertextの結果
+            ### Scattertext可視化の見方
+            - (縦)上に行くほど{data_form.value["label_a"]}で相対的に多く使われるトークン
+            - (横)右に行くほど{data_form.value["label_b"]}で相対的に多く使われるトークン
+
+            HTMLをダウンロードしてブラウザで開くと見やすい
+            """),
+            mo.iframe(html),
+        ]
     )
+    return (html,)


+@app.cell
+def _(html):
+    download_button = mo.download(
+        data=html.encode(),
+        filename="scattertext_analysis.html",
+        label="可視化結果をダウンロード",
     )

+    mo.md(f"{download_button}")
+    return


+@app.cell
+def classification_toggle():
+    run_model = mo.ui.switch(label="分類モデルを適用する")
+    run_model
+    return (run_model,)


+@app.cell
+def _(run_model):
+    mo.stop(not run_model.value)

+    mo.md(
+        r"""
+        # 分類モデルによる検証

+        2つのカテゴリを分類するモデルを学習し、それぞれのカテゴリを分ける有効な素性(単語)がどれなのかもScattertextで観察できます。
+        ここはRandom Forestという機械学習モデルを使用しています。
+        """
+    )
+    return


+@app.cell
+def _(cats, fnames, run_model, texts, train_scikit_cached):
+    mo.stop(not run_model.value)

+    scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
+        texts, cats, fnames
+    )
+    return chunk_cats, chunk_fnames, scikit_corpus, tfidf_X, vectorizer


+@app.cell
+def model_selection(run_model):
+    mo.stop(not run_model.value)
+
+    model_dropdown = mo.ui.dropdown(
+        options=[
+            "LogisticRegression",
+            "RandomForestClassifier",
+            "GradientBoostingClassifier",
+        ],
+        value="LogisticRegression",
+        label="モデル選択",
+    )
+    model_dropdown
+    return (model_dropdown,)


+@app.cell
+def hyperparameters(model_dropdown):
+    lr_C = mo.ui.slider(0.01, 10.0, value=1.0, step=0.01, label="LR C")
+    lr_max_iter = mo.ui.slider(100, 2000, value=1000, step=100, label="LR max_iter")
+    rf_n = mo.ui.slider(10, 500, value=100, step=10, label="RF n_estimators")
+    rf_max_depth = mo.ui.slider(1, 50, value=10, step=1, label="RF max_depth")
+    gb_n = mo.ui.slider(10, 500, value=100, step=10, label="GB n_estimators")
+    gb_lr = mo.ui.slider(0.01, 1.0, value=0.1, step=0.01, label="GB learning_rate")
+    gb_md = mo.ui.slider(1, 10, value=3, step=1, label="GB max_depth")
+
+    widgets = []
+    if model_dropdown.value == "LogisticRegression":
+        widgets = {"lr_C": lr_C, "lr_max_iter": lr_max_iter}
+    elif model_dropdown.value == "RandomForestClassifier":
+        widgets = {"rf_n": rf_n, "rf_max_depth": rf_max_depth}
+    else:  # GradientBoostingClassifier
+        widgets = {"gb_n": gb_n, "gb_lr": gb_lr, "gb_md": gb_md}
+
+    test_size = mo.ui.slider(0.1, 0.5, value=0.3, step=0.05, label="テストデータ比率")
+
+    model_form = (
+        mo.md("### モデルのパラメータ設定\n{widgets}\n{test_size}")
+        .batch(
+            widgets=mo.ui.dictionary(widgets),
+            test_size=test_size,
+        )
+        .form(show_clear_button=True, bordered=False)
     )

+    model_form
+    return (model_form,)


+@app.cell
+def _(
+    chunk_cats,
+    label_a,
+    label_b,
+    model_dropdown,
+    model_form,
+    roc_auc,
+    roc_df,
+    run_model,
+    tfidf_X,
+    vectorizer,
+):
+    mo.stop(not run_model.value or not model_form.value)
+
+    import altair as alt
+    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.metrics import (
+        auc,
+        classification_report,
+        confusion_matrix,
+        roc_curve,
     )
+    from sklearn.model_selection import train_test_split

+    X_train, X_test, y_train, y_test = train_test_split(
+        tfidf_X,
+        chunk_cats,
+        test_size=model_form.value["test_size"],
+        random_state=RANDOM_SEED,
+    )

+    name = model_dropdown.value
+    if name == "LogisticRegression":
+        clf = LogisticRegression(
+            C=model_form.value["widgets"]["lr_C"],
+            max_iter=int(model_form.value["widgets"]["lr_max_iter"]),
+        )
+    elif name == "RandomForestClassifier":
+        clf = RandomForestClassifier(
+            n_estimators=int(model_form.value["widgets"]["rf_n"]),
+            max_depth=int(model_form.value["widgets"]["rf_max_depth"]),
+            random_state=RANDOM_SEED,
+        )
+    else:  # GradientBoostingClassifier
+        clf = GradientBoostingClassifier(
+            n_estimators=int(model_form.value["widgets"]["gb_n"]),
+            learning_rate=float(model_form.value["widgets"]["gb_lr"]),
+            max_depth=int(model_form.value["widgets"]["gb_md"]),
+            random_state=RANDOM_SEED,
+        )

+    clf.fit(X_train, y_train)
+    if hasattr(clf, "feature_importances_"):
+        term_scores = clf.feature_importances_
+    else:
+        term_scores = abs(clf.coef_[0])
+
+    y_pred = clf.predict(X_test)
+    report = classification_report(y_test, y_pred, output_dict=True)
+
+    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
+    cm_df = (
+        pd.DataFrame(cm, index=clf.classes_, columns=clf.classes_)
+        .reset_index()
+        .melt(
+            id_vars="index",
+            var_name="Predicted",
+            value_name="count",
+        )
+        .rename(columns={"index": "Actual"})
+    )

+    # pos_idx = list(clf.classes_).index(label_a.value)
+    # _proba, roc_auc = None, None
+    # roc_df = None
+    # if hasattr(clf, "predict_proba"):
+    #     probs = clf.predict_proba(X_test)[:, pos_idx]
+    #     y_test_arr = np.array(y_test)
+    #     fpr, tpr, _ = roc_curve((y_test_arr == label_a.value).astype(int), probs)
+    #     roc_auc = auc(fpr, tpr)
+    #     roc_df = pd.DataFrame({"fpr": fpr, "tpr": tpr})
+
+    feature_names = vectorizer.get_feature_names_out()
+    importances = (
+        pd.DataFrame({"単語": feature_names, "重要度": term_scores})
+        .sort_values("重要度", ascending=False)
+        .head(20)
+    )

+    imp_chart = (
+        alt.Chart(importances)
+        .mark_bar()
+        .encode(
+            x=alt.X("重要度:Q", title="重要度"),
+            y=alt.Y("単語:N", sort="-x"),
+        )
+        .properties(title="Top‐20 重要特徴語", width=600, height=400)
     )
+    cm_chart = (
+        alt.Chart(cm_df)
+        .mark_rect()
+        .encode(
+            x="Predicted:N",
+            y="Actual:N",
+            color=alt.Color("count:Q", title="Count"),
+            tooltip=["Actual", "Predicted", "count"],
+        )
+        .properties(title="Confusion Matrix", width=250, height=250)
+    )
+    # roc_chart = (
+    #     alt.Chart(roc_df)
+    #     .mark_line(point=True)
+    #     .encode(
+    #         x=alt.X("fpr:Q", title="False Positive Rate"),
+    #         y=alt.Y("tpr:Q", title="True Positive Rate"),
+    #     )
+    #     .properties(
+    #         title=f"ROC Curve (AUC={roc_auc:.2f})",
+    #         width=400,
+    #         height=300,
+    #     )
+    # )
+
+    mo.vstack(
+        [
+            mo.ui.altair_chart(imp_chart),
+            mo.ui.altair_chart(cm_chart),
+            # mo.ui.altair_chart(roc_chart),  # Turned out to not be too informative as task is too easy?
+            mo.md(f"""
+            ## テストセット上の分類性能
+
+            - {label_a.value}: 精度 {report[label_a.value]["precision"]:.2%}, 再現率 {report[label_a.value]["recall"]:.2%}
+            - {label_b.value}: 精度 {report[label_b.value]["precision"]:.2%}, 再現率 {report[label_b.value]["recall"]:.2%}
+            """),
+        ]
+    )
+    return (term_scores,)


+@app.cell
+def _(
+    chunk_fnames,
+    data_form,
+    model_form,
+    run_model,
+    scikit_corpus,
+    term_scores,
+):
+    mo.stop(not run_model.value or not model_form.value)
+
+    with mo.status.spinner("分類モデルのScatterplotを作成中…"):
+        scikit_html = st.produce_scattertext_explorer(
+            corpus=scikit_corpus,
+            category=data_form.value["label_a"],
+            category_name=data_form.value["label_a"],
+            not_category_name=data_form.value["label_b"],
+            scores=term_scores,
+            terms_to_include=st.AutoTermSelector.get_selected_terms(
+                scikit_corpus, term_scores, 4000
+            ),
+            metadata=chunk_fnames,
+            transform=lambda freqs, _index, total: freqs / total.sum(),
+            rescale_x=lambda arr: arr,  # identity
+            rescale_y=lambda arr: arr,  # identity
+        )
+    mo.iframe(scikit_html)
+    return


 if __name__ == "__main__":
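Outside of marimo, the Scattertext pipeline that `app.py` wraps can also be exercised as a plain script. A minimal sketch, assuming two whitespace-tokenized strings (e.g. fugashi `-Owakati` output) are already in hand, and using only calls that appear in the diff above; the toy texts and output filename here are illustrative, not part of the commit:

```python
# Sketch: the core Scattertext steps from app.py, without the marimo UI.
import pandas as pd
import scattertext as st

# Assumed inputs: already-tokenized (space-separated) texts, one per category.
text_a = "吾輩 は 猫 で ある"
text_b = "火星 兵団 が 攻め て くる"

df = pd.DataFrame({"text": [text_a, text_b], "category": ["夏目漱石", "海野十三"]})
corpus = (
    st.CorpusFromPandas(
        df, category_col="category", text_col="text", nlp=st.whitespace_nlp_with_sentences
    )
    .build()
    .get_unigram_corpus()
    .compact(st.AssociationCompactor(2000))
)
html = st.produce_scattertext_explorer(
    corpus, category="夏目漱石", not_category_name="海野十三", width_in_pixels=1000
)
with open("scattertext_analysis.html", "w", encoding="utf-8") as f:
    f.write(html)  # open in a browser, as the app's download button suggests
```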
development.md CHANGED
@@ -3,6 +3,6 @@
 ## Testing your Dockerfile locally
 
 ```bash
-docker build -t marimo-app .
-docker run -it --rm -p 7860:7860 marimo-app
+docker build -t scattertext-ja-novels .
+docker run -it --rm -p 7860:7860 scattertext-ja-novels
 ```
pyproject.toml ADDED
@@ -0,0 +1,17 @@
+[project]
+name = "scattertext-ja-novels"
+version = "0.1.0"
+description = "Scattertext on Japanese novels"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "altair>=5.5.0",
+    "fugashi-plus>=1.4.0.post1",
+    "marimo>=0.13.15",
+    "numpy>=2.2.6",
+    "pandas>=2.3.0",
+    "pyarrow>=20.0.0",
+    "scattertext==0.2.2",
+    "scikit-learn==1.7.0",
+    "scipy==1.13.1",
+]
requirements.txt DELETED
@@ -1,5 +0,0 @@
-marimo
-# Or a specific version
-# marimo>=0.9.0
-
-# Add other dependencies as needed
uv.lock ADDED
The diff for this file is too large to render. See raw diff