Bor Hodošček committed on
Commit
6b8ea95
·
1 Parent(s): c8648fb

feat: kwic & improved docs; fix: stopword edge cases

Browse files
Files changed (1) hide show
  1. app.py +147 -48
app.py CHANGED
@@ -204,6 +204,40 @@ def function_export():
204
 
205
  return scikit_corpus, X_tfidf, tfv, categories, filenames
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  def split_speech_text(text: str) -> tuple[str, str]:
208
  """
209
  Extract all quoted spans as 'speech' and the remainder as 'non-speech'
@@ -291,6 +325,7 @@ def function_export():
291
  return (
292
  build_corpus_cached,
293
  chunk_texts,
 
294
  parse_texts,
295
  prepare_files,
296
  train_scikit_cached,
@@ -676,77 +711,76 @@ def _():
676
 
677
 
678
  @app.cell
679
- def stopword_settings():
680
  stop_filter = mo.ui.switch(label="Enable stop-word filtering?", value=False)
681
  stop_filter
682
  return (stop_filter,)
683
 
684
 
685
  @app.cell
686
- def _(stop_filter):
687
- mo.stop(not stop_filter.value)
688
-
689
- sw_source = mo.ui.dropdown(
690
- options=["spaCy", "Custom", "Both"],
691
- value="spaCy",
692
- label="Stop-word source",
693
- )
694
-
695
- empty_df = pd.DataFrame({"stopword": []}, dtype=pd.StringDtype())
696
- editor = mo.ui.data_editor(empty_df).form(
697
- label="Your custom stop-words (+ Add Row)", bordered=True
698
- )
699
  sw_source
700
- return editor, sw_source
701
 
702
 
703
  @app.cell
704
- def _(editor, sw_source):
705
- mo.stop(sw_source.value == "spaCy")
706
-
 
 
 
 
 
707
  editor
708
- return
709
 
710
 
711
  @app.cell
712
- def make_stopword_list(editor, sw_source):
713
- mo.stop(
714
- editor.value is None
715
- )
716
-
717
- sw = set()
718
- if sw_source.value in ("spaCy", "Both"):
719
- from spacy.lang.en.stop_words import STOP_WORDS
720
-
721
- sw.update(STOP_WORDS)
722
- if sw_source.value in ("Custom", "Both"):
723
- # editor.value is a pandas DataFrame
724
- for w in editor.value["stopword"].dropna().astype(str):
725
- token = w.strip()
726
- if token:
727
- sw.add(token)
728
- if sw:
 
 
 
 
729
  sw = list(sw)
 
 
730
  return (sw,)
731
 
732
 
733
- @app.cell
734
- def _(editor, stop_filter, sw):
735
- final_stopwords = None
736
- if editor.value is not None:
737
- print(stop_filter.value)
738
- final_stopwords = sw
739
- return (final_stopwords,)
740
-
741
-
742
  @app.cell
743
  def _(
744
  cats,
745
- final_stopwords,
746
  fnames,
747
  max_df_setting,
748
  max_features_setting,
749
  min_df_setting,
 
750
  texts,
751
  train_scikit_cached,
752
  ):
@@ -757,7 +791,7 @@ def _(
757
  min_df=min_df_setting.value,
758
  max_df=max_df_setting.value,
759
  max_features=max_features_setting.value,
760
- stop_words=final_stopwords,
761
  )
762
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer
763
 
@@ -865,6 +899,11 @@ def _(model, results, three_switch):
865
  - 会話文と地の文サンプル間の分散に最も寄与する共起語彙パターン、および判別力の高い語彙が特定されます。
866
  - PCAは傾度に沿った線形関係を仮定するため、言語スタイルの緩やかな変化も示されます。
867
  - $\mathrm{{tfidf}}$スコアの連続性を保持したまま、次元削減が実現されます。
 
 
 
 
 
868
  """
869
  ),
870
  mo.mpl.interactive(plt.gcf()),
@@ -889,6 +928,11 @@ def _():
889
  - サンプルと単語の両方をランダムな観測値として対称的に扱うことができる
890
 
891
  といった分析が可能となります。
 
 
 
 
 
892
  """
893
  )
894
  return
@@ -1129,12 +1173,15 @@ def _(
1129
 
1130
  @app.cell
1131
  def sample_selector(fnames):
 
 
1132
  text_selector = mo.ui.dropdown(
1133
  options=list(sorted(fnames)),
1134
  value=fnames[0] if fnames else None,
1135
  label="Select a sample to view",
1136
  )
1137
- text_selector
 
1138
  return (text_selector,)
1139
 
1140
 
@@ -1147,6 +1194,58 @@ def sample_viewer(fnames, text_selector, texts):
1147
  return
1148
 
1149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1150
  @app.cell
1151
  def _():
1152
  mo.md(
 
204
 
205
  return scikit_corpus, X_tfidf, tfv, categories, filenames
206
 
207
@mo.cache
def kwic_search(
    texts: list[str],
    keyword: str,
    context_chars: int = 20,
) -> pd.DataFrame:
    """
    KeyWord In Context (KWIC) search over a list of strings.

    Matches `keyword` case-insensitively at word boundaries and collects
    the surrounding character context for every occurrence.

    Parameters
    ----------
    texts : list[str]
        Documents to search; non-str items are coerced with ``str()``.
    keyword : str
        Word to look for. A blank or whitespace-only keyword returns an
        empty frame (previously the pattern degenerated to ``\\b\\b``,
        which matches at every word boundary and floods the output with
        zero-length hits).
    context_chars : int, default 20
        Number of characters kept on each side of the match.

    Returns
    -------
    pd.DataFrame
        One row per occurrence, columns:
        - original_index: index into `texts`
        - before, keyword, after: context snippets
    """
    import re

    import pandas as pd

    columns = ["original_index", "before", "keyword", "after"]

    # Guard against the empty-keyword edge case described above.
    if not keyword or not keyword.strip():
        return pd.DataFrame(columns=columns)

    # Compile once; reused across all documents in the loop below.
    pattern = re.compile(rf"\b{re.escape(keyword)}\b", re.IGNORECASE)

    results: list[dict] = []
    for idx, txt in enumerate(texts):
        txt = str(txt)
        for m in pattern.finditer(txt):
            s, e = m.span()
            results.append(
                {
                    "original_index": idx,
                    # Slicing clamps at both ends, but a negative start
                    # would wrap around, so clamp the left edge explicitly.
                    "before": txt[max(0, s - context_chars) : s],
                    "keyword": txt[s:e],
                    "after": txt[e : e + context_chars],
                }
            )
    return pd.DataFrame(results, columns=columns)
241
  def split_speech_text(text: str) -> tuple[str, str]:
242
  """
243
  Extract all quoted spans as 'speech' and the remainder as 'non-speech'
 
325
  return (
326
  build_corpus_cached,
327
  chunk_texts,
328
+ kwic_search,
329
  parse_texts,
330
  prepare_files,
331
  train_scikit_cached,
 
711
 
712
 
713
@app.cell
def stopword_switch():
    # Master on/off toggle for stop-word filtering; downstream cells read
    # `stop_filter.value` and short-circuit when it is False.
    stop_filter = mo.ui.switch(label="Enable stop-word filtering?", value=False)
    stop_filter  # last expression → rendered as this cell's output
    return (stop_filter,)
718
 
719
 
720
  @app.cell
721
+ def stopword_source(stop_filter):
722
+ if stop_filter.value:
723
+ sw_source = mo.ui.dropdown(
724
+ options=["spaCy", "Custom", "Both"],
725
+ value="spaCy",
726
+ label="Stop-word source",
727
+ full_width=True,
728
+ )
729
+ else:
730
+ sw_source = None
 
 
 
731
  sw_source
732
+ return (sw_source,)
733
 
734
 
735
@app.cell
def custom_stopword_editor(sw_source):
    # Show the editable stop-word table only when the user opted into
    # custom words ("Custom" or "Both"); otherwise expose None.
    if sw_source and sw_source.value in ("Custom", "Both"):
        # Typed empty column so edited values stay pandas strings.
        empty = pd.DataFrame({"stopword": []}, dtype=pd.StringDtype())
        editor = mo.ui.data_editor(empty).form(
            label="Your custom stop-words", bordered=True
        )
    else:
        editor = None
    editor  # rendered output (renders nothing when None)
    return (editor,)
746
 
747
 
748
@app.cell
def final_stopwords(editor, stop_filter, sw_source):
    # Build the final stop-word list consumed by the vectorizer cell.
    # Contract: `sw` is a list[str] when filtering is enabled, else None
    # (None disables stop-word filtering in scikit-learn vectorizers).
    # if master switch off → no filtering
    if stop_filter.value:
        # require a source choice; mo.stop halts this cell's execution
        # and displays the message until a source is selected
        mo.stop(sw_source is None, mo.md("Choose stop-word source"))

        sw: set[str] = set()
        if sw_source.value in ("spaCy", "Both"):
            # Imported lazily so spaCy is only loaded when actually used.
            from spacy.lang.en.stop_words import STOP_WORDS

            sw.update(STOP_WORDS)

        if sw_source.value in ("Custom", "Both"):
            # Halt until the editor form has been submitted with data.
            mo.stop(
                editor is None or editor.value is None,
                mo.md("Enter at least one custom stop-word"),
            )
            # editor.value is a DataFrame; keep non-blank trimmed tokens.
            for tok in editor.value["stopword"].dropna().astype(str):
                tok = tok.strip()
                if tok:
                    sw.add(tok)
        sw = list(sw)
    else:
        sw = None
    return (sw,)
774
 
775
 
 
 
 
 
 
 
 
 
 
776
  @app.cell
777
  def _(
778
  cats,
 
779
  fnames,
780
  max_df_setting,
781
  max_features_setting,
782
  min_df_setting,
783
+ sw: set[str],
784
  texts,
785
  train_scikit_cached,
786
  ):
 
791
  min_df=min_df_setting.value,
792
  max_df=max_df_setting.value,
793
  max_features=max_features_setting.value,
794
+ stop_words=sw,
795
  )
796
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer
797
 
 
899
  - 会話文と地の文サンプル間の分散に最も寄与する共起語彙パターン、および判別力の高い語彙が特定されます。
900
  - PCAは傾度に沿った線形関係を仮定するため、言語スタイルの緩やかな変化も示されます。
901
  - $\mathrm{{tfidf}}$スコアの連続性を保持したまま、次元削減が実現されます。
902
+
903
+ **主成分とは?**
904
+
905
+ 主成分は「データのばらつきを一番よく説明する単語の線形結合」です。
906
+ 数式よりも「語彙の座標軸」と捉えてください。
907
  """
908
  ),
909
  mo.mpl.interactive(plt.gcf()),
 
928
  - サンプルと単語の両方をランダムな観測値として対称的に扱うことができる
929
 
930
  といった分析が可能となります。
931
+
932
+ **CAの出力の読み取り方**
933
+
934
+ 行(サンプル)と列(単語)が近いほど、その単語がそのサンプル群に特徴的です。
935
+ プロット上で原点に近い点は「どのカテゴリにも偏らない語」です。
936
  """
937
  )
938
  return
 
1173
 
1174
@app.cell
def sample_selector(fnames):
    # Explanatory heading plus a dropdown for inspecting a single sample.
    selector_explanation = mo.md("## データの確認\n\n### サンプルの確認\n\n以下の選択肢から任意のサンプルを選ぶとその中身が確認できます。")

    text_selector = mo.ui.dropdown(
        options=sorted(fnames),
        value=next(iter(fnames), None),
        label="Select a sample to view",
    )

    # Stack the note above the widget as this cell's rendered output.
    mo.vstack([selector_explanation, text_selector])
    return (text_selector,)
 
1187
 
 
1194
  return
1195
 
1196
 
1197
@app.cell
def _():
    # KWIC search controls: keyword box, context width, and a run button.
    # The button gates the (cached but potentially slow) search downstream.
    kwic_explanation = mo.md("### KWIC検索\n\nKeyWord In Context (KWIC)は検索語の左右コンテクストを効率的に確認できる可視化方法です。")
    keyword = mo.ui.text(label="Search keyword")
    context_chars = mo.ui.number(label="Context chars", start=0, value=50)
    run_btn = mo.ui.run_button(label="Search")
    mo.vstack([kwic_explanation, keyword, context_chars, run_btn])
    return context_chars, keyword, run_btn
1205
+
1206
+
1207
@app.cell
def _(
    authors,
    context_chars,
    keyword,
    kwic_search,
    run_btn,
    speech_types,
    texts,
    works,
):
    # Run the KWIC search only after the button is pressed; until then,
    # halt this cell and show a prompt instead.
    mo.stop(not run_btn.value, mo.md("Type a keyword and click Search."))

    kwic_df = kwic_search(texts, keyword.value, context_chars.value)
    if kwic_df.empty:
        kwic_display = mo.md(f"No occurrences of “{keyword.value}” found.")
    else:
        # reattach metadata
        # NOTE(review): assumes authors/works/speech_types are parallel
        # lists aligned 1:1 with `texts` — confirm against the
        # corpus-building cell.
        meta = pd.DataFrame(
            {
                "sample_index": range(len(texts)),
                "author": authors,
                "work": works,
                "speech_type": speech_types,
            }
        )
        merged = (
            kwic_df
            .merge(
                meta,
                left_on="original_index",
                right_on="sample_index",
                # many KWIC hits may map to one sample; fail fast if the
                # metadata side is not unique per sample
                validate="many_to_one",
            )
            .drop(columns=["original_index", "sample_index"])
        )
        kwic_display = mo.ui.table(merged, selection=None)

    kwic_display  # rendered output: message or results table
    return
1247
+
1248
+
1249
  @app.cell
1250
  def _():
1251
  mo.md(