Spaces:

bor
/

scattertext-en-novels

Sleeping

App Files Community

Bor Hodošček committed on Jul 1

Commit

dd4089f

1 Parent(s): 66b97d8

feat: improve explanation, feature selection

Browse files

Files changed (1) hide show

app.py +61 -10

app.py CHANGED Viewed

@@ -620,23 +620,73 @@ def _():
 @app.cell
-def _(cats, fnames, texts, train_scikit_cached):
     scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
-        texts, cats, fnames
     )
     return chunk_cats, chunk_fnames, tfidf_X, vectorizer
 @app.cell
 def _(chunk_cats, tfidf_X):
-    from sklearn.model_selection import train_test_split
-    X_train, X_test, y_train, y_test = train_test_split(
-        tfidf_X,
-        chunk_cats,
-        test_size=None,
-        random_state=RANDOM_SEED,
-    )
     return
@@ -702,7 +752,8 @@ def _():
     dendrogram_height = mo.ui.number(
         label="Dendrogram plot height (increase if hard to see labels)",
         start=800,
-        value=1600,
     )
     d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")

 @app.cell
+def _():
+    min_df_setting = mo.ui.slider(start=0.0, stop=1.0, step=0.05, value=0.25, show_value=True, label="Minimum proportion of samples feature appears in")
+    max_df_setting = mo.ui.slider(start=0.0, stop=1.0, step=0.05, value=0.8, show_value=True, label="Maximum proportion of samples feature appears in")
+    max_features_setting = mo.ui.slider(start=10, stop=10_000, step=1, value=100, show_value=True, label="Maximum number of features to use")
+    mo.vstack([mo.md("### 素性設定\n\nどのような単語を分析に使用するかを下記のスライダーで決めます。標準では、ほとんど全ての文章に現る単語、または極端に少ない文章にしか現れない単語が除外されています。そのうえで、$\\mathrm{tfidf}$の値上位100件まで素性としています。"), min_df_setting, max_df_setting, max_features_setting])
+    return max_df_setting, max_features_setting, min_df_setting
+@app.cell
+def _(
+    cats,
+    fnames,
+    max_df_setting,
+    max_features_setting,
+    min_df_setting,
+    texts,
+    train_scikit_cached,
+):
     scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
+        texts, cats, fnames, min_df=min_df_setting.value, max_df=max_df_setting.value, max_features=max_features_setting.value,
     )
     return chunk_cats, chunk_fnames, tfidf_X, vectorizer
 @app.cell
 def _(chunk_cats, tfidf_X):
+    # from sklearn.model_selection import train_test_split
+    # X_train, X_test, y_train, y_test = train_test_split(
+    #     tfidf_X,
+    #     chunk_cats,
+    #     test_size=None,
+    #     random_state=RANDOM_SEED,
+    # )
+    X_train, X_test, y_train, y_test = tfidf_X, chunk_cats, [], []
+    return (X_train,)
+@app.cell
+def _(X_train, chunk_fnames, vectorizer):
+    tf_idf_formula = r"$\mathrm{tfidf}(t,d,D)=\mathrm{tf} (t,d)\cdot \mathrm{idf}(t,D)$"
+    D_formula = r"|\{d:d\in D{\text{ and }}t\in d\}|"
+    idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
+    tf_formula = r"${\displaystyle \mathrm {tf} (t,d)=\textrm{number of times }t\textrm{ appears in }d}$"
+    mo.md(rf"""
+    ### サンプルと素性の行列
+    各セルには、そのテキスト（行）に出現する素性（＝単語）（列）の$\mathrm{{tfidf}}$の値です。
+    $\mathrm{{tfidf}}$が高いほど、その単語の重要度が高いという意味になります。
+    単語が多くの文章に出現する場合は、低い値になります。
+    {tf_idf_formula}
+    {idf_formula}
+    {tf_formula}
+    - ${{\displaystyle D}}$: is the set of all documents in the corpus
+    - ${{\displaystyle N}}$: total number of documents in the corpus ${{\displaystyle N={{|D|}}}}$
+    - ${D_formula}$: number of documents with $t$
+    {mo.ui.table(pd.DataFrame(X_train.toarray(), index=chunk_fnames, columns=vectorizer.get_feature_names_out()))}
+    """)
     return
     dendrogram_height = mo.ui.number(
         label="Dendrogram plot height (increase if hard to see labels)",
         start=800,
+        value=1200,
+        step=100,
     )
     d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")