Spaces:
Sleeping
Sleeping
Bor Hodošček
committed on
Commit
·
dd4089f
1
Parent(s):
66b97d8
feat: improve explanation, feature selection
Browse files
app.py
CHANGED
@@ -620,23 +620,73 @@ def _():
|
|
620 |
|
621 |
|
622 |
@app.cell
|
623 |
-
def _(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
624 |
scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
|
625 |
-
texts, cats, fnames
|
626 |
)
|
627 |
return chunk_cats, chunk_fnames, tfidf_X, vectorizer
|
628 |
|
629 |
|
630 |
@app.cell
|
631 |
def _(chunk_cats, tfidf_X):
|
632 |
-
from sklearn.model_selection import train_test_split
|
633 |
|
634 |
-
X_train, X_test, y_train, y_test = train_test_split(
|
635 |
-
|
636 |
-
|
637 |
-
|
638 |
-
|
639 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
640 |
return
|
641 |
|
642 |
|
@@ -702,7 +752,8 @@ def _():
|
|
702 |
dendrogram_height = mo.ui.number(
|
703 |
label="Dendrogram plot height (increase if hard to see labels)",
|
704 |
start=800,
|
705 |
-
value=
|
|
|
706 |
)
|
707 |
|
708 |
d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")
|
|
|
620 |
|
621 |
|
622 |
@app.cell
def _():
    # Sliders that decide which vocabulary terms are kept as features.
    # The two document-frequency sliders range over proportions (0.0-1.0);
    # the third caps the number of features.
    min_df_setting = mo.ui.slider(
        start=0.0,
        stop=1.0,
        step=0.05,
        value=0.25,
        show_value=True,
        label="Minimum proportion of samples feature appears in",
    )
    max_df_setting = mo.ui.slider(
        start=0.0,
        stop=1.0,
        step=0.05,
        value=0.8,
        show_value=True,
        label="Maximum proportion of samples feature appears in",
    )
    max_features_setting = mo.ui.slider(
        start=10,
        stop=10_000,
        step=1,
        value=100,
        show_value=True,
        label="Maximum number of features to use",
    )

    # Heading and explanation rendered above the three sliders.
    mo.vstack(
        [
            mo.md("### 素性設定\n\nどのような単語を分析に使用するかを下記のスライダーで決めます。標準では、ほとんど全ての文章に現る単語、または極端に少ない文章にしか現れない単語が除外されています。そのうえで、$\\mathrm{tfidf}$の値上位100件まで素性としています。"),
            min_df_setting,
            max_df_setting,
            max_features_setting,
        ]
    )
    return max_df_setting, max_features_setting, min_df_setting
|
630 |
+
|
631 |
+
|
632 |
+
@app.cell
def _(
    cats,
    fnames,
    max_df_setting,
    max_features_setting,
    min_df_setting,
    texts,
    train_scikit_cached,
):
    # Current slider values, forwarded as keyword arguments to
    # train_scikit_cached (defined elsewhere in the notebook).
    _vectorizer_limits = {
        "min_df": min_df_setting.value,
        "max_df": max_df_setting.value,
        "max_features": max_features_setting.value,
    }
    scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
        texts, cats, fnames, **_vectorizer_limits
    )
    return chunk_cats, chunk_fnames, tfidf_X, vectorizer
|
646 |
|
647 |
|
648 |
@app.cell
def _(chunk_cats, tfidf_X):
    # NOTE: no hold-out split is made here. The full feature matrix is used
    # as the training portion and the test portion is left empty. An earlier
    # revision called sklearn's train_test_split at this point.
    #
    # Features and labels are bound pairwise so that the labels land in
    # y_train; the previous single-tuple unpacking put them in X_test and
    # left y_train empty.
    X_train, y_train = tfidf_X, chunk_cats
    X_test, y_test = [], []
    return (X_train,)
|
661 |
+
|
662 |
+
|
663 |
+
@app.cell
def _(X_train, chunk_fnames, vectorizer):
    # LaTeX fragments for the explanation text. D_formula is defined first
    # because it is reused inside the idf fraction and in the legend below.
    D_formula = r"|\{d:d\in D{\text{ and }}t\in d\}|"
    tf_formula = r"${\displaystyle \mathrm {tf} (t,d)=\textrm{number of times }t\textrm{ appears in }d}$"
    idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
    tf_idf_formula = r"$\mathrm{tfidf}(t,d,D)=\mathrm{tf} (t,d)\cdot \mathrm{idf}(t,D)$"

    # Dense copy of the feature matrix: rows are labelled with chunk_fnames,
    # columns with the vectorizer's feature names.
    _tfidf_frame = pd.DataFrame(
        X_train.toarray(),
        index=chunk_fnames,
        columns=vectorizer.get_feature_names_out(),
    )
    _tfidf_table = mo.ui.table(_tfidf_frame)

    mo.md(rf"""
    ### サンプルと素性の行列

    各セルには、そのテキスト(行)に出現する素性(=単語)(列)の$\mathrm{{tfidf}}$の値です。
    $\mathrm{{tfidf}}$が高いほど、その単語の重要度が高いという意味になります。
    単語が多くの文章に出現する場合は、低い値になります。

    {tf_idf_formula}

    {idf_formula}

    {tf_formula}

    - ${{\displaystyle D}}$: is the set of all documents in the corpus
    - ${{\displaystyle N}}$: total number of documents in the corpus ${{\displaystyle N={{|D|}}}}$
    - ${D_formula}$: number of documents with $t$

    {_tfidf_table}
    """)
    return
|
691 |
|
692 |
|
|
|
752 |
dendrogram_height = mo.ui.number(
|
753 |
label="Dendrogram plot height (increase if hard to see labels)",
|
754 |
start=800,
|
755 |
+
value=1200,
|
756 |
+
step=100,
|
757 |
)
|
758 |
|
759 |
d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")
|