Bor Hodošček committed on
Commit
dd4089f
·
1 Parent(s): 66b97d8

feat: improve explanation, feature selection

Browse files
Files changed (1) hide show
  1. app.py +61 -10
app.py CHANGED
@@ -620,23 +620,73 @@ def _():
620
 
621
 
622
  @app.cell
623
- def _(cats, fnames, texts, train_scikit_cached):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
624
  scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
625
- texts, cats, fnames
626
  )
627
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer
628
 
629
 
630
  @app.cell
631
  def _(chunk_cats, tfidf_X):
632
- from sklearn.model_selection import train_test_split
633
 
634
- X_train, X_test, y_train, y_test = train_test_split(
635
- tfidf_X,
636
- chunk_cats,
637
- test_size=None,
638
- random_state=RANDOM_SEED,
639
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640
  return
641
 
642
 
@@ -702,7 +752,8 @@ def _():
702
  dendrogram_height = mo.ui.number(
703
  label="Dendrogram plot height (increase if hard to see labels)",
704
  start=800,
705
- value=1600,
 
706
  )
707
 
708
  d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")
 
620
 
621
 
622
  @app.cell
623
+ def _():
624
+ min_df_setting = mo.ui.slider(start=0.0, stop=1.0, step=0.05, value=0.25, show_value=True, label="Minimum proportion of samples feature appears in")
625
+ max_df_setting = mo.ui.slider(start=0.0, stop=1.0, step=0.05, value=0.8, show_value=True, label="Maximum proportion of samples feature appears in")
626
+ max_features_setting = mo.ui.slider(start=10, stop=10_000, step=1, value=100, show_value=True, label="Maximum number of features to use")
627
+
628
+ mo.vstack([mo.md("### 素性設定\n\nどのような単語を分析に使用するかを下記のスライダーで決めます。標準では、ほとんど全ての文章に現る単語、または極端に少ない文章にしか現れない単語が除外されています。そのうえで、$\\mathrm{tfidf}$の値上位100件まで素性としています。"), min_df_setting, max_df_setting, max_features_setting])
629
+ return max_df_setting, max_features_setting, min_df_setting
630
+
631
+
632
+ @app.cell
633
+ def _(
634
+ cats,
635
+ fnames,
636
+ max_df_setting,
637
+ max_features_setting,
638
+ min_df_setting,
639
+ texts,
640
+ train_scikit_cached,
641
+ ):
642
  scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
643
+ texts, cats, fnames, min_df=min_df_setting.value, max_df=max_df_setting.value, max_features=max_features_setting.value,
644
  )
645
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer
646
 
647
 
648
  @app.cell
649
  def _(chunk_cats, tfidf_X):
650
+ # from sklearn.model_selection import train_test_split
651
 
652
+ # X_train, X_test, y_train, y_test = train_test_split(
653
+ # tfidf_X,
654
+ # chunk_cats,
655
+ # test_size=None,
656
+ # random_state=RANDOM_SEED,
657
+ # )
658
+
659
+ X_train, X_test, y_train, y_test = tfidf_X, chunk_cats, [], []
660
+ return (X_train,)
661
+
662
+
663
+ @app.cell
664
+ def _(X_train, chunk_fnames, vectorizer):
665
+ tf_idf_formula = r"$\mathrm{tfidf}(t,d,D)=\mathrm{tf} (t,d)\cdot \mathrm{idf}(t,D)$"
666
+ D_formula = r"|\{d:d\in D{\text{ and }}t\in d\}|"
667
+ idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
668
+ tf_formula = r"${\displaystyle \mathrm {tf} (t,d)=\textrm{number of times }t\textrm{ appears in }d}$"
669
+
670
+
671
+ mo.md(rf"""
672
+ ### サンプルと素性の行列
673
+
674
+ 各セルには、そのテキスト(行)に出現する素性(=単語)(列)の$\mathrm{{tfidf}}$の値です。
675
+ $\mathrm{{tfidf}}$が高いほど、その単語の重要度が高いという意味になります。
676
+ 単語が多くの文章に出現する場合は、低い値になります。
677
+
678
+ {tf_idf_formula}
679
+
680
+ {idf_formula}
681
+
682
+ {tf_formula}
683
+
684
+ - ${{\displaystyle D}}$: is the set of all documents in the corpus
685
+ - ${{\displaystyle N}}$: total number of documents in the corpus ${{\displaystyle N={{|D|}}}}$
686
+ - ${D_formula}$: number of documents with $t$
687
+
688
+ {mo.ui.table(pd.DataFrame(X_train.toarray(), index=chunk_fnames, columns=vectorizer.get_feature_names_out()))}
689
+ """)
690
  return
691
 
692
 
 
752
  dendrogram_height = mo.ui.number(
753
  label="Dendrogram plot height (increase if hard to see labels)",
754
  start=800,
755
+ value=1200,
756
+ step=100,
757
  )
758
 
759
  d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")