Bor Hodošček committed on
Commit
6b8ea95
·
1 Parent(s): c8648fb

feat: kwic & improved docs; fix: stopword edge cases

Browse files
Files changed (1) hide show
  1. app.py +147 -48
app.py CHANGED
@@ -204,6 +204,40 @@ def function_export():
204
 
205
  return scikit_corpus, X_tfidf, tfv, categories, filenames
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  def split_speech_text(text: str) -> tuple[str, str]:
208
  """
209
  Extract all quoted spans as 'speech' and the remainder as 'non-speech'
@@ -291,6 +325,7 @@ def function_export():
291
  return (
292
  build_corpus_cached,
293
  chunk_texts,
 
294
  parse_texts,
295
  prepare_files,
296
  train_scikit_cached,
@@ -676,77 +711,76 @@ def _():
676
 
677
 
678
  @app.cell
679
- def stopword_settings():
680
  stop_filter = mo.ui.switch(label="Enable stop-word filtering?", value=False)
681
  stop_filter
682
  return (stop_filter,)
683
 
684
 
685
  @app.cell
686
- def _(stop_filter):
687
- mo.stop(not stop_filter.value)
688
-
689
- sw_source = mo.ui.dropdown(
690
- options=["spaCy", "Custom", "Both"],
691
- value="spaCy",
692
- label="Stop-word source",
693
- )
694
-
695
- empty_df = pd.DataFrame({"stopword": []}, dtype=pd.StringDtype())
696
- editor = mo.ui.data_editor(empty_df).form(
697
- label="Your custom stop-words (+ Add Row)", bordered=True
698
- )
699
  sw_source
700
- return editor, sw_source
701
 
702
 
703
  @app.cell
704
- def _(editor, sw_source):
705
- mo.stop(sw_source.value == "spaCy")
706
-
 
 
 
 
 
707
  editor
708
- return
709
 
710
 
711
  @app.cell
712
- def make_stopword_list(editor, sw_source):
713
- mo.stop(
714
- editor.value is None
715
- )
716
-
717
- sw = set()
718
- if sw_source.value in ("spaCy", "Both"):
719
- from spacy.lang.en.stop_words import STOP_WORDS
720
-
721
- sw.update(STOP_WORDS)
722
- if sw_source.value in ("Custom", "Both"):
723
- # editor.value is a pandas DataFrame
724
- for w in editor.value["stopword"].dropna().astype(str):
725
- token = w.strip()
726
- if token:
727
- sw.add(token)
728
- if sw:
 
 
 
 
729
  sw = list(sw)
 
 
730
  return (sw,)
731
 
732
 
733
- @app.cell
734
- def _(editor, stop_filter, sw):
735
- final_stopwords = None
736
- if editor.value is not None:
737
- print(stop_filter.value)
738
- final_stopwords = sw
739
- return (final_stopwords,)
740
-
741
-
742
  @app.cell
743
  def _(
744
  cats,
745
- final_stopwords,
746
  fnames,
747
  max_df_setting,
748
  max_features_setting,
749
  min_df_setting,
 
750
  texts,
751
  train_scikit_cached,
752
  ):
@@ -757,7 +791,7 @@ def _(
757
  min_df=min_df_setting.value,
758
  max_df=max_df_setting.value,
759
  max_features=max_features_setting.value,
760
- stop_words=final_stopwords,
761
  )
762
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer
763
 
@@ -865,6 +899,11 @@ def _(model, results, three_switch):
865
  - 会話文と地の文サンプル間の分散に最も寄与する共起語彙パターン、および判別力の高い語彙が特定されます。
866
  - PCAは傾度に沿った線形関係を仮定するため、言語スタイルの緩やかな変化も示されます。
867
  - $\mathrm{{tfidf}}$スコアの連続性を保持したまま、次元削減が実現されます。
 
 
 
 
 
868
  """
869
  ),
870
  mo.mpl.interactive(plt.gcf()),
@@ -889,6 +928,11 @@ def _():
889
  - サンプルと単語の両方をランダムな観測値として対称的に扱うことができる
890
 
891
  といった分析が可能となります。
 
 
 
 
 
892
  """
893
  )
894
  return
@@ -1129,12 +1173,15 @@ def _(
1129
 
1130
  @app.cell
1131
  def sample_selector(fnames):
 
 
1132
  text_selector = mo.ui.dropdown(
1133
  options=list(sorted(fnames)),
1134
  value=fnames[0] if fnames else None,
1135
  label="Select a sample to view",
1136
  )
1137
- text_selector
 
1138
  return (text_selector,)
1139
 
1140
 
@@ -1147,6 +1194,58 @@ def sample_viewer(fnames, text_selector, texts):
1147
  return
1148
 
1149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1150
  @app.cell
1151
  def _():
1152
  mo.md(
 
204
 
205
  return scikit_corpus, X_tfidf, tfv, categories, filenames
206
 
207
@mo.cache
def kwic_search(
    texts: list[str],
    keyword: str,
    context_chars: int = 20,
) -> pd.DataFrame:
    """
    KeyWord In Context (KWIC) search over a list of strings.

    Matches `keyword` case-insensitively at word boundaries and collects
    the surrounding character context for every occurrence.

    Parameters
    ----------
    texts : list[str]
        Documents to search; non-str items are coerced with ``str()``.
    keyword : str
        Word to look for. A blank or whitespace-only keyword returns an
        empty frame (previously the pattern degenerated to ``\\b\\b``,
        which matches at every word boundary and floods the output with
        zero-length hits).
    context_chars : int, default 20
        Number of characters kept on each side of the match.

    Returns
    -------
    pd.DataFrame
        One row per occurrence, columns:
        - original_index: index into `texts`
        - before, keyword, after: context snippets
    """
    import re

    import pandas as pd

    columns = ["original_index", "before", "keyword", "after"]

    # Guard against the empty-keyword edge case described above.
    if not keyword or not keyword.strip():
        return pd.DataFrame(columns=columns)

    # Compile once; reused across all documents in the loop below.
    pattern = re.compile(rf"\b{re.escape(keyword)}\b", re.IGNORECASE)

    results: list[dict] = []
    for idx, txt in enumerate(texts):
        txt = str(txt)
        for m in pattern.finditer(txt):
            s, e = m.span()
            results.append(
                {
                    "original_index": idx,
                    # Slicing clamps at both ends, but a negative start
                    # would wrap around, so clamp the left edge explicitly.
                    "before": txt[max(0, s - context_chars) : s],
                    "keyword": txt[s:e],
                    "after": txt[e : e + context_chars],
                }
            )
    return pd.DataFrame(results, columns=columns)
241
  def split_speech_text(text: str) -> tuple[str, str]:
242
  """
243
  Extract all quoted spans as 'speech' and the remainder as 'non-speech'
 
325
  return (
326
  build_corpus_cached,
327
  chunk_texts,
328
+ kwic_search,
329
  parse_texts,
330
  prepare_files,
331
  train_scikit_cached,
 
711
 
712
 
713
@app.cell
def stopword_switch():
    # Master on/off toggle for stop-word filtering; downstream cells read
    # `stop_filter.value` and short-circuit when it is False.
    stop_filter = mo.ui.switch(label="Enable stop-word filtering?", value=False)
    stop_filter  # last expression → rendered as this cell's output
    return (stop_filter,)
718
 
719
 
720
  @app.cell
721
+ def stopword_source(stop_filter):
722
+ if stop_filter.value:
723
+ sw_source = mo.ui.dropdown(
724
+ options=["spaCy", "Custom", "Both"],
725
+ value="spaCy",
726
+ label="Stop-word source",
727
+ full_width=True,
728
+ )
729
+ else:
730
+ sw_source = None
 
 
 
731
  sw_source
732
+ return (sw_source,)
733
 
734
 
735
@app.cell
def custom_stopword_editor(sw_source):
    # Show the editable stop-word table only when the user opted into
    # custom words ("Custom" or "Both"); otherwise expose None.
    if sw_source and sw_source.value in ("Custom", "Both"):
        # Typed empty column so edited values stay pandas strings.
        empty = pd.DataFrame({"stopword": []}, dtype=pd.StringDtype())
        editor = mo.ui.data_editor(empty).form(
            label="Your custom stop-words", bordered=True
        )
    else:
        editor = None
    editor  # rendered output (renders nothing when None)
    return (editor,)
746
 
747
 
748
@app.cell
def final_stopwords(editor, stop_filter, sw_source):
    # Build the final stop-word list consumed by the vectorizer cell.
    # Contract: `sw` is a list[str] when filtering is enabled, else None
    # (None disables stop-word filtering in scikit-learn vectorizers).
    # if master switch off → no filtering
    if stop_filter.value:
        # require a source choice; mo.stop halts this cell's execution
        # and displays the message until a source is selected
        mo.stop(sw_source is None, mo.md("Choose stop-word source"))

        sw: set[str] = set()
        if sw_source.value in ("spaCy", "Both"):
            # Imported lazily so spaCy is only loaded when actually used.
            from spacy.lang.en.stop_words import STOP_WORDS

            sw.update(STOP_WORDS)

        if sw_source.value in ("Custom", "Both"):
            # Halt until the editor form has been submitted with data.
            mo.stop(
                editor is None or editor.value is None,
                mo.md("Enter at least one custom stop-word"),
            )
            # editor.value is a DataFrame; keep non-blank trimmed tokens.
            for tok in editor.value["stopword"].dropna().astype(str):
                tok = tok.strip()
                if tok:
                    sw.add(tok)
        sw = list(sw)
    else:
        sw = None
    return (sw,)
774
 
775
 
 
 
 
 
 
 
 
 
 
776
  @app.cell
777
  def _(
778
  cats,
 
779
  fnames,
780
  max_df_setting,
781
  max_features_setting,
782
  min_df_setting,
783
+ sw: set[str],
784
  texts,
785
  train_scikit_cached,
786
  ):
 
791
  min_df=min_df_setting.value,
792
  max_df=max_df_setting.value,
793
  max_features=max_features_setting.value,
794
+ stop_words=sw,
795
  )
796
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer
797
 
 
899
  - 会話文と地の文サンプル間の分散に最も寄与する共起語彙パターン、および判別力の高い語彙が特定されます。
900
  - PCAは傾度に沿った線形関係を仮定するため、言語スタイルの緩やかな変化も示されます。
901
  - $\mathrm{{tfidf}}$スコアの連続性を保持したまま、次元削減が実現されます。
902
+
903
+ **主成分とは?**
904
+
905
+ 主成分は「データのばらつきを一番よく説明する単語の線形結合」です。
906
+ 数式よりも「語彙の座標軸」と捉えてください。
907
  """
908
  ),
909
  mo.mpl.interactive(plt.gcf()),
 
928
  - サンプルと単語の両方をランダムな観測値として対称的に扱うことができる
929
 
930
  といった分析が可能となります。
931
+
932
+ **CAの出力の読み取り方**
933
+
934
+ 行(サンプル)と列(単語)が近いほど、その単語がそのサンプル群に特徴的です。
935
+ プロット上で原点に近い点は「どのカテゴリにも偏らない語」です。
936
  """
937
  )
938
  return
 
1173
 
1174
@app.cell
def sample_selector(fnames):
    # Explanatory heading plus a dropdown for inspecting a single sample.
    selector_explanation = mo.md("## データの確認\n\n### サンプルの確認\n\n以下の選択肢から任意のサンプルを選ぶとその中身が確認できます。")

    text_selector = mo.ui.dropdown(
        options=sorted(fnames),
        value=next(iter(fnames), None),
        label="Select a sample to view",
    )

    # Stack the note above the widget as this cell's rendered output.
    mo.vstack([selector_explanation, text_selector])
    return (text_selector,)
 
1187
 
 
1194
  return
1195
 
1196
 
1197
@app.cell
def _():
    # KWIC search controls: keyword box, context width, and a run button.
    # The button gates the (cached but potentially slow) search downstream.
    kwic_explanation = mo.md("### KWIC検索\n\nKeyWord In Context (KWIC)は検索語の左右コンテクストを効率的に確認できる可視化方法です。")
    keyword = mo.ui.text(label="Search keyword")
    context_chars = mo.ui.number(label="Context chars", start=0, value=50)
    run_btn = mo.ui.run_button(label="Search")
    mo.vstack([kwic_explanation, keyword, context_chars, run_btn])
    return context_chars, keyword, run_btn
1205
+
1206
+
1207
@app.cell
def _(
    authors,
    context_chars,
    keyword,
    kwic_search,
    run_btn,
    speech_types,
    texts,
    works,
):
    # Run the KWIC search only after the button is pressed; until then,
    # halt this cell and show a prompt instead.
    mo.stop(not run_btn.value, mo.md("Type a keyword and click Search."))

    kwic_df = kwic_search(texts, keyword.value, context_chars.value)
    if kwic_df.empty:
        kwic_display = mo.md(f"No occurrences of “{keyword.value}” found.")
    else:
        # reattach metadata
        # NOTE(review): assumes authors/works/speech_types are parallel
        # lists aligned 1:1 with `texts` — confirm against the
        # corpus-building cell.
        meta = pd.DataFrame(
            {
                "sample_index": range(len(texts)),
                "author": authors,
                "work": works,
                "speech_type": speech_types,
            }
        )
        merged = (
            kwic_df
            .merge(
                meta,
                left_on="original_index",
                right_on="sample_index",
                # many KWIC hits may map to one sample; fail fast if the
                # metadata side is not unique per sample
                validate="many_to_one",
            )
            .drop(columns=["original_index", "sample_index"])
        )
        kwic_display = mo.ui.table(merged, selection=None)

    kwic_display  # rendered output: message or results table
    return
1247
+
1248
+
1249
  @app.cell
1250
  def _():
1251
  mo.md(