Spaces:
Running
Running
Bor Hodošček
committed on
Commit
·
6b8ea95
1
Parent(s):
c8648fb
feat: kwic & improved docs; fix: stopword edge cases
Browse files
app.py
CHANGED
@@ -204,6 +204,40 @@ def function_export():
|
|
204 |
|
205 |
return scikit_corpus, X_tfidf, tfv, categories, filenames
|
206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
def split_speech_text(text: str) -> tuple[str, str]:
|
208 |
"""
|
209 |
Extract all quoted spans as 'speech' and the remainder as 'non-speech'
|
@@ -291,6 +325,7 @@ def function_export():
|
|
291 |
return (
|
292 |
build_corpus_cached,
|
293 |
chunk_texts,
|
|
|
294 |
parse_texts,
|
295 |
prepare_files,
|
296 |
train_scikit_cached,
|
@@ -676,77 +711,76 @@ def _():
|
|
676 |
|
677 |
|
678 |
@app.cell
|
679 |
-
def
|
680 |
stop_filter = mo.ui.switch(label="Enable stop-word filtering?", value=False)
|
681 |
stop_filter
|
682 |
return (stop_filter,)
|
683 |
|
684 |
|
685 |
@app.cell
|
686 |
-
def
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
editor = mo.ui.data_editor(empty_df).form(
|
697 |
-
label="Your custom stop-words (+ Add Row)", bordered=True
|
698 |
-
)
|
699 |
sw_source
|
700 |
-
return
|
701 |
|
702 |
|
703 |
@app.cell
|
704 |
-
def
|
705 |
-
|
706 |
-
|
|
|
|
|
|
|
|
|
|
|
707 |
editor
|
708 |
-
return
|
709 |
|
710 |
|
711 |
@app.cell
|
712 |
-
def
|
713 |
-
|
714 |
-
|
715 |
-
|
716 |
-
|
717 |
-
|
718 |
-
|
719 |
-
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
|
727 |
-
|
728 |
-
|
|
|
|
|
|
|
|
|
729 |
sw = list(sw)
|
|
|
|
|
730 |
return (sw,)
|
731 |
|
732 |
|
733 |
-
@app.cell
|
734 |
-
def _(editor, stop_filter, sw):
|
735 |
-
final_stopwords = None
|
736 |
-
if editor.value is not None:
|
737 |
-
print(stop_filter.value)
|
738 |
-
final_stopwords = sw
|
739 |
-
return (final_stopwords,)
|
740 |
-
|
741 |
-
|
742 |
@app.cell
|
743 |
def _(
|
744 |
cats,
|
745 |
-
final_stopwords,
|
746 |
fnames,
|
747 |
max_df_setting,
|
748 |
max_features_setting,
|
749 |
min_df_setting,
|
|
|
750 |
texts,
|
751 |
train_scikit_cached,
|
752 |
):
|
@@ -757,7 +791,7 @@ def _(
|
|
757 |
min_df=min_df_setting.value,
|
758 |
max_df=max_df_setting.value,
|
759 |
max_features=max_features_setting.value,
|
760 |
-
stop_words=
|
761 |
)
|
762 |
return chunk_cats, chunk_fnames, tfidf_X, vectorizer
|
763 |
|
@@ -865,6 +899,11 @@ def _(model, results, three_switch):
|
|
865 |
- 会話文と地の文サンプル間の分散に最も寄与する共起語彙パターン、および判別力の高い語彙が特定されます。
|
866 |
- PCAは傾度に沿った線形関係を仮定するため、言語スタイルの緩やかな変化も示されます。
|
867 |
- $\mathrm{{tfidf}}$スコアの連続性を保持したまま、次元削減が実現されます。
|
|
|
|
|
|
|
|
|
|
|
868 |
"""
|
869 |
),
|
870 |
mo.mpl.interactive(plt.gcf()),
|
@@ -889,6 +928,11 @@ def _():
|
|
889 |
- サンプルと単語の両方をランダムな観測値として対称的に扱うことができる
|
890 |
|
891 |
といった分析が可能となります。
|
|
|
|
|
|
|
|
|
|
|
892 |
"""
|
893 |
)
|
894 |
return
|
@@ -1129,12 +1173,15 @@ def _(
|
|
1129 |
|
1130 |
@app.cell
|
1131 |
def sample_selector(fnames):
|
|
|
|
|
1132 |
text_selector = mo.ui.dropdown(
|
1133 |
options=list(sorted(fnames)),
|
1134 |
value=fnames[0] if fnames else None,
|
1135 |
label="Select a sample to view",
|
1136 |
)
|
1137 |
-
|
|
|
1138 |
return (text_selector,)
|
1139 |
|
1140 |
|
@@ -1147,6 +1194,58 @@ def sample_viewer(fnames, text_selector, texts):
|
|
1147 |
return
|
1148 |
|
1149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1150 |
@app.cell
|
1151 |
def _():
|
1152 |
mo.md(
|
|
|
204 |
|
205 |
return scikit_corpus, X_tfidf, tfv, categories, filenames
|
206 |
|
207 |
+
@mo.cache
|
208 |
+
def kwic_search(
    texts: list[str],
    keyword: str,
    context_chars: int = 20,
) -> pd.DataFrame:
    """
    Keyword-in-context (KWIC) search over a list of strings.

    The keyword is matched literally, case-insensitively, and only as a whole
    word: a hit must not be directly preceded or followed by a word character.

    Args:
        texts: Documents to search; each item is converted with ``str()``.
        keyword: Literal search term. An empty or whitespace-only keyword
            yields no rows.
        context_chars: Number of characters of context kept on each side of
            a hit. Coerced to ``int``; negative values behave like 0.

    Returns:
        A DataFrame with one row per hit and the columns:
        - original_index: index of the document in `texts`
        - before, keyword, after: context snippets (``keyword`` is the text
          exactly as it appears in the document)
    """
    import re
    import pandas as pd

    columns = ["original_index", "before", "keyword", "after"]

    # A blank keyword would otherwise compile to a pattern that matches at
    # every word boundary and flood the result with empty "hits".
    if not keyword or not keyword.strip():
        return pd.DataFrame([], columns=columns)

    # Slice bounds must be integers; a negative width never yielded any
    # context, so clamp it to zero.
    width = max(0, int(context_chars))

    # Lookarounds instead of `\b`: identical for keywords that start and end
    # with a word character, but they also work when the keyword begins or
    # ends with punctuation (e.g. "c++"), where `\b` can never match.
    pattern = re.compile(
        rf"(?<!\w){re.escape(keyword)}(?!\w)",
        re.IGNORECASE,
    )

    results: list[dict] = []
    for idx, txt in enumerate(texts):
        txt = str(txt)
        for m in pattern.finditer(txt):
            s, e = m.span()
            results.append(
                {
                    "original_index": idx,
                    "before": txt[max(0, s - width) : s],
                    "keyword": txt[s:e],
                    # Slicing already stops at the end of the string.
                    "after": txt[e : e + width],
                }
            )
    return pd.DataFrame(results, columns=columns)
|
240 |
+
|
241 |
def split_speech_text(text: str) -> tuple[str, str]:
|
242 |
"""
|
243 |
Extract all quoted spans as 'speech' and the remainder as 'non-speech'
|
|
|
325 |
return (
|
326 |
build_corpus_cached,
|
327 |
chunk_texts,
|
328 |
+
kwic_search,
|
329 |
parse_texts,
|
330 |
prepare_files,
|
331 |
train_scikit_cached,
|
|
|
711 |
|
712 |
|
713 |
@app.cell
|
714 |
+
def stopword_switch():
    # Master on/off toggle for stop-word filtering; it starts switched off.
    stop_filter = mo.ui.switch(
        value=False,
        label="Enable stop-word filtering?",
    )
    # Bare expression kept as the last statement before the return
    # (presumably so the notebook displays the widget — confirm against
    # marimo's cell-output rules).
    stop_filter
    return (stop_filter,)
|
718 |
|
719 |
|
720 |
@app.cell
|
721 |
+
def stopword_source(stop_filter):
    # Offer a stop-word source dropdown only while filtering is enabled;
    # otherwise `sw_source` stays None.
    sw_source = None
    if stop_filter.value:
        sw_source = mo.ui.dropdown(
            label="Stop-word source",
            options=["spaCy", "Custom", "Both"],
            value="spaCy",
            full_width=True,
        )
    # Bare expression kept as the last statement before the return.
    sw_source
    return (sw_source,)
|
733 |
|
734 |
|
735 |
@app.cell
|
736 |
+
def custom_stopword_editor(sw_source):
    # Build the editable custom stop-word table only when the selected
    # source includes custom words; otherwise `editor` stays None.
    editor = None
    if sw_source and sw_source.value in ("Custom", "Both"):
        # Empty single-column frame with pandas' string dtype as the
        # starting content of the editor.
        empty = pd.DataFrame({"stopword": []}, dtype=pd.StringDtype())
        editor = mo.ui.data_editor(empty).form(
            bordered=True,
            label="Your custom stop-words",
        )
    # Bare expression kept as the last statement before the return.
    editor
    return (editor,)
|
746 |
|
747 |
|
748 |
@app.cell
|
749 |
+
def final_stopwords(editor, stop_filter, sw_source):
    # Resolve the stop-word value exposed to other cells as `sw`:
    #   - None when the master switch is off (no filtering);
    #   - otherwise a sorted list merged from the selected source(s).
    if stop_filter.value:
        # `sw_source` can be None (the source cell yields None while
        # filtering is off); halt with a hint until a source exists.
        mo.stop(sw_source is None, mo.md("Choose stop-word source"))

        # Accumulate in a separate, underscore-prefixed set so that `sw` is
        # only ever bound to a list or None, never to a set.
        _collected: set[str] = set()
        if sw_source.value in ("spaCy", "Both"):
            from spacy.lang.en.stop_words import STOP_WORDS

            _collected.update(STOP_WORDS)

        if sw_source.value in ("Custom", "Both"):
            # Halt while the editor is missing or has no value yet.
            mo.stop(
                editor is None or editor.value is None,
                mo.md("Enter at least one custom stop-word"),
            )
            # Skip missing cells and entries that are blank after trimming.
            for tok in editor.value["stopword"].dropna().astype(str):
                tok = tok.strip()
                if tok:
                    _collected.add(tok)

        # Sorted rather than list(set): set iteration order for strings is
        # not stable between interpreter runs, which would make the value
        # passed downstream differ for identical inputs.
        sw = sorted(_collected)
    else:
        sw = None
    return (sw,)
|
774 |
|
775 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
776 |
@app.cell
|
777 |
def _(
|
778 |
cats,
|
|
|
779 |
fnames,
|
780 |
max_df_setting,
|
781 |
max_features_setting,
|
782 |
min_df_setting,
|
783 |
+
sw: set[str],
|
784 |
texts,
|
785 |
train_scikit_cached,
|
786 |
):
|
|
|
791 |
min_df=min_df_setting.value,
|
792 |
max_df=max_df_setting.value,
|
793 |
max_features=max_features_setting.value,
|
794 |
+
stop_words=sw,
|
795 |
)
|
796 |
return chunk_cats, chunk_fnames, tfidf_X, vectorizer
|
797 |
|
|
|
899 |
- 会話文と地の文サンプル間の分散に最も寄与する共起語彙パターン、および判別力の高い語彙が特定されます。
|
900 |
- PCAは傾度に沿った線形関係を仮定するため、言語スタイルの緩やかな変化も示されます。
|
901 |
- $\mathrm{{tfidf}}$スコアの連続性を保持したまま、次元削減が実現されます。
|
902 |
+
|
903 |
+
**主成分とは?**
|
904 |
+
|
905 |
+
主成分は「データのばらつきを一番よく説明する単語の線形結合」です。
|
906 |
+
数式よりも「語彙の座標軸」と捉えてください。
|
907 |
"""
|
908 |
),
|
909 |
mo.mpl.interactive(plt.gcf()),
|
|
|
928 |
- サンプルと単語の両方をランダムな観測値として対称的に扱うことができる
|
929 |
|
930 |
といった分析が可能となります。
|
931 |
+
|
932 |
+
**CAの出力の読み取り方**
|
933 |
+
|
934 |
+
行(サンプル)と列(単語)が近いほど、その単語がそのサンプル群に特徴的です。
|
935 |
+
プロット上で原点に近い点は「どのカテゴリにも偏らない語」です。
|
936 |
"""
|
937 |
)
|
938 |
return
|
|
|
1173 |
|
1174 |
@app.cell
|
1175 |
def sample_selector(fnames):
    # Heading and instructions (Japanese Markdown) placed above the dropdown.
    selector_explanation = mo.md("## データの確認\n\n### サンプルの確認\n\n以下の選択肢から任意のサンプルを選ぶとその中身が確認できます。")

    # Options are listed in sorted order; the initial selection is the first
    # entry of `fnames` as given, or nothing when `fnames` is empty.
    _initial = fnames[0] if fnames else None
    text_selector = mo.ui.dropdown(
        label="Select a sample to view",
        value=_initial,
        options=sorted(fnames),
    )

    # Explanation stacked above the selector; kept as the last expression
    # before the return.
    mo.vstack([selector_explanation, text_selector])
    return (text_selector,)
|
1186 |
|
1187 |
|
|
|
1194 |
return
|
1195 |
|
1196 |
|
1197 |
+
@app.cell
|
1198 |
+
def _():
    # Input widgets for the KWIC search: the search term, the context width
    # (minimum 0, initially 50) and a button that triggers the search.
    run_btn = mo.ui.run_button(label="Search")
    context_chars = mo.ui.number(value=50, start=0, label="Context chars")
    keyword = mo.ui.text(label="Search keyword")

    # Section heading and short description (Japanese Markdown).
    kwic_explanation = mo.md("### KWIC検索\n\nKeyWord In Context (KWIC)は検索語の左右コンテクストを効率的に確認できる可視化方法です。")

    # Explanation first, then the controls, stacked vertically.
    mo.vstack([kwic_explanation, keyword, context_chars, run_btn])
    return context_chars, keyword, run_btn
|
1205 |
+
|
1206 |
+
|
1207 |
+
@app.cell
|
1208 |
+
def _(
    authors,
    context_chars,
    keyword,
    kwic_search,
    run_btn,
    speech_types,
    texts,
    works,
):
    # Run the KWIC search and show the hits together with per-sample metadata.
    #
    # Halt until the button has been clicked AND a non-blank keyword is
    # present: searching for an empty keyword is meaningless and would
    # otherwise be sent to `kwic_search` as-is.
    mo.stop(
        not run_btn.value or not (keyword.value or "").strip(),
        mo.md("Type a keyword and click Search."),
    )

    kwic_df = kwic_search(texts, keyword.value, context_chars.value)
    if kwic_df.empty:
        kwic_display = mo.md(f"No occurrences of “{keyword.value}” found.")
    else:
        # Reattach metadata: one row per sample, keyed by its position in
        # `texts` (assumes `authors`, `works` and `speech_types` are aligned
        # with `texts` — confirm against the cell that produces them).
        meta = pd.DataFrame(
            {
                "sample_index": range(len(texts)),
                "author": authors,
                "work": works,
                "speech_type": speech_types,
            }
        )
        merged = (
            kwic_df
            .merge(
                meta,
                left_on="original_index",
                right_on="sample_index",
                # Many hits may point at one sample, never the reverse.
                validate="many_to_one",
            )
            # The join keys are only needed for the merge itself.
            .drop(columns=["original_index", "sample_index"])
        )
        kwic_display = mo.ui.table(merged, selection=None)

    kwic_display
    return
|
1247 |
+
|
1248 |
+
|
1249 |
@app.cell
|
1250 |
def _():
|
1251 |
mo.md(
|