Spaces:
Running
Running
Bor Hodošček
committed on
feat: speech support for category mode; fix: run only when form filled in
Browse files
app.py
CHANGED
@@ -226,6 +226,7 @@ def function_export():
|
|
226 |
chunk_texts,
|
227 |
make_speech_df,
|
228 |
parse_texts,
|
|
|
229 |
train_scikit_cached,
|
230 |
)
|
231 |
|
@@ -285,6 +286,10 @@ def data_settings():
|
|
285 |
files_b = mo.ui.file(
|
286 |
label="Bのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
|
287 |
)
|
|
|
|
|
|
|
|
|
288 |
author_tpl = r"""
|
289 |
## Category Comparisonモード
|
290 |
|
@@ -300,6 +305,7 @@ def data_settings():
|
|
300 |
### グループB
|
301 |
{label_b}
|
302 |
{files_b}
|
|
|
303 |
"""
|
304 |
category_form = (
|
305 |
mo.md(author_tpl)
|
@@ -309,6 +315,7 @@ def data_settings():
|
|
309 |
files_a=files_a,
|
310 |
label_b=label_b,
|
311 |
files_b=files_b,
|
|
|
312 |
)
|
313 |
.form(show_clear_button=True, bordered=True)
|
314 |
)
|
@@ -340,7 +347,7 @@ def data_settings():
|
|
340 |
}
|
341 |
)
|
342 |
mode_tabs
|
343 |
-
return category_form, mode_tabs, speech_form
|
344 |
|
345 |
|
346 |
@app.cell
|
@@ -350,8 +357,11 @@ def data_check(
|
|
350 |
mode_tabs,
|
351 |
parse_texts,
|
352 |
speech_form,
|
|
|
|
|
353 |
):
|
354 |
-
mo.stop(
|
|
|
355 |
|
356 |
validation_messages: list[str] = []
|
357 |
|
@@ -395,6 +405,16 @@ def data_check(
|
|
395 |
category_a_texts = [Path(default_a).read_text(encoding="utf-8")]
|
396 |
category_a_names = [default_a]
|
397 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
398 |
# Group B: either uploaded files or default
|
399 |
if category_form.value["files_b"]:
|
400 |
category_b_texts = (
|
@@ -407,6 +427,17 @@ def data_check(
|
|
407 |
category_b_texts = [Path(default_b).read_text(encoding="utf-8")]
|
408 |
category_b_names = [default_b]
|
409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
410 |
# infer categories: use UI labels when files uploaded,
|
411 |
# otherwise derive from filename‐stem
|
412 |
# (e.g. "e-r-eddison_..." -> "E R Eddison")
|
@@ -458,17 +489,17 @@ def data_check(
|
|
458 |
{"\n".join(map(lambda x: f"- {x}", validation_messages))}
|
459 |
|
460 |
解析済テキスト一覧:
|
461 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
462 |
""")
|
463 |
return data, data_form
|
464 |
|
465 |
|
466 |
-
@app.cell
|
467 |
-
def _(data):
|
468 |
-
max_tokens = data["text"].map(lambda s: len(s.split())).max()
|
469 |
-
return
|
470 |
-
|
471 |
-
|
472 |
@app.cell
|
473 |
def sampling_controls_setup():
|
474 |
chunk_size = mo.ui.slider(
|
|
|
226 |
chunk_texts,
|
227 |
make_speech_df,
|
228 |
parse_texts,
|
229 |
+
split_speech_text,
|
230 |
train_scikit_cached,
|
231 |
)
|
232 |
|
|
|
286 |
files_b = mo.ui.file(
|
287 |
label="Bのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
|
288 |
)
|
289 |
+
split_speech = mo.ui.switch(
|
290 |
+
label="Split speech vs non-speech segments?",
|
291 |
+
value=True,
|
292 |
+
)
|
293 |
author_tpl = r"""
|
294 |
## Category Comparisonモード
|
295 |
|
|
|
305 |
### グループB
|
306 |
{label_b}
|
307 |
{files_b}
|
308 |
+
{split_speech}
|
309 |
"""
|
310 |
category_form = (
|
311 |
mo.md(author_tpl)
|
|
|
315 |
files_a=files_a,
|
316 |
label_b=label_b,
|
317 |
files_b=files_b,
|
318 |
+
split_speech=split_speech,
|
319 |
)
|
320 |
.form(show_clear_button=True, bordered=True)
|
321 |
)
|
|
|
347 |
}
|
348 |
)
|
349 |
mode_tabs
|
350 |
+
return category_form, mode_tabs, speech_form, split_speech
|
351 |
|
352 |
|
353 |
@app.cell
|
|
|
357 |
mode_tabs,
|
358 |
parse_texts,
|
359 |
speech_form,
|
360 |
+
split_speech,
|
361 |
+
split_speech_text,
|
362 |
):
|
363 |
+
mo.stop(mode_tabs.value == "Speech vs Non-Speech" and speech_form.value is None)
|
364 |
+
mo.stop(mode_tabs.value == "Category Comparison" and category_form.value is None)
|
365 |
|
366 |
validation_messages: list[str] = []
|
367 |
|
|
|
405 |
category_a_texts = [Path(default_a).read_text(encoding="utf-8")]
|
406 |
category_a_names = [default_a]
|
407 |
|
408 |
+
if split_speech.value:
|
409 |
+
texts_list = list(category_a_texts)
|
410 |
+
names_list = list(category_a_names)
|
411 |
+
expanded_txt, expanded_names = [], []
|
412 |
+
for nm, raw in zip(names_list, texts_list):
|
413 |
+
sp, ns = split_speech_text(raw)
|
414 |
+
expanded_txt.extend([sp, ns])
|
415 |
+
expanded_names.extend([f"{nm} (speech)", f"{nm} (non-speech)"])
|
416 |
+
category_a_texts, category_a_names = expanded_txt, expanded_names
|
417 |
+
|
418 |
# Group B: either uploaded files or default
|
419 |
if category_form.value["files_b"]:
|
420 |
category_b_texts = (
|
|
|
427 |
category_b_texts = [Path(default_b).read_text(encoding="utf-8")]
|
428 |
category_b_names = [default_b]
|
429 |
|
430 |
+
# same splitting for B‐side
|
431 |
+
if split_speech.value:
|
432 |
+
texts_list = list(category_b_texts)
|
433 |
+
names_list = list(category_b_names)
|
434 |
+
expanded_txt, expanded_names = [], []
|
435 |
+
for nm, raw in zip(names_list, texts_list):
|
436 |
+
sp, ns = split_speech_text(raw)
|
437 |
+
expanded_txt.extend([sp, ns])
|
438 |
+
expanded_names.extend([f"{nm} (speech)", f"{nm} (non-speech)"])
|
439 |
+
category_b_texts, category_b_names = expanded_txt, expanded_names
|
440 |
+
|
441 |
# infer categories: use UI labels when files uploaded,
|
442 |
# otherwise derive from filename‐stem
|
443 |
# (e.g. "e-r-eddison_..." -> "E R Eddison")
|
|
|
489 |
{"\n".join(map(lambda x: f"- {x}", validation_messages))}
|
490 |
|
491 |
解析済テキスト一覧:
|
492 |
+
{
|
493 |
+
mo.ui.table(
|
494 |
+
data, selection="multi", format_mapping={"text": lambda s: s[:20] + "..."}
|
495 |
+
)
|
496 |
+
if (data is not None and not data.empty)
|
497 |
+
else "No data"
|
498 |
+
}
|
499 |
""")
|
500 |
return data, data_form
|
501 |
|
502 |
|
|
|
|
|
|
|
|
|
|
|
|
|
503 |
@app.cell
|
504 |
def sampling_controls_setup():
|
505 |
chunk_size = mo.ui.slider(
|