Bor Hodošček committed on
Commit
66b97d8
·
unverified ·
1 Parent(s): 1df67da

feat: speech support for category mode; fix: run only when form filled in

Browse files
Files changed (1) hide show
  1. app.py +40 -9
app.py CHANGED
@@ -226,6 +226,7 @@ def function_export():
226
  chunk_texts,
227
  make_speech_df,
228
  parse_texts,
 
229
  train_scikit_cached,
230
  )
231
 
@@ -285,6 +286,10 @@ def data_settings():
285
  files_b = mo.ui.file(
286
  label="Bのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
287
  )
 
 
 
 
288
  author_tpl = r"""
289
  ## Category Comparisonモード
290
 
@@ -300,6 +305,7 @@ def data_settings():
300
  ### グループB
301
  {label_b}
302
  {files_b}
 
303
  """
304
  category_form = (
305
  mo.md(author_tpl)
@@ -309,6 +315,7 @@ def data_settings():
309
  files_a=files_a,
310
  label_b=label_b,
311
  files_b=files_b,
 
312
  )
313
  .form(show_clear_button=True, bordered=True)
314
  )
@@ -340,7 +347,7 @@ def data_settings():
340
  }
341
  )
342
  mode_tabs
343
- return category_form, mode_tabs, speech_form
344
 
345
 
346
  @app.cell
@@ -350,8 +357,11 @@ def data_check(
350
  mode_tabs,
351
  parse_texts,
352
  speech_form,
 
 
353
  ):
354
- mo.stop(category_form.value is None and speech_form.value is None)
 
355
 
356
  validation_messages: list[str] = []
357
 
@@ -395,6 +405,16 @@ def data_check(
395
  category_a_texts = [Path(default_a).read_text(encoding="utf-8")]
396
  category_a_names = [default_a]
397
 
 
 
 
 
 
 
 
 
 
 
398
  # Group B: either uploaded files or default
399
  if category_form.value["files_b"]:
400
  category_b_texts = (
@@ -407,6 +427,17 @@ def data_check(
407
  category_b_texts = [Path(default_b).read_text(encoding="utf-8")]
408
  category_b_names = [default_b]
409
 
 
 
 
 
 
 
 
 
 
 
 
410
  # infer categories: use UI labels when files uploaded,
411
  # otherwise derive from filename‐stem
412
  # (e.g. "e-r-eddison_..." -> "E R Eddison")
@@ -458,17 +489,17 @@ def data_check(
458
  {"\n".join(map(lambda x: f"- {x}", validation_messages))}
459
 
460
  解析済テキスト一覧:
461
- {mo.ui.table(data, selection="multi", format_mapping={"text": lambda s: s[:20] + "..."}) if not data.empty else "No data"}
 
 
 
 
 
 
462
  """)
463
  return data, data_form
464
 
465
 
466
- @app.cell
467
- def _(data):
468
- max_tokens = data["text"].map(lambda s: len(s.split())).max()
469
- return
470
-
471
-
472
  @app.cell
473
  def sampling_controls_setup():
474
  chunk_size = mo.ui.slider(
 
226
  chunk_texts,
227
  make_speech_df,
228
  parse_texts,
229
+ split_speech_text,
230
  train_scikit_cached,
231
  )
232
 
 
286
  files_b = mo.ui.file(
287
  label="Bのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
288
  )
289
+ split_speech = mo.ui.switch(
290
+ label="Split speech vs non-speech segments?",
291
+ value=True,
292
+ )
293
  author_tpl = r"""
294
  ## Category Comparisonモード
295
 
 
305
  ### グループB
306
  {label_b}
307
  {files_b}
308
+ {split_speech}
309
  """
310
  category_form = (
311
  mo.md(author_tpl)
 
315
  files_a=files_a,
316
  label_b=label_b,
317
  files_b=files_b,
318
+ split_speech=split_speech,
319
  )
320
  .form(show_clear_button=True, bordered=True)
321
  )
 
347
  }
348
  )
349
  mode_tabs
350
+ return category_form, mode_tabs, speech_form, split_speech
351
 
352
 
353
  @app.cell
 
357
  mode_tabs,
358
  parse_texts,
359
  speech_form,
360
+ split_speech,
361
+ split_speech_text,
362
  ):
363
+ mo.stop(mode_tabs.value == "Speech vs Non-Speech" and speech_form.value is None)
364
+ mo.stop(mode_tabs.value == "Category Comparison" and category_form.value is None)
365
 
366
  validation_messages: list[str] = []
367
 
 
405
  category_a_texts = [Path(default_a).read_text(encoding="utf-8")]
406
  category_a_names = [default_a]
407
 
408
+ if split_speech.value:
409
+ texts_list = list(category_a_texts)
410
+ names_list = list(category_a_names)
411
+ expanded_txt, expanded_names = [], []
412
+ for nm, raw in zip(names_list, texts_list):
413
+ sp, ns = split_speech_text(raw)
414
+ expanded_txt.extend([sp, ns])
415
+ expanded_names.extend([f"{nm} (speech)", f"{nm} (non-speech)"])
416
+ category_a_texts, category_a_names = expanded_txt, expanded_names
417
+
418
  # Group B: either uploaded files or default
419
  if category_form.value["files_b"]:
420
  category_b_texts = (
 
427
  category_b_texts = [Path(default_b).read_text(encoding="utf-8")]
428
  category_b_names = [default_b]
429
 
430
+ # same splitting for B‐side
431
+ if split_speech.value:
432
+ texts_list = list(category_b_texts)
433
+ names_list = list(category_b_names)
434
+ expanded_txt, expanded_names = [], []
435
+ for nm, raw in zip(names_list, texts_list):
436
+ sp, ns = split_speech_text(raw)
437
+ expanded_txt.extend([sp, ns])
438
+ expanded_names.extend([f"{nm} (speech)", f"{nm} (non-speech)"])
439
+ category_b_texts, category_b_names = expanded_txt, expanded_names
440
+
441
  # infer categories: use UI labels when files uploaded,
442
  # otherwise derive from filename‐stem
443
  # (e.g. "e-r-eddison_..." -> "E R Eddison")
 
489
  {"\n".join(map(lambda x: f"- {x}", validation_messages))}
490
 
491
  解析済テキスト一覧:
492
+ {
493
+ mo.ui.table(
494
+ data, selection="multi", format_mapping={"text": lambda s: s[:20] + "..."}
495
+ )
496
+ if (data is not None and not data.empty)
497
+ else "No data"
498
+ }
499
  """)
500
  return data, data_form
501
 
502
 
 
 
 
 
 
 
503
  @app.cell
504
  def sampling_controls_setup():
505
  chunk_size = mo.ui.slider(