Spaces:

bor
/

aozora-bunko-preprocessor

Sleeping

App Files Community

Bor Hodošček committed on 29 days ago

Commit

3d692b2

unverified ·

1 Parent(s): 63c91a0

fix: diff rendering and stop criterion; docs

Browse files

Files changed (2) hide show

app.py +26 -17
development.md +6 -0

app.py CHANGED Viewed

@@ -57,19 +57,21 @@ def _(mo):
     8. トークンマッチ結果を可視化する。
     9. 係り受け（依存）関係マッチング用パターンを定義する（アプリの場合は編集不可）。
     10. 係り受け関係マッチ結果を可視化する。
-    {
-            mo.callout('''
-            By default, this demo uses Natsume Soseki's _‘Wagahai wa neko de aru’_
-            ファイルをアップロードしない場合は、デフォルトで夏目漱石『吾輩は猫である』が使用されます。
-            ''')
-        }
     """
     )
     return
 @app.cell
 def _():
     import re
@@ -426,9 +428,9 @@ def _():
                     )
             # equal → skip entirely (we want only changes)
-        rendered = HTML(f'<div style="{WRAP_STYLE}">{"".join(html_chunks)}</div>')
         if auto_display:
-            display(rendered)
         return rendered
     return (diff_changes,)
@@ -453,13 +455,11 @@ def compare_preprocessed_vs_old(
     Compare our cleaned text against the original Aozora‐processed text.
     """
-    diff_result = None
-    if run_diff.value:
-        # run the expensive diff only when checked
-        diff_result = diff_changes(
-            cleaned_text, aozora_xhtml_processed_text, auto_display=False
-        )
     mo.md(f"""
     -   赤: 正規表現版のみにある文字列
@@ -491,7 +491,16 @@ def _(mo):
 @app.cell
-def process_aozora_text(Doc, cleaned_text, mo, nlp, re):
     """
     Turn each paragraph into one Doc.  If any paragraph > MAX_BYTES,
     fall back to sentence‐splitting, then raw‐byte‐splitting, and only

     8. トークンマッチ結果を可視化する。
     9. 係り受け（依存）関係マッチング用パターンを定義する（アプリの場合は編集不可）。
     10. 係り受け関係マッチ結果を可視化する。
     """
     )
     return
+@app.cell
+def _(mo):
+    mo.md('''
+    -   By default, this demo uses Natsume Soseki's _‘Wagahai wa neko de aru’_
+    -   ファイルをアップロードしない場合は、デフォルトで夏目漱石『吾輩は猫である』が使用されます。
+    ''').callout(kind="info")
+    return
 @app.cell
 def _():
     import re
                     )
             # equal → skip entirely (we want only changes)
+        rendered = f'<div style="{WRAP_STYLE}">{"".join(html_chunks)}</div>'
         if auto_display:
+            display(HTML(rendered))
         return rendered
     return (diff_changes,)
     Compare our cleaned text against the original Aozora‐processed text.
     """
+    mo.stop(not run_diff.value)
+    diff_result = diff_changes(
+        cleaned_text, aozora_xhtml_processed_text, auto_display=False
+    )
     mo.md(f"""
     -   赤: 正規表現版のみにある文字列
 @app.cell
+def _(mo):
+    run_spacy = mo.ui.switch(label="spaCyで解析する", value=False)
+    run_spacy
+    return (run_spacy,)
+@app.cell
+def process_aozora_text(Doc, cleaned_text, mo, nlp, re, run_spacy):
+    mo.stop(not run_spacy.value)
     """
     Turn each paragraph into one Doc.  If any paragraph > MAX_BYTES,
     fall back to sentence‐splitting, then raw‐byte‐splitting, and only

development.md CHANGED Viewed

@@ -6,3 +6,9 @@
 docker build -t marimo-app .
 docker run -it --rm -p 7860:7860 marimo-app
 ```

 docker build -t marimo-app .
 docker run -it --rm -p 7860:7860 marimo-app
 ```
+## Nix setup
+```bash
+nix-shell -p libxml2 -p libxslt -p libz
+```