Bor Hodošček committed on
Commit
8cef38d
·
unverified ·
1 Parent(s): 257858f

feat: initial commit of working demo

Browse files
Files changed (7) hide show
  1. 789_14547.html +0 -0
  2. Dockerfile +11 -9
  3. app.py +637 -353
  4. pyproject.toml +23 -0
  5. requirements.txt +0 -5
  6. uv.lock +0 -0
  7. wagahaiwa_nekodearu.txt +0 -0
789_14547.html ADDED
The diff for this file is too large to render. See raw diff
 
Dockerfile CHANGED
@@ -1,5 +1,5 @@
1
- FROM python:3.12
2
- COPY --from=ghcr.io/astral-sh/uv:0.4.20 /uv /bin/uv
3
 
4
  RUN useradd -m -u 1000 user
5
  ENV PATH="/home/user/.local/bin:$PATH"
@@ -7,13 +7,15 @@ ENV UV_SYSTEM_PYTHON=1
7
 
8
  WORKDIR /app
9
 
10
- COPY --chown=user ./requirements.txt requirements.txt
11
- RUN uv pip install -r requirements.txt
 
 
 
 
12
 
13
- COPY --chown=user . /app
14
- RUN mkdir -p /app/__marimo__ && \
15
- chown -R user:user /app && \
16
- chmod -R 755 /app
17
  USER user
18
 
19
- CMD ["marimo", "run", "app.py", "--include-code", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
1
+ FROM python:3.12-slim
2
+ COPY --from=ghcr.io/astral-sh/uv:0.7.12 /uv /bin/uv
3
 
4
  RUN useradd -m -u 1000 user
5
  ENV PATH="/home/user/.local/bin:$PATH"
 
7
 
8
  WORKDIR /app
9
 
10
+ RUN apt update && apt install -y git pkg-config libxml2-dev libxslt-dev libz-dev gcc
11
+ RUN mkdir -p /app && chown -R user:user /app
12
+
13
+ COPY --chown=user ./pyproject.toml ./uv.lock ./app.py ./789_14547.html ./wagahaiwa_nekodearu.txt /app
14
+
15
+ RUN chmod -R u+w /app
16
 
 
 
 
 
17
  USER user
18
 
19
+ RUN uv sync
20
+
21
+ CMD ["uv", "run", "marimo", "run", "app.py", "--no-sandbox", "--include-code", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,469 +1,753 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import marimo
2
 
3
- __generated_with = "0.9.2"
4
  app = marimo.App()
5
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  @app.cell
8
- def __():
 
9
  import marimo as mo
 
 
 
10
 
11
- mo.md("# Welcome to marimo! 🌊🍃")
12
- return (mo,)
 
 
13
 
14
 
15
  @app.cell
16
- def __(mo):
17
- slider = mo.ui.slider(1, 22)
18
- return (slider,)
 
 
 
 
19
 
20
 
21
  @app.cell
22
- def __(mo, slider):
23
- mo.md(
24
- f"""
25
- marimo is a **reactive** Python notebook.
26
-
27
- This means that unlike traditional notebooks, marimo notebooks **run
28
- automatically** when you modify them or
29
- interact with UI elements, like this slider: {slider}.
30
-
31
- {"##" + "🍃" * slider.value}
32
- """
33
  )
34
- return
35
 
36
 
37
- @app.cell(hide_code=True)
38
- def __(mo):
39
- mo.accordion(
40
- {
41
- "Tip: disabling automatic execution": mo.md(
42
- rf"""
43
- marimo lets you disable automatic execution: just go into the
44
- notebook settings and set
45
-
46
- "Runtime > On Cell Change" to "lazy".
47
-
48
- When the runtime is lazy, after running a cell, marimo marks its
49
- descendants as stale instead of automatically running them. The
50
- lazy runtime puts you in control over when cells are run, while
51
- still giving guarantees about the notebook state.
52
- """
53
- )
54
- }
55
- )
56
  return
57
 
58
 
59
- @app.cell(hide_code=True)
60
- def __(mo):
61
- mo.md(
62
- """
63
- Tip: This is a tutorial notebook. You can create your own notebooks
64
- by entering `marimo edit` at the command line.
65
- """
66
- ).callout()
67
- return
 
 
 
 
 
68
 
69
 
70
- @app.cell(hide_code=True)
71
- def __(mo):
72
  mo.md(
73
- """
74
- ## 1. Reactive execution
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- A marimo notebook is made up of small blocks of Python code called
77
- cells.
78
 
79
- marimo reads your cells and models the dependencies among them: whenever
80
- a cell that defines a global variable is run, marimo
81
- **automatically runs** all cells that reference that variable.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
- Reactivity keeps your program state and outputs in sync with your code,
84
- making for a dynamic programming environment that prevents bugs before they
85
- happen.
86
- """
 
 
 
 
87
  )
88
- return
89
 
 
90
 
91
- @app.cell(hide_code=True)
92
- def __(changed, mo):
93
- (
94
- mo.md(
95
- f"""
96
- **✨ Nice!** The value of `changed` is now {changed}.
97
-
98
- When you updated the value of the variable `changed`, marimo
99
- **reacted** by running this cell automatically, because this cell
100
- references the global variable `changed`.
101
-
102
- Reactivity ensures that your notebook state is always
103
- consistent, which is crucial for doing good science; it's also what
104
- enables marimo notebooks to double as tools and apps.
105
- """
106
- )
107
- if changed
108
- else mo.md(
109
- """
110
- **🌊 See it in action.** In the next cell, change the value of the
111
- variable `changed` to `True`, then click the run button.
112
- """
113
- )
114
  )
115
- return
116
 
117
 
118
  @app.cell
119
- def __():
120
- changed = False
121
- return (changed,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
 
124
- @app.cell(hide_code=True)
125
- def __(mo):
126
- mo.accordion(
127
- {
128
- "Tip: execution order": (
129
- """
130
- The order of cells on the page has no bearing on
131
- the order in which cells are executed: marimo knows that a cell
132
- reading a variable must run after the cell that defines it. This
133
- frees you to organize your code in the way that makes the most
134
- sense for you.
135
- """
136
- )
137
- }
138
  )
 
 
 
 
139
  return
140
 
141
 
142
- @app.cell(hide_code=True)
143
- def __(mo):
144
- mo.md(
145
- """
146
- **Global names must be unique.** To enable reactivity, marimo imposes a
147
- constraint on how names appear in cells: no two cells may define the same
148
- variable.
149
- """
150
  )
151
- return
152
 
153
-
154
- @app.cell(hide_code=True)
155
- def __(mo):
156
- mo.accordion(
157
- {
158
- "Tip: encapsulation": (
159
- """
160
- By encapsulating logic in functions, classes, or Python modules,
161
- you can minimize the number of global variables in your notebook.
162
- """
163
- )
164
- }
165
  )
166
- return
167
 
 
 
168
 
169
- @app.cell(hide_code=True)
170
- def __(mo):
171
- mo.accordion(
172
- {
173
- "Tip: private variables": (
174
- """
175
- Variables prefixed with an underscore are "private" to a cell, so
176
- they can be defined by multiple cells.
177
- """
178
- )
179
- }
 
 
 
 
 
 
 
 
 
 
 
180
  )
181
- return
182
 
 
183
 
184
- @app.cell(hide_code=True)
185
- def __(mo):
186
- mo.md(
187
- """
188
- ## 2. UI elements
189
 
190
- Cells can output interactive UI elements. Interacting with a UI
191
- element **automatically triggers notebook execution**: when
192
- you interact with a UI element, its value is sent back to Python, and
193
- every cell that references that element is re-run.
194
 
195
- marimo provides a library of UI elements to choose from under
196
- `marimo.ui`.
197
- """
198
- )
199
- return
 
 
200
 
201
 
202
  @app.cell
203
- def __(mo):
204
- mo.md("""**🌊 Some UI elements.** Try interacting with the below elements.""")
 
 
 
 
 
 
 
 
205
  return
206
 
207
 
208
  @app.cell
209
- def __(mo):
210
- icon = mo.ui.dropdown(["🍃", "🌊", "✨"], value="🍃")
211
- return (icon,)
212
-
 
 
 
 
 
 
 
 
 
 
 
213
 
214
- @app.cell
215
- def __(icon, mo):
216
- repetitions = mo.ui.slider(1, 16, label=f"number of {icon.value}: ")
217
- return (repetitions,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
 
220
  @app.cell
221
- def __(icon, repetitions):
222
- icon, repetitions
223
- return
 
224
 
225
 
226
  @app.cell
227
- def __(icon, mo, repetitions):
228
- mo.md("# " + icon.value * repetitions.value)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  return
230
 
231
 
232
- @app.cell(hide_code=True)
233
- def __(mo):
234
  mo.md(
235
- """
236
- ## 3. marimo is just Python
237
 
238
- marimo cells parse Python (and only Python), and marimo notebooks are
239
- stored as pure Python files — outputs are _not_ included. There's no
240
- magical syntax.
241
 
242
- The Python files generated by marimo are:
 
243
 
244
- - easily versioned with git, yielding minimal diffs
245
- - legible for both humans and machines
246
- - formattable using your tool of choice,
247
- - usable as Python scripts, with UI elements taking their default
248
- values, and
249
- - importable by other modules (more on that in the future).
250
- """
251
  )
252
  return
253
 
254
 
255
- @app.cell(hide_code=True)
256
- def __(mo):
257
- mo.md(
258
- """
259
- ## 4. Running notebooks as apps
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
- marimo notebooks can double as apps. Click the app window icon in the
262
- bottom-right to see this notebook in "app view."
263
 
264
- Serve a notebook as an app with `marimo run` at the command-line.
265
- Of course, you can use marimo just to level-up your
266
- notebooking, without ever making apps.
267
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  )
 
 
 
 
 
 
 
 
 
269
  return
270
 
271
 
272
  @app.cell(hide_code=True)
273
- def __(mo):
274
  mo.md(
275
  """
276
- ## 5. The `marimo` command-line tool
277
 
278
- **Creating and editing notebooks.** Use
279
 
280
- ```
281
- marimo edit
282
- ```
283
-
284
- in a terminal to start the marimo notebook server. From here
285
- you can create a new notebook or edit existing ones.
286
-
287
-
288
- **Running as apps.** Use
289
-
290
- ```
291
- marimo run notebook.py
292
- ```
293
-
294
- to start a webserver that serves your notebook as an app in read-only mode,
295
- with code cells hidden.
296
 
297
- **Convert a Jupyter notebook.** Convert a Jupyter notebook to a marimo
298
- notebook using `marimo convert`:
299
 
300
- ```
301
- marimo convert your_notebook.ipynb > your_app.py
302
- ```
303
 
304
- **Tutorials.** marimo comes packaged with tutorials:
 
 
 
 
305
 
306
- - `dataflow`: more on marimo's automatic execution
307
- - `ui`: how to use UI elements
308
- - `markdown`: how to write markdown, with interpolated values and
309
- LaTeX
310
- - `plots`: how plotting works in marimo
311
- - `sql`: how to use SQL
312
- - `layout`: layout elements in marimo
313
- - `fileformat`: how marimo's file format works
314
- - `markdown-format`: for using `.md` files in marimo
315
- - `for-jupyter-users`: if you are coming from Jupyter
316
 
317
- Start a tutorial with `marimo tutorial`; for example,
318
 
319
- ```
320
- marimo tutorial dataflow
321
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
- In addition to tutorials, we have examples in our
324
- [our GitHub repo](https://www.github.com/marimo-team/marimo/tree/main/examples).
325
- """
 
 
 
 
326
  )
 
 
 
327
  return
328
 
329
 
330
  @app.cell(hide_code=True)
331
- def __(mo):
332
  mo.md(
333
  """
334
- ## 6. The marimo editor
335
 
336
- Here are some tips to help you get started with the marimo editor.
337
- """
 
 
338
  )
339
  return
340
 
341
 
342
  @app.cell
343
- def __(mo, tips):
344
- mo.accordion(tips)
345
- return
346
 
 
347
 
348
- @app.cell(hide_code=True)
349
- def __(mo):
350
- mo.md("""## Finally, a fun fact""")
351
- return
 
 
 
 
 
352
 
 
 
353
 
354
- @app.cell(hide_code=True)
355
- def __(mo):
356
- mo.md(
357
- """
358
- The name "marimo" is a reference to a type of algae that, under
359
- the right conditions, clumps together to form a small sphere
360
- called a "marimo moss ball". Made of just strands of algae, these
361
- beloved assemblages are greater than the sum of their parts.
362
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  )
364
  return
365
 
366
 
367
- @app.cell(hide_code=True)
368
- def __():
369
- tips = {
370
- "Saving": (
371
- """
372
- **Saving**
373
-
374
- - _Name_ your app using the box at the top of the screen, or
375
- with `Ctrl/Cmd+s`. You can also create a named app at the
376
- command line, e.g., `marimo edit app_name.py`.
377
-
378
- - _Save_ by clicking the save icon on the bottom right, or by
379
- inputting `Ctrl/Cmd+s`. By default marimo is configured
380
- to autosave.
381
- """
382
- ),
383
- "Running": (
384
- """
385
- 1. _Run a cell_ by clicking the play ( ▷ ) button on the top
386
- right of a cell, or by inputting `Ctrl/Cmd+Enter`.
387
-
388
- 2. _Run a stale cell_ by clicking the yellow run button on the
389
- right of the cell, or by inputting `Ctrl/Cmd+Enter`. A cell is
390
- stale when its code has been modified but not run.
391
-
392
- 3. _Run all stale cells_ by clicking the play ( ▷ ) button on
393
- the bottom right of the screen, or input `Ctrl/Cmd+Shift+r`.
394
- """
395
- ),
396
- "Console Output": (
397
- """
398
- Console output (e.g., `print()` statements) is shown below a
399
- cell.
400
- """
401
- ),
402
- "Creating, Moving, and Deleting Cells": (
403
- """
404
- 1. _Create_ a new cell above or below a given one by clicking
405
- the plus button to the left of the cell, which appears on
406
- mouse hover.
407
-
408
- 2. _Move_ a cell up or down by dragging on the handle to the
409
- right of the cell, which appears on mouse hover.
410
-
411
- 3. _Delete_ a cell by clicking the trash bin icon. Bring it
412
- back by clicking the undo button on the bottom right of the
413
- screen, or with `Ctrl/Cmd+Shift+z`.
414
- """
415
- ),
416
- "Disabling Automatic Execution": (
417
- """
418
- Via the notebook settings (gear icon) or footer panel, you
419
- can disable automatic execution. This is helpful when
420
- working with expensive notebooks or notebooks that have
421
- side-effects like database transactions.
422
- """
423
- ),
424
- "Disabling Cells": (
425
- """
426
- You can disable a cell via the cell context menu.
427
- marimo will never run a disabled cell or any cells that depend on it.
428
- This can help prevent accidental execution of expensive computations
429
- when editing a notebook.
430
- """
431
- ),
432
- "Code Folding": (
433
- """
434
- You can collapse or fold the code in a cell by clicking the arrow
435
- icons in the line number column to the left, or by using keyboard
436
- shortcuts.
437
-
438
- Use the command palette (`Ctrl/Cmd+k`) or a keyboard shortcut to
439
- quickly fold or unfold all cells.
440
- """
441
- ),
442
- "Code Formatting": (
443
- """
444
- If you have [ruff](https://github.com/astral-sh/ruff) installed,
445
- you can format a cell with the keyboard shortcut `Ctrl/Cmd+b`.
446
- """
447
- ),
448
- "Command Palette": (
449
- """
450
- Use `Ctrl/Cmd+k` to open the command palette.
451
- """
452
- ),
453
- "Keyboard Shortcuts": (
454
- """
455
- Open the notebook menu (top-right) or input `Ctrl/Cmd+Shift+h` to
456
- view a list of all keyboard shortcuts.
457
- """
458
- ),
459
- "Configuration": (
460
- """
461
- Configure the editor by clicking the gears icon near the top-right
462
- of the screen.
463
- """
464
- ),
465
- }
466
- return (tips,)
467
 
468
 
469
  if __name__ == "__main__":
 
1
+ # /// script
2
+ # requires-python = ">=3.12"
3
+ # dependencies = [
4
+ # "aozora-corpus-generator==0.1.1",
5
+ # "cdifflib==1.2.9",
6
+ # "ginza",
7
+ # "ja-ginza",
8
+ # "ipython==7.23.1",
9
+ # "marimo",
10
+ # "polars==1.30.0",
11
+ # "spacy==3.8.7",
12
+ # "wcwidth",
13
+ # ]
14
+ #
15
+ # [tool.uv.sources]
16
+ # aozora-corpus-generator = { git = "https://github.com/borh/aozora-corpus-generator.git" }
17
+ # ///
18
+
19
  import marimo
20
 
21
+ __generated_with = "0.13.15"
22
  app = marimo.App()
23
 
24
 
25
@app.cell(hide_code=True)
def _(mo):
    # Landing text for the app: the pipeline steps in English and Japanese,
    # followed by a callout naming the default sample work.
    # The English list is numbered 1-10 so that it lines up item-for-item
    # with the Japanese list below it.
    mo.md(
        rf"""
    # Aozora Bunko Text Processing Pipeline Demo

    ### Summary

    1. Upload a text file from Aozora Bunko (or use the default sample).
    2. Preprocess using customizable regex patterns.
    3. Preview the first and last 50 lines of the cleaned text.
    4. Download the cleaned text.
    5. Process the XHTML version with a Python library.
    6. Compare against the regex variant.
    7. Define token matching patterns.
    8. Visualize token matches.
    9. Define dependency matching patterns.
    10. Visualize dependency matches.

    ### 概要

    1. 青空文庫のテキストファイルをアップロードする(またはデフォルトサンプルを利用する)。
    2. 編集可能な正規表現で前処理する。
    3. 前処理済みテキストの先頭50行と末尾50行をプレビューする。
    4. 前処理済みテキストをダウンロードする。
    5. XHTML版をPythonのパッケージで処理する。
    6. 正規表現処理版と比較する。
    7. トークンマッチング用パターンを定義する。
    8. トークンマッチ結果を可視化する。
    9. 係り受け(依存)関係マッチング用パターンを定義する。
    10. 係り受け関係マッチ結果を可視化する。

    {mo.callout("By default, this demo uses Natsume Soseki's _‘Wagahai wa neko de aru’_")}
    """
    )
    return
61
+
62
+
63
@app.cell
def _():
    # Imports shared with the downstream cells, plus the spaCy pipeline
    # object that every analysis cell receives as `nlp`.
    import re

    import marimo as mo
    import polars as pl
    import spacy
    from spacy.tokens import Doc

    # "ja_ginza" is the model loaded here; "ja_ginza_electra" or
    # "ja_ginza_bert_large" can be substituted if installed.
    nlp = spacy.load("ja_ginza")
    return Doc, mo, nlp, pl, re, spacy
75
 
76
 
77
@app.cell
def upload_aozora_text(mo):
    """
    Create the single-file upload widget for an Aozora Bunko plain-text file.

    This cell only builds the widget; reading the upload (or the bundled
    sample when nothing is uploaded) happens in `load_aozora_text`.
    """
    aozora_file = mo.ui.file(
        multiple=False,
        label="Upload Aozora-Bunko text (.txt)",
    )
    return (aozora_file,)
85
 
86
 
87
@app.cell
def select_encoding(mo):
    """
    Build the dropdown that selects the encoding of the text file.

    Two choices are offered, "shift-jis" (preselected) and "utf-8".
    """
    encoding = mo.ui.dropdown(
        label="Text file encoding",
        value="shift-jis",
        options=["shift-jis", "utf-8"],
        full_width=False,
    )
    return (encoding,)
99
 
100
 
101
@app.cell
def _(aozora_file, encoding, mo):
    # Lay the upload widget and the encoding selector out in one row and
    # embed that row under the section heading.
    ab_upload_ui = mo.hstack([aozora_file, encoding])
    _heading = "## 青空文庫テキストファイル設定"
    mo.md(f"{_heading}\n{ab_upload_ui}")
    return
106
 
107
 
108
@app.cell
def load_aozora_text(aozora_file, encoding):
    """
    Produce `text_raw`, the full text of the work before any cleaning.

    An uploaded file is decoded with the encoding chosen in the dropdown.
    When nothing has been uploaded, the bundled wagahaiwa_nekodearu.txt is
    read instead, always as Shift-JIS.
    """
    if not aozora_file.value:
        with open("wagahaiwa_nekodearu.txt", encoding="shift-jis") as f:
            text_raw = f.read()
    else:
        text_raw = aozora_file.contents().decode(encoding.value)
    return (text_raw,)
122
 
123
 
124
@app.cell
def show_raw_head(mo, text_raw):
    # Preview both ends of the uncleaned text (500 characters each) so the
    # Aozora header and footer are visible before the regex step.
    _first_500 = text_raw[:500]
    _last_500 = text_raw[-500:]
    mo.md(
        f"""
    ## 青空文庫のヘッダーとフッターを確認

    最初の500字
    ```raw
    {_first_500}
    ```

    最後の500字
    ```raw
    {_last_500}
    ```
    """
    )
    return
142
 
 
 
143
 
144
@app.cell
def regex_inputs(mo):
    """
    Editable text fields holding the five regexes used by the cleaning step.

    The defaults target Aozora Bunko markup, which is written with
    full-width symbols: ruby readings in 《…》, the ruby-base marker |,
    editor notes in [#…], and the colophon line starting with 底本:.
    ASCII look-alikes would not match that markup (an ASCII `|` pattern in
    particular only ever matches the empty string), so the defaults use the
    full-width characters.
    """
    ruby_pattern = mo.ui.text(
        value=r"《[^》]+》",
        label="Ruby‐annotation regex",
        full_width=True,
    )
    ruby_bar_pattern = mo.ui.text(
        # Full-width vertical bar (U+FF5C) marking where a ruby base starts.
        value=r"|",
        label="Ruby‐bar regex",
        full_width=True,
    )
    annotation_pattern = mo.ui.text(
        # Full-width [# … ] editor note, matched lazily up to the first ].
        value=r"[#[^]]+?]",
        label="Inline‐annotation regex",
        full_width=True,
    )
    hajime_pattern = mo.ui.text(
        # Everything between the two 55-hyphen rules of the header block.
        value=r"-{55}(.|\n)+?-{55}",
        label="Start‐marker regex",
        full_width=True,
    )
    owari_pattern = mo.ui.text(
        # First line of the trailing matter. The optional leading character
        # may be an ASCII or ideographic space or 【; the colon after 底本
        # may be ASCII or full-width.
        value=(
            r"^[ \u3000【]?(底本[::]|訳者あとがき|この翻訳は|この作品.*翻訳|"
            r"この翻訳.*全訳)"
        ),
        label="End‐marker regex",
        full_width=True,
    )

    regexes = mo.vstack(
        [
            ruby_pattern,
            ruby_bar_pattern,
            annotation_pattern,
            hajime_pattern,
            owari_pattern,
        ]
    )

    mo.md(f"""## 正規表現による前処理

    (必要な場合は修正)

    {regexes}
    """)
    return (
        annotation_pattern,
        hajime_pattern,
        owari_pattern,
        ruby_bar_pattern,
        ruby_pattern,
    )
 
198
 
199
 
200
@app.cell
def clean_aozora(
    annotation_pattern,
    hajime_pattern,
    mo,
    owari_pattern,
    re,
    ruby_bar_pattern,
    ruby_pattern,
    text_raw,
):
    # Compile the user-editable patterns once per run of this cell.
    ruby_rx = re.compile(ruby_pattern.value)
    ruby_bar_rx = re.compile(ruby_bar_pattern.value)
    annotation_rx = re.compile(annotation_pattern.value)
    hajime_rx = re.compile(hajime_pattern.value)
    # re.M lets the end-marker pattern's ^ anchor at the start of any line.
    owari_rx = re.compile(owari_pattern.value, re.M)

    def clean_text(text: str) -> tuple[str, str, str]:
        """Clean an Aozora Bunko formatted string.

        The first two lines are taken as title and author. The remainder has
        its line endings unified, ruby and inline annotations removed, the
        header block removed, and everything from the end marker onwards
        dropped.

        Returns:
            A `(title, author, body)` tuple; `body` is stripped of leading
            and trailing whitespace.
        """
        title, author, text = (text.split("\n", 2) + ["", ""])[:3]
        # An uploaded file is decoded from bytes and may still use \r\n line
        # endings, so the first two lines can carry a trailing "\r" that
        # would otherwise end up in the download filename.
        title, author = title.strip(), author.strip()

        # Aozora files use DOS line endings (\r\n); unify them to UNIX \n.
        cleaned_text = re.sub(r"(\r\n)+", "\n", text)
        # Remove ruby readings: the 《...》 marks together with their content.
        cleaned_text = ruby_rx.sub("", cleaned_text)
        # Remove the ruby-base marker used in the alternative ruby notation.
        cleaned_text = ruby_bar_rx.sub("", cleaned_text)
        # Remove inline editor annotations.
        cleaned_text = annotation_rx.sub("", cleaned_text)
        # Remove the header block that precedes the main text.
        cleaned_text = hajime_rx.sub("", cleaned_text)
        # Drop everything from the end marker onwards, when one is found.
        maybe_owari = owari_rx.search(cleaned_text)
        if maybe_owari:
            cleaned_text = cleaned_text[: maybe_owari.start()]

        return (title, author, cleaned_text.strip())

    title, author, cleaned_text = clean_text(text_raw)

    mo.md(f"""### 前処理結果の確認

    - 著者:`{author}`
    - タイトル:`{title}`

    最初の100字
    ```raw
    {cleaned_text[:100]}
    ```

    最後の100字
    ```raw
    {cleaned_text[-100:]}
    ```
    """)
    return author, cleaned_text, title
259
 
260
 
261
@app.cell
def download_cleaned_text(author, cleaned_text, mo, title):
    """
    Offer the regex-cleaned text as a UTF-8 encoded .txt download whose
    filename is built from the author and the title.
    """
    _payload = cleaned_text.encode("utf-8")
    _filename = f"{author}_{title}.txt"
    download_link = mo.download(
        data=_payload,
        filename=_filename,
        mimetype="text/plain",
    )
    mo.md(f"""
    前処理済みファイルのダウンロード:
    {download_link}
    """)
    return
276
 
277
 
278
@app.cell
def get_alternative_file(mo):
    # Widgets for the optional (X)HTML comparison: a single-file upload and
    # an encoding selector offering "shift-jis" (preselected) and "utf-8".
    xhtml_encoding = mo.ui.dropdown(
        label="Text file encoding",
        value="shift-jis",
        options=["shift-jis", "utf-8"],
        full_width=False,
    )

    aozora_xhtml_file = mo.ui.file(
        multiple=False,
        label="Upload Aozora-Bunko text (.html)",
    )

    mo.md(f"""
    ## HTMLを使用した前処理との比較(オプショナル)

    プレインテキスト版を正規表現で前処理した結果を、(X)HTML版をPythonで処理した結果を比較したい場合は同じ作品のHTMLファイルをアップします。

    {aozora_xhtml_file}
    {xhtml_encoding}
    """)
    return aozora_xhtml_file, xhtml_encoding
300
+
301
+
302
@app.cell
def show_natsume_head(aozora_xhtml_file, mo, xhtml_encoding):
    from aozora_corpus_generator.aozora import parse_aozora_bunko_xml_content

    # Pick the XHTML source: the uploaded file when present, otherwise the
    # bundled 789_14547.html.
    # NOTE(review): the upload branch passes the parser a decoded str while
    # the bundled-file branch passes raw bytes - confirm the parser is meant
    # to receive both.
    if not aozora_xhtml_file.value:
        with open("789_14547.html", "rb") as xhtml_f:
            xhtml_raw = xhtml_f.read()
    else:
        xhtml_raw = aozora_xhtml_file.contents().decode(xhtml_encoding.value)

    # Only the "text" entry of the parser result is used downstream.
    aozora_xhtml_processed = parse_aozora_bunko_xml_content(
        xhtml_raw,
        do_tokenize=False,
    )
    aozora_xhtml_processed_text = aozora_xhtml_processed["text"]

    mo.md(f"""
    HTML版の最初の200字

    ```raw
    {aozora_xhtml_processed_text[:200]}
    ```

    HTML版の最後の200字

    ```raw
    {aozora_xhtml_processed_text[-200:]}
    ```
    """)
    return (aozora_xhtml_processed_text,)
334
 
335
 
336
@app.cell
def _(aozora_xhtml_processed_text, author, mo, title):
    # Download button for the text extracted from the (X)HTML version; the
    # "_xhtml" suffix distinguishes it from the regex-cleaned download.
    _payload = aozora_xhtml_processed_text.encode("utf-8")
    _filename = f"{author}_{title}_xhtml.txt"
    xhtml_download_link = mo.download(
        data=_payload,
        filename=_filename,
        mimetype="text/plain",
    )
    mo.md(f"""
    HTML版の前処理済みファイルをダウンロード:
    {xhtml_download_link}
    """)
    return
348
 
349
 
350
@app.cell
def _():
    import difflib
    import html
    from cdifflib import CSequenceMatcher
    from IPython.display import HTML
    from IPython.display import display_html as display

    # Swap difflib's matcher for the cdifflib implementation; every
    # difflib.SequenceMatcher(...) call below therefore uses it.
    difflib.SequenceMatcher = CSequenceMatcher

    DEL_STYLE = "background-color:#f6c6c6;color:#000;"  # red bg, black text
    INS_STYLE = "background-color:#c6f6c6;color:#000;"  # green bg, black text
    WRAP_STYLE = (
        "font-family: ui-monospace, monospace; "
        "white-space: pre-wrap; line-height:1.4; color:#000;"
    )

    # Visible stand-ins for whitespace so that whitespace-only changes show.
    WS_MAP = str.maketrans({" ": "␣", "\t": "⇥", "\n": "↩\n"})

    def _escape(txt: str) -> str:
        """Make whitespace visible, then HTML-escape the result."""
        return html.escape(txt.translate(WS_MAP))

    def _span(style: str, txt: str) -> str:
        """Wrap the escaped text in a span carrying the given inline style."""
        return f'<span style="{style}">{_escape(txt)}</span>'

    def _char_changes(a: str, b: str) -> str:
        """Return HTML for *only* the characters that differ between a and b."""
        sm = difflib.SequenceMatcher(None, a, b, autojunk=False)
        pieces = []
        for tag, i1, i2, j1, j2 in sm.get_opcodes():
            # "replace" emits the deleted text followed by the inserted text;
            # "equal" emits nothing.
            if tag in ("delete", "replace"):
                pieces.append(_span(DEL_STYLE, a[i1:i2]))
            if tag in ("insert", "replace"):
                pieces.append(_span(INS_STYLE, b[j1:j2]))
        return "".join(pieces)

    def diff_changes(a: str, b: str, auto_display: bool = True):
        """
        Build an inline HTML diff that contains *only the changed segments*.

        The texts are first compared line by line; lines inside a "replace"
        region are paired up in order and compared character by character.
        The result is an IPython HTML object, which is also displayed when
        `auto_display` is true.
        """
        a_lines = a.splitlines(keepends=True)
        b_lines = b.splitlines(keepends=True)

        outer = difflib.SequenceMatcher(None, a_lines, b_lines, autojunk=True)
        html_chunks = []

        for tag, i1, i2, j1, j2 in outer.get_opcodes():
            old_block = a_lines[i1:i2]
            new_block = b_lines[j1:j2]
            if tag == "replace":
                # Lines present on both sides are diffed pairwise ...
                html_chunks.extend(
                    _char_changes(la, lb) for la, lb in zip(old_block, new_block)
                )
                # ... and whatever the longer side has left over is shown as
                # a whole-line deletion or insertion.
                paired = min(len(old_block), len(new_block))
                html_chunks.extend(_span(DEL_STYLE, la) for la in old_block[paired:])
                html_chunks.extend(_span(INS_STYLE, lb) for lb in new_block[paired:])
            elif tag == "delete":
                html_chunks.extend(_span(DEL_STYLE, la) for la in old_block)
            elif tag == "insert":
                html_chunks.extend(_span(INS_STYLE, lb) for lb in new_block)
            # "equal" regions are skipped entirely (only changes are wanted).

        rendered = HTML(f'<div style="{WRAP_STYLE}">{"".join(html_chunks)}</div>')
        if auto_display:
            display(rendered)
        return rendered

    return (diff_changes,)
429
 
430
 
431
@app.cell
def toggle_diff(mo):
    # Switch (off initially) that gates the diff computed by the
    # comparison cell.
    run_diff = mo.ui.switch(
        value=False,
        label="文章間の比較(差分)を表示",
    )
    run_diff
    return (run_diff,)
436
 
437
 
438
@app.cell
def compare_preprocessed_vs_old(
    aozora_xhtml_processed_text,
    cleaned_text,
    diff_changes,
    run_diff,
):
    """
    Show the diff between the regex-cleaned text and the text extracted from
    the (X)HTML version, but only while the switch is on.
    """

    # The diff is expensive, so it is computed only when requested; with the
    # switch off the cell's output is None.
    diff_result = (
        diff_changes(cleaned_text, aozora_xhtml_processed_text, auto_display=False)
        if run_diff.value
        else None
    )

    diff_result
    return
461
 
462
 
463
@app.cell
def _(mo):
    # Section heading for the spaCy (GiNZA) part of the notebook, listing
    # the two analyses applied to the regex-cleaned text.
    _section_text = r"""
    ## spaCy (GiNZA) による解析

    以下からは、正規表現で前処理したテキストに対して、

    - 形態素解析
    - 係り受け解析

    を行う。

    > 作品によっては時間がかかる。
    """
    mo.md(_section_text)
    return
480
 
481
 
482
@app.cell
def process_aozora_text(Doc, cleaned_text, mo, nlp, re):
    """
    Run the spaCy pipeline over the cleaned text, one Doc per paragraph.

    A paragraph of at most MAX_BYTES UTF-8 bytes is parsed directly. A longer
    one is cut at sentence-final punctuation, the sentences are packed into
    chunks of at most MAX_BYTES, any chunk that is still too large is cut at
    a raw byte boundary, and the parsed chunks are merged back into a single
    Doc with Doc.from_docs.
    """

    def split_text_to_paragraphs(text: str) -> list[str]:
        """Split on runs of newlines (and any whitespace that follows them)."""
        return re.split(r"\n+\s*", text)

    # NOTE(review): presumably chosen to stay under the tokenizer's
    # input-size limit - confirm.
    MAX_BYTES = 40000
    paras = split_text_to_paragraphs(cleaned_text)
    aozora_docs: list[Doc] = []

    with mo.status.progress_bar(total=len(paras), title="spaCy processing") as bar:
        for para in paras:
            b = len(para.encode("utf-8"))
            if b <= MAX_BYTES:
                doc = nlp(para)
            else:
                # 1) Sentence-level split. The capture group keeps each
                #    delimiter so it can be glued back onto its sentence.
                #    Both full-width and ASCII forms of ! and ? are accepted.
                parts = re.split(r"([。!?!?])", para)
                sents = [
                    parts[i] + (parts[i + 1] if i + 1 < len(parts) else "")
                    for i in range(0, len(parts), 2)
                ]
                # 2) Greedily pack sentences into chunks of <= MAX_BYTES.
                chunks: list[str] = []
                cur, cur_b = "", 0
                for s in sents:
                    sb = len(s.encode("utf-8"))
                    if cur_b + sb > MAX_BYTES:
                        if cur:
                            chunks.append(cur)
                        cur, cur_b = s, sb
                    else:
                        cur += s
                        cur_b += sb
                if cur:
                    chunks.append(cur)
                # 3) Raw-byte fallback for a single sentence that is itself
                #    larger than MAX_BYTES.
                final_chunks: list[str] = []
                for c in chunks:
                    if len(c.encode("utf-8")) <= MAX_BYTES:
                        final_chunks.append(c)
                    else:
                        rem = c
                        while rem:
                            pb = rem.encode("utf-8")[:MAX_BYTES]
                            # "ignore" drops a multi-byte character that the
                            # byte slice cut in half; it then starts the next
                            # piece instead.
                            part = pb.decode("utf-8", "ignore")
                            final_chunks.append(part)
                            rem = rem[len(part) :]
                # 4) Merge the parsed chunks into one Doc for this paragraph.
                #    ensure_whitespace=False stops spaCy from inserting a
                #    space between chunks, which would alter Japanese text.
                subdocs = list(nlp.pipe(final_chunks, batch_size=20))
                doc = Doc.from_docs(subdocs, ensure_whitespace=False)
            aozora_docs.append(doc)
            bar.update()
    return (aozora_docs,)
542
 
 
 
543
 
544
@app.cell
def display_noun_chunks(aozora_docs: "list[Doc]", mo, pl):
    """
    Show the noun chunks of the whole text that consist of at least two
    tokens, ordered by frequency, together with each chunk's token count.
    """
    # Walk the noun chunks of every Doc once and reuse the spans for both
    # columns instead of iterating the whole corpus twice.
    _chunks = [c for doc in aozora_docs for c in doc.noun_chunks]

    # Build, filter (>= 2 tokens), group and sort in one chain. The explicit
    # schema keeps the column types defined when no chunk was found.
    top_chunks = (
        pl.DataFrame(
            {
                "chunk_text": [c.text for c in _chunks],
                "token_count": [len(c) for c in _chunks],
            },
            schema={"chunk_text": pl.String, "token_count": pl.Int64},
        )
        .filter(pl.col("token_count") >= 2)
        .group_by("chunk_text")
        .agg([pl.len().alias("frequency"), pl.first("token_count")])
        .sort("frequency", descending=True)
    )

    mo.md(f"""
    spaCyには様々な機能が内蔵されていて、例えば、`noun_chunks`では[名詞句](https://spacy.io/usage/linguistic-features#noun-chunks)を構文(係り受け)解析結果に基づいて抽出できる。ここでいう名詞句、すなわち「NPチャンク」とは、他の名詞句がその中に入れ子にならない名詞句のことで、名詞句レベルの並列や前置詞句、関係節は含まない。

    ### 2語以上からなる名詞句トップ25
    {mo.ui.dataframe(top_chunks, page_size=25)}

    > カスタマイズも[可能](https://github.com/explosion/spaCy/blob/41e07772dc5805594bab2997a090a9033e26bf56/spacy/lang/ja/syntax_iterators.py#L12)
    """)
    return
572
 
573
 
574
@app.cell(hide_code=True)
def _(mo):
    # Section heading and explanatory note for the token-based matching
    # demo that follows.
    _section_text = """
    ## Token Pattern Matching

    トークンベースのルールを使用して、短単位で分割された動詞の塊をまとめ上げて観察する。

    > ここで使用されるルールはあくまでも例で、完璧に動詞の塊をまとめ上げていない。また、短単位より長い単位でテキスト分析する場合は長単位による解析も[可能](https://github.com/komiya-lab/monaka)。
    """
    mo.md(_section_text)
    return
 
 
 
 
 
 
 
 
 
 
 
 
586
 
 
 
587
 
588
@app.cell
def token_pattern():
    # ---- Replace the pattern below with one copied from the site ----

    # Token pattern: one or more NOUN tokens, then one or more VERB tokens,
    # then one or more tokens whose POS matches the regex VERB|AUX.
    pattern = [
        {"POS": "NOUN", "OP": "+"},
        {"POS": "VERB", "OP": "+"},
        {"POS": {"REGEX": "VERB|AUX"}, "OP": "+"},
    ]

    # ------------------------------------------------------------------
    return (pattern,)
 
 
 
 
 
 
 
 
600
 
 
601
 
602
@app.cell
def token_pattern_match(aozora_docs: "list[Doc]", mo, nlp, pattern, pl, spacy):
    # Based on https://spacy.io/usage/rule-based-matching#example1
    from spacy.matcher import Matcher

    matcher = Matcher(nlp.vocab)
    matched_sentences = []  # displaCy "manual" entries, one per match
    match_texts: list[str] = []  # text of every matched span

    def collect_sents(matcher, doc, i, matches):
        """on_match callback: record the i-th match and its sentence."""
        _, start, end = matches[i]
        span = doc[start:end]
        sent = span.sent
        # displaCy expects offsets relative to the rendered text, which is
        # the sentence here, so shift by the sentence's start.
        offset = sent.start_char
        matched_sentences.append(
            {
                "text": sent.text,
                "ents": [
                    {
                        "start": span.start_char - offset,
                        "end": span.end_char - offset,
                        "label": "ヒット",
                    }
                ],
            }
        )
        match_texts.append(span.text)

    matcher.add("MyPattern", [pattern], on_match=collect_sents)
    # Run the matcher over each paragraph; results arrive via the callback.
    for p_doc2 in aozora_docs:
        matcher(p_doc2)

    # Only the first 10 matches are rendered.
    MAX_PATTERN_MATCHES = 10
    viz_html = spacy.displacy.render(
        matched_sentences[:MAX_PATTERN_MATCHES],
        style="ent",
        manual=True,
    )

    # Frequency table of the matched span texts, 25 most frequent first.
    df = pl.DataFrame({"match_text": match_texts})
    _counts = df.group_by("match_text").agg(pl.len().alias("frequency"))
    top_matches = _counts.sort("frequency", descending=True).head(25)

    # Show the displaCy-rendered HTML and the frequency table together.
    mo.vstack([mo.Html(viz_html), top_matches])
    return
649
 
650
 
651
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: section heading and introduction for the
    # dependency pattern matching demo. It defines no variables.
    # Text fix: 解析制度 -> 解析精度 (parsing accuracy), plus a repaired
    # clause join in the note.
    mo.md(
        """
    ## Dependency Pattern Matching

    係り受けパターンのルールを記述し、動詞と名詞が[nsubj](https://universaldependencies.org/ja/dep/nsubj.html) (nominal subject) という係り受け関係にあるもの、すなわち動詞とその主語を抽出する。

    > 係り受け解析は形態素解析のタスクより複雑で、その解析精度はより低い。ここでは`ja_ginza`という軽量なモデルを使用しているが、解析精度を求めるのであれば、Transformerベースモデルを使用するとよい。
    """
    )
    return
663
 
664
 
665
@app.cell
def dependency_pattern():
    # Defines the dependency pattern that the `show_dependency_matches` cell
    # receives as its `dep_pattern` argument and registers with a spaCy
    # DependencyMatcher.
    ###### Replace the pattern below with one copied from the site

    # this is your dependency‐matcher pattern
    # NOTE: `show_dependency_matches` looks the matched tokens up by the
    # RIGHT_ID names "anchor_verb" and "verb_subject", so keep those two
    # names when swapping in another pattern.

    dep_pattern = [
        # anchor node: a token whose POS is VERB
        {"RIGHT_ID": "anchor_verb", "RIGHT_ATTRS": {"POS": "VERB"}},
        # second node: related to the anchor through REL_OP ">" and carrying
        # the dependency label nsubj (the verb's nominal subject)
        {
            "LEFT_ID": "anchor_verb",
            "REL_OP": ">",
            "RIGHT_ID": "verb_subject",
            "RIGHT_ATTRS": {"DEP": "nsubj"},
        },
    ]

    #####################################################
    return (dep_pattern,)
683
 
684
+
685
@app.cell
def show_dependency_matches(
    aozora_docs: "list[Doc]",
    dep_pattern,
    mo,
    nlp,
    pl,
    spacy,
):
    # Runs the dependency pattern over every document, then shows
    #   (1) the first 10 matched sentences with verb and subject highlighted,
    #   (2) a frequency table of (subject, verb) pairs, most frequent first.
    #
    # The pattern is expected to name its nodes "anchor_verb" and
    # "verb_subject"; both are looked up by RIGHT_ID below.
    from spacy.matcher import DependencyMatcher

    dep_matcher = DependencyMatcher(nlp.vocab)
    viz_dep_sents: list[dict] = []  # displaCy "manual" entries, one per match
    dep_pairs: list[dict[str, str]] = []  # one {"subject", "verb"} row per match

    def collect_deps(matcher, doc, i, matches):
        # on_match callback: record the i-th match of `matches`
        _, token_ids = matches[i]
        sent = doc[token_ids[0]].sent
        sent_start = sent.start_char
        # map each RIGHT_ID to its matched Token (token_ids are paired
        # positionally with the pattern's nodes)
        rid_to_tok = {
            pat["RIGHT_ID"]: doc[tok_id] for pat, tok_id in zip(dep_pattern, token_ids)
        }
        verb = rid_to_tok["anchor_verb"]
        subj = rid_to_tok["verb_subject"]

        # build ents for displaCy; offsets are relative to the sentence text
        ents = [
            {
                "start": tok.idx - sent_start,
                "end": tok.idx + len(tok) - sent_start,
                "label": "subject" if rid == "verb_subject" else "verb",
                "text": tok.text,
            }
            for rid, tok in rid_to_tok.items()
        ]

        viz_dep_sents.append({"text": sent.text, "ents": ents})
        dep_pairs.append({"subject": subj.text, "verb": verb.text})

    dep_matcher.add("MyDepPattern", [dep_pattern], on_match=collect_deps)
    # the callback does the collecting; the matcher's return value is unused
    for dep_doc in aozora_docs:
        dep_matcher(dep_doc)

    dep_viz_html = spacy.displacy.render(viz_dep_sents[:10], style="ent", manual=True)

    # Explicit schema: with zero matches `pl.DataFrame([])` would have no
    # columns at all and the group_by below would fail on the missing
    # "subject"/"verb" columns. With the schema, an empty result simply
    # renders as an empty table.
    dep_df = pl.DataFrame(
        dep_pairs,
        schema={"subject": pl.String, "verb": pl.String},
    )
    top_dep_matches = (
        dep_df.group_by(["subject", "verb"])
        .agg(pl.len().alias("frequency"))
        .sort("frequency", descending=True)
    )

    # cell output: highlighted sentences stacked above the table
    mo.vstack(
        [
            mo.Html(dep_viz_html),
            top_dep_matches,
        ]
    )
    return
746
 
747
 
748
@app.cell
def _():
    # Empty placeholder cell: no code, no output.
    return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
 
752
 
753
  if __name__ == "__main__":
pyproject.toml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Project metadata and dependencies for the marimo demo notebook (app.py).
[project]
# NOTE(review): the project name is empty here. The pyproject.toml
# specification expects a non-empty name; confirm whether this is the real
# value or an artifact of how this page was extracted.
name = ""
version = "0.1.0"
description = "Regex and Python based preprocessing demo"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "aozora-corpus-generator==0.1.1",
    "cdifflib==1.2.9",
    "ginza",
    "ja-ginza",
    "ipython==7.23.1",
    "marimo",
    "polars==1.30.0",
    "spacy==3.8.7",
    "wcwidth",
]

# aozora-corpus-generator is taken from its Git repository rather than an index.
[tool.uv.sources]
aozora-corpus-generator = { git = "https://github.com/borh/aozora-corpus-generator.git" }

# Listed under uv's `no-binary-package`; presumably so that html5-parser and
# lxml are built from source against the system libxml2 — confirm.
[tool.uv]
no-binary-package = ["html5-parser", "lxml"]
requirements.txt DELETED
@@ -1,5 +0,0 @@
1
- marimo
2
- # Or a specific version
3
- # marimo>=0.9.0
4
-
5
- # Add other dependencies as needed
 
 
 
 
 
 
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
wagahaiwa_nekodearu.txt ADDED
The diff for this file is too large to render. See raw diff