Bor Hodošček
committed on

feat: initial commit of working demo

Files changed:
- Dockerfile +12 -9
- Natsume_S_Bocchan.txt +0 -0
- Natsume_S_Kokoro.txt +0 -0
- README.md +1 -1
- Unno_J_Chikyuuyousa.txt +0 -0
- Unno_J_Kaseiheidan.txt +0 -0
- app.py +563 -376
- development.md +2 -2
- pyproject.toml +17 -0
- requirements.txt +0 -5
- uv.lock +0 -0
Dockerfile
CHANGED
@@ -1,5 +1,5 @@
-FROM python:3.12
-COPY --from=ghcr.io/astral-sh/uv:0.
+FROM python:3.12-slim
+COPY --from=ghcr.io/astral-sh/uv:0.7.13 /uv /bin/uv
 
 RUN useradd -m -u 1000 user
 ENV PATH="/home/user/.local/bin:$PATH"
@@ -7,13 +7,16 @@ ENV UV_SYSTEM_PYTHON=1
 
 WORKDIR /app
 
-RUN
-COPY --chown=user . /app
-RUN mkdir -p /app/__marimo__ && \
-    chown -R user:user /app && \
-    chmod -R 755 /app
+RUN apt update && apt install -y curl unzip gcc g++
+RUN mkdir -p /app && chown -R user:user /app
+
+COPY --chown=user ./pyproject.toml ./uv.lock ./app.py ./*.txt /app
+
+RUN chmod -R u+w /app
 USER user
 
+RUN curl -O https://clrd.ninjal.ac.jp/unidic_archive/2308/unidic-novel-v202308.zip && unzip -x unidic-novel-v202308.zip
+RUN uv sync
+
+CMD ["uv", "run", "marimo", "run", "app.py", "--no-sandbox", "--include-code", "--host", "0.0.0.0", "--port", "7860"]
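The new image installs build tools, downloads the 近現代口語小説UniDic (unidic-novel) archive from NINJAL, and installs the locked dependencies with `uv sync`. A minimal smoke test of that dictionary setup (not part of this commit; it assumes the image has been built and `/app` is the working directory) could look like the sketch below. The Tagger flags are the same ones app.py uses.

```python
# Hypothetical check that the unzipped ./unidic-novel dictionary is usable by fugashi.
import fugashi

# Same flags that app.py passes to fugashi.Tagger
tagger = fugashi.Tagger("-Owakati -d ./unidic-novel -r ./unidic-novel/dicrc")
print(tagger.parse("吾輩は猫である。").strip())  # expect whitespace-separated tokens
```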
Natsume_S_Bocchan.txt
ADDED
The diff for this file is too large to render.
See raw diff
Natsume_S_Kokoro.txt
ADDED
The diff for this file is too large to render.
See raw diff
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: scattertext-ja-novels
 emoji: 🍃
 colorFrom: indigo
 colorTo: purple
Unno_J_Chikyuuyousa.txt
ADDED
The diff for this file is too large to render.
See raw diff
Unno_J_Kaseiheidan.txt
ADDED
The diff for this file is too large to render.
See raw diff
app.py
CHANGED
@@ -1,469 +1,656 @@
(Removed lines 1-469: the previous app.py was marimo's stock intro/tutorial notebook, a series of `mo.md()` and `mo.accordion()` cells explaining reactive execution and the lazy runtime, unique global names and underscore-prefixed "private" variables, UI elements, the plain-Python file format, running notebooks as apps, the `marimo` command-line tool (`marimo edit`, `marimo convert`, `marimo tutorial dataflow`, and the other built-in tutorials), and editor tips on console output, creating/moving/deleting cells, disabling automatic execution, disabling cells, code folding, code formatting, the command palette, keyboard shortcuts, and configuration.)

The new app.py (656 lines) follows in full:
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "altair==5.5.0",
#     "fugashi-plus",
#     "marimo",
#     "numpy==2.2.6",
#     "pandas==2.3.0",
#     "pyarrow",
#     "scattertext==0.2.2",
#     "scikit-learn==1.7.0",
#     "scipy==1.13.1",
# ]
# ///

import marimo

__generated_with = "0.13.15"
app = marimo.App(width="full", app_title="Scattertext on Japanese novels")

with app.setup:
    import marimo as mo
    import itertools
    import fugashi
    import pandas as pd
    import scipy
    import numpy as np
    import random
    import scattertext as st
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    RANDOM_SEED = 42
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)


@app.cell
def function_export():
    @mo.cache
    def parse_texts(texts: list[str]) -> list[str]:
        """Tokenize a list of raw strings via fugashi (MeCab)."""

        tagger = fugashi.Tagger("-Owakati -d ./unidic-novel -r ./unidic-novel/dicrc")
        return [tagger.parse(txt).strip() for txt in texts]

    @mo.cache
    def build_corpus_cached(
        texts: list[str],
        categories: list[str],
    ) -> st.Corpus:
        """Build or reuse cached Scattertext corpus."""

        df = pd.DataFrame({"text": texts, "category": categories})
        return (
            st.CorpusFromPandas(
                df,
                category_col="category",
                text_col="text",
                nlp=st.whitespace_nlp_with_sentences,
            )
            .build()
            .get_unigram_corpus()
            .compact(st.AssociationCompactor(2000))
        )

    @mo.cache
    def chunk_texts(
        texts: list[str],
        categories: list[str],
        filenames: list[str],
        chunk_size: int = 2000,
    ) -> tuple[list[str], list[str], list[str]]:
        """Chunk each text into segments of chunk_size tokens, preserving category and filename."""
        chunked_texts = []
        chunked_cats = []
        chunked_fnames = []
        for text, cat, fname in zip(texts, categories, filenames):
            tokens = text.split()
            for i in range(0, len(tokens), chunk_size):
                chunk = " ".join(tokens[i : i + chunk_size])
                chunked_texts.append(chunk)
                chunked_cats.append(cat)
                chunked_fnames.append(f"{fname}#{i // chunk_size + 1}")
        return chunked_texts, chunked_cats, chunked_fnames

    @mo.cache
    def train_scikit_cached(
        texts: list[str], categories: list[str], filenames: list[str]
    ) -> tuple[
        st.Corpus,
        scipy.sparse.spmatrix,
        TfidfVectorizer,
        list[str],
        list[str],
    ]:
        """Fit TF-IDF + CountVectorizer & build a st.Corpus on chunked data."""

        chunk_texts_out, chunk_cats, chunk_fnames = chunk_texts(
            texts, categories, filenames
        )
        tfv = TfidfVectorizer()
        X_tfidf = tfv.fit_transform(chunk_texts_out)
        cv = CountVectorizer(vocabulary=tfv.vocabulary_, max_features=100)
        y_codes = pd.Categorical(
            chunk_cats, categories=pd.Categorical(chunk_cats).categories
        ).codes

        scikit_corpus = st.CorpusFromScikit(
            X=cv.fit_transform(chunk_texts_out),
            y=y_codes,
            feature_vocabulary=tfv.vocabulary_,
            category_names=list(pd.Categorical(chunk_cats).categories),
            raw_texts=chunk_texts_out,
        ).build()

        return (
            scikit_corpus,
            X_tfidf,
            tfv,
            chunk_cats,
            chunk_fnames,
        )

    return build_corpus_cached, chunk_texts, parse_texts, train_scikit_cached


@app.cell
def intro():
    mo.md(
        r"""
        # Scattertext on Japanese novels / 近代文学作品のScattertext可視化

        ## 概要

        2つの異なるカテゴリのテキストファイル群をアップロードし、その差異をScattertextで可視化します。
        オプショナルで機械学習モデルで分類をし、モデルの分類精度とモデルが識別に用いるトークンも確認できます。

        ## ワークフロー

        1. テキストファイルをアップロード(デフォルトを使う場合はそのままSubmitしてください)
        2. データ内容を確認・修正
        3. チャンク&サンプリング設定
        4. Scattertextによる可視化
        5. (任意)分類モデルによる性能検証

        > 単語分割には、[近現代口語小説UniDic](https://clrd.ninjal.ac.jp/unidic/download_all.html#unidic_novel)を使用しています。異なる時代やジャンルのテキストには不向きです。
        """
    )
    return


@app.cell
def data_settings():
    # 1) Create each widget
    category_name = mo.ui.text(
        label="カテゴリ名(例:著者名・時代区分など)",
        placeholder="例:時代・性別・著者など",
        value="著者",
        full_width=True,
    )
    label_a = mo.ui.text(
        label="Aのラベル", placeholder="例:夏目漱石", value="夏目漱石", full_width=True
    )
    files_a = mo.ui.file(
        label="Aのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
    )
    label_b = mo.ui.text(
        label="Bのラベル", placeholder="例:海野十三", value="海野十三", full_width=True
    )
    files_b = mo.ui.file(
        label="Bのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
    )

    tpl = r"""
    ## データと分析の設定

    ※ 初期では夏目漱石と海野十三から各2作品をサンプルコーパスにしています。設定を変更せずSubmitすると、サンプルコーパスでの分析になります。ファイルをアップロードする場合は忘れずにカテゴリとラベルも変更してください。

    ※ ファイルはプレインテキスト形式必須(.txt, UTF-8エンコーディング)

    {category_name}

    ### グループA
    {label_a}
    {files_a}

    ### グループB
    {label_b}
    {files_b}
    """

    data_form = (
        mo.md(tpl)
        .batch(
            # info_box=info_box,
            category_name=category_name,
            label_a=label_a,
            files_a=files_a,
            label_b=label_b,
            files_b=files_b,
        )
        .form(show_clear_button=True, bordered=True)
    )
    data_form
    return data_form, label_a, label_b


@app.cell
def data_check(data_form, parse_texts):
    mo.stop(data_form.value is None)

    from pathlib import Path

    validation_messages: list[str] = []

    if data_form.value["label_a"] == data_form.value["label_b"]:
        print("a")
        validation_messages.append(
            "⚠️ **警告**: グループAとBのラベルが同じです。AとBは異なるラベルを設定してください。\n"
        )

    if not data_form.value["files_a"] and not data_form.value["files_b"]:
        validation_messages.append(
            "ℹ️ ファイルが未指定のため、デフォルトサンプルを使用しています。\n"
        )

    try:
        # Group A: either uploaded files or default (坊っちゃん + こころ)
        if data_form.value["files_a"]:
            category_a_texts = (
                f.contents.decode("utf-8") for f in data_form.value["files_a"]
            )
            category_a_names = (f.name for f in data_form.value["files_a"])
        else:
            natsume_1 = Path("Natsume_S_Bocchan.txt").read_text(encoding="utf-8")
            natsume_2 = Path("Natsume_S_Kokoro.txt").read_text(encoding="utf-8")
            category_a_texts = [natsume_1, natsume_2]
            category_a_names = ["Natsume_S_Bocchan.txt", "Natsume_S_Kokoro.txt"]

        # Group B: either uploaded files or default (地球要塞 + 火星兵団)
        if data_form.value["files_b"]:
            category_b_texts = (
                f.contents.decode("utf-8") for f in data_form.value["files_b"]
            )
            category_b_names = (f.name for f in data_form.value["files_b"])
        else:
            unno_1 = Path("Unno_J_Chikyuuyousa.txt").read_text(encoding="utf-8")
            unno_2 = Path("Unno_J_Kaseiheidan.txt").read_text(encoding="utf-8")

            category_b_texts = [unno_1, unno_2]
            category_b_names = ["Unno_J_Chikyuuyousa.txt", "Unno_J_Kaseiheidan.txt"]

        data = pd.DataFrame(
            {
                "category": (
                    [data_form.value["label_a"]]
                    * (
                        len(data_form.value["files_a"])
                        if data_form.value["files_a"]
                        else 2
                    )
                )
                + (
                    [data_form.value["label_b"]]
                    * (
                        len(data_form.value["files_b"])
                        if data_form.value["files_b"]
                        else 2
                    )
                ),
                "filename": itertools.chain(category_a_names, category_b_names),
                "text": itertools.chain(category_a_texts, category_b_texts),
            }
        )

        with mo.status.spinner("コーパスを形態素解析中..."):
            data["text"] = parse_texts(list(data["text"]))

    except Exception as e:
        data = None
        validation_messages.append(
            f"❌ **エラー**: ファイルの読み込みに失敗しました: {str(e)}\n"
        )

    # We need the maximum number of tokens for the slider
    max_tokens = data["text"].map(lambda s: len(s.split())).max()

    mo.md(f"""
    ## データ確認

    {"**警告**:\n" if validation_messages else ""}
    {"\n".join(map(lambda x: f"- {x}", validation_messages))}

    解析済テキスト一覧:
    {mo.ui.table(data, selection="multi", format_mapping={"text": lambda s: s[:20] + "..."})}
    """)
    return (data,)


@app.cell
def sampling_controls_setup():
    chunk_size = mo.ui.slider(
        start=500,
        stop=50_000,
        value=2000,
        step=500,
        label="1チャンクあたり最大トークン数",
        full_width=True,
    )
    sample_frac = mo.ui.slider(
        start=0.1,
        stop=1.0,
        value=0.2,
        step=0.05,
        label="使用割合(1.0で全データ)",
        full_width=True,
    )
    sampling_form = (
        mo.md("{chunk_size}\n{sample_frac}")
        .batch(chunk_size=chunk_size, sample_frac=sample_frac)
        .form(show_clear_button=True, bordered=False)
    )
    sampling_form
    return chunk_size, sample_frac, sampling_form


@app.cell
def _(build_corpus_cached, chunk_texts, data, sample_frac, sampling_form):
    mo.stop(sampling_form.value is None)

    with mo.status.spinner("コーパスをサンプリング中…"):
        texts, cats, fnames = chunk_texts(
            list(data.text),
            list(data.category),
            list(data.filename),
            sampling_form.value["chunk_size"],
        )

        if sample_frac.value < 1.0:
            N = len(texts)
            k = int(N * sampling_form.value["sample_frac"])
            idx = random.sample(range(N), k)
            texts = [texts[i] for i in idx]
            cats = [cats[i] for i in idx]
            fnames = [fnames[i] for i in idx]

        corpus = build_corpus_cached(
            texts,
            cats,
        )
    return cats, corpus, fnames, texts


@app.cell
def sampling_controls(chunk_size):
    mo.md("トークン数を増やすと処理時間が長くなります").callout(
        kind="info"
    ) if chunk_size.value > 30_000 else None
    return


@app.cell
def plot_main_scatterplot(corpus, data_form, fnames):
    cat_name = data_form.value["category_name"]
    with mo.status.spinner("Scatterplot作成中…"):
        html = st.produce_scattertext_explorer(
            corpus,
            category=data_form.value["label_a"],
            category_name=f"{cat_name}: {data_form.value['label_a']}",
            not_category_name=f"{cat_name}: {data_form.value['label_b']}",
            width_in_pixels=1000,
            metadata=fnames,
        )

    mo.vstack(
        [
            mo.md(f"""
    # Scattertextの結果
    ### Scattertext可視化の見方
    - (縦)上に行くほど{data_form.value["label_a"]}で相対的に多く使われるトークン
    - (横)右に行くほど{data_form.value["label_b"]}で相対的に多く使われるトークン

    HTMLをダウンロードしてブラウザで開くと見やすい
    """),
            mo.iframe(html),
        ]
    )
    return (html,)


@app.cell
def _(html):
    download_button = mo.download(
        data=html.encode(),
        filename="scattertext_analysis.html",
        label="可視化結果をダウンロード",
    )

    mo.md(f"{download_button}")
    return


@app.cell
def classification_toggle():
    run_model = mo.ui.switch(label="分類モデルを適用する")
    run_model
    return (run_model,)


@app.cell
def _(run_model):
    mo.stop(not run_model.value)

    mo.md(
        r"""
        # 分類モデルによる検証

        2つのカテゴリを分類するモデルを学習し、それぞれのカテゴリを分ける有効な素性(単語)がどれなのかもScattertextで観察できます。
        ここはRandom Forestという機械学習モデルを使用しています。
        """
    )
    return


@app.cell
def _(cats, fnames, run_model, texts, train_scikit_cached):
    mo.stop(not run_model.value)

    scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
        texts, cats, fnames
    )
    return chunk_cats, chunk_fnames, scikit_corpus, tfidf_X, vectorizer


@app.cell
def model_selection(run_model):
    mo.stop(not run_model.value)

    model_dropdown = mo.ui.dropdown(
        options=[
            "LogisticRegression",
            "RandomForestClassifier",
            "GradientBoostingClassifier",
        ],
        value="LogisticRegression",
        label="モデル選択",
    )
    model_dropdown
    return (model_dropdown,)


@app.cell
def hyperparameters(model_dropdown):
    lr_C = mo.ui.slider(0.01, 10.0, value=1.0, step=0.01, label="LR C")
    lr_max_iter = mo.ui.slider(100, 2000, value=1000, step=100, label="LR max_iter")
    rf_n = mo.ui.slider(10, 500, value=100, step=10, label="RF n_estimators")
    rf_max_depth = mo.ui.slider(1, 50, value=10, step=1, label="RF max_depth")
    gb_n = mo.ui.slider(10, 500, value=100, step=10, label="GB n_estimators")
    gb_lr = mo.ui.slider(0.01, 1.0, value=0.1, step=0.01, label="GB learning_rate")
    gb_md = mo.ui.slider(1, 10, value=3, step=1, label="GB max_depth")

    widgets = []
    if model_dropdown.value == "LogisticRegression":
        widgets = {"lr_C": lr_C, "lr_max_iter": lr_max_iter}
    elif model_dropdown.value == "RandomForestClassifier":
        widgets = {"rf_n": rf_n, "rf_max_depth": rf_max_depth}
    else:  # GradientBoostingClassifier
        widgets = {"gb_n": gb_n, "gb_lr": gb_lr, "gb_md": gb_md}

    test_size = mo.ui.slider(0.1, 0.5, value=0.3, step=0.05, label="テストデータ比率")

    model_form = (
        mo.md("### モデルのパラメータ設定\n{widgets}\n{test_size}")
        .batch(
            widgets=mo.ui.dictionary(widgets),
            test_size=test_size,
        )
        .form(show_clear_button=True, bordered=False)
    )

    model_form
    return (model_form,)


@app.cell
def _(
    chunk_cats,
    label_a,
    label_b,
    model_dropdown,
    model_form,
    roc_auc,
    roc_df,
    run_model,
    tfidf_X,
    vectorizer,
):
    mo.stop(not run_model.value or not model_form.value)

    import altair as alt
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import (
        auc,
        classification_report,
        confusion_matrix,
        roc_curve,
    )
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_X,
        chunk_cats,
        test_size=model_form.value["test_size"],
        random_state=RANDOM_SEED,
    )

    name = model_dropdown.value
    if name == "LogisticRegression":
        clf = LogisticRegression(
            C=model_form.value["widgets"]["lr_C"],
            max_iter=int(model_form.value["widgets"]["lr_max_iter"]),
        )
    elif name == "RandomForestClassifier":
        clf = RandomForestClassifier(
            n_estimators=int(model_form.value["widgets"]["rf_n"]),
            max_depth=int(model_form.value["widgets"]["rf_max_depth"]),
            random_state=RANDOM_SEED,
        )
    else:  # GradientBoostingClassifier
        clf = GradientBoostingClassifier(
            n_estimators=int(model_form.value["widgets"]["gb_n"]),
            learning_rate=float(model_form.value["widgets"]["gb_lr"]),
            max_depth=int(model_form.value["widgets"]["gb_md"]),
            random_state=RANDOM_SEED,
        )

    clf.fit(X_train, y_train)
    if hasattr(clf, "feature_importances_"):
        term_scores = clf.feature_importances_
    else:
        term_scores = abs(clf.coef_[0])

    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)

    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
    cm_df = (
        pd.DataFrame(cm, index=clf.classes_, columns=clf.classes_)
        .reset_index()
        .melt(
            id_vars="index",
            var_name="Predicted",
            value_name="count",
        )
        .rename(columns={"index": "Actual"})
    )

    # pos_idx = list(clf.classes_).index(label_a.value)
    # _proba, roc_auc = None, None
    # roc_df = None
    # if hasattr(clf, "predict_proba"):
    #     probs = clf.predict_proba(X_test)[:, pos_idx]
    #     y_test_arr = np.array(y_test)
    #     fpr, tpr, _ = roc_curve((y_test_arr == label_a.value).astype(int), probs)
    #     roc_auc = auc(fpr, tpr)
    #     roc_df = pd.DataFrame({"fpr": fpr, "tpr": tpr})

    feature_names = vectorizer.get_feature_names_out()
    importances = (
        pd.DataFrame({"単語": feature_names, "重要度": term_scores})
        .sort_values("重要度", ascending=False)
        .head(20)
    )

    imp_chart = (
        alt.Chart(importances)
        .mark_bar()
        .encode(
            x=alt.X("重要度:Q", title="重要度"),
            y=alt.Y("単語:N", sort="-x"),
        )
        .properties(title="Top‐20 重要特徴語", width=600, height=400)
    )
    cm_chart = (
        alt.Chart(cm_df)
        .mark_rect()
        .encode(
            x="Predicted:N",
            y="Actual:N",
            color=alt.Color("count:Q", title="Count"),
            tooltip=["Actual", "Predicted", "count"],
        )
        .properties(title="Confusion Matrix", width=250, height=250)
    )
    # roc_chart = (
    #     alt.Chart(roc_df)
    #     .mark_line(point=True)
    #     .encode(
    #         x=alt.X("fpr:Q", title="False Positive Rate"),
    #         y=alt.Y("tpr:Q", title="True Positive Rate"),
    #     )
    #     .properties(
    #         title=f"ROC Curve (AUC={roc_auc:.2f})",
    #         width=400,
    #         height=300,
    #     )
    # )

    mo.vstack(
        [
            mo.ui.altair_chart(imp_chart),
            mo.ui.altair_chart(cm_chart),
            # mo.ui.altair_chart(roc_chart),  # Turned out to not be too informative as task is too easy?
            mo.md(f"""
    ## テストセット上の分類性能

    - {label_a.value}: 精度 {report[label_a.value]["precision"]:.2%}, 再現率 {report[label_a.value]["recall"]:.2%}
    - {label_b.value}: 精度 {report[label_b.value]["precision"]:.2%}, 再現率 {report[label_b.value]["recall"]:.2%}
    """),
        ]
    )
    return (term_scores,)


@app.cell
def _(
    chunk_fnames,
    data_form,
    model_form,
    run_model,
    scikit_corpus,
    term_scores,
):
    mo.stop(not run_model.value or not model_form.value)

    with mo.status.spinner("分類モデルのScatterplotを作成中…"):
        scikit_html = st.produce_scattertext_explorer(
            corpus=scikit_corpus,
            category=data_form.value["label_a"],
            category_name=data_form.value["label_a"],
            not_category_name=data_form.value["label_b"],
            scores=term_scores,
            terms_to_include=st.AutoTermSelector.get_selected_terms(
                scikit_corpus, term_scores, 4000
            ),
            metadata=chunk_fnames,
            transform=lambda freqs, _index, total: freqs / total.sum(),
            rescale_x=lambda arr: arr,  # identity
            rescale_y=lambda arr: arr,  # identity
        )
    mo.iframe(scikit_html)
    return


if __name__ == "__main__":
    app.run()
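Outside marimo, the core pipeline these cells implement (tokenize with the novel UniDic via fugashi, whitespace-join the tokens, build a unigram Scattertext corpus, emit the explorer HTML) can be sketched as a plain script. This is a condensed illustration, not part of the commit; it assumes the four bundled .txt files and the unpacked ./unidic-novel directory are present, as they are in the Docker image, and it skips the chunking, sampling, caching, and classifier steps.

```python
# Standalone sketch mirroring parse_texts / build_corpus_cached / plot_main_scatterplot
# from app.py, without the marimo UI. Assumes the default sample corpus and dictionary.
from pathlib import Path

import fugashi
import pandas as pd
import scattertext as st

tagger = fugashi.Tagger("-Owakati -d ./unidic-novel -r ./unidic-novel/dicrc")

files = {
    "夏目漱石": ["Natsume_S_Bocchan.txt", "Natsume_S_Kokoro.txt"],
    "海野十三": ["Unno_J_Chikyuuyousa.txt", "Unno_J_Kaseiheidan.txt"],
}
rows = [
    {
        "category": cat,
        "filename": fname,
        # Tokenize once so Scattertext can treat whitespace as the token boundary.
        "text": tagger.parse(Path(fname).read_text(encoding="utf-8")).strip(),
    }
    for cat, fnames in files.items()
    for fname in fnames
]
df = pd.DataFrame(rows)

corpus = (
    st.CorpusFromPandas(
        df, category_col="category", text_col="text", nlp=st.whitespace_nlp_with_sentences
    )
    .build()
    .get_unigram_corpus()
    .compact(st.AssociationCompactor(2000))
)

html = st.produce_scattertext_explorer(
    corpus,
    category="夏目漱石",
    category_name="著者: 夏目漱石",
    not_category_name="著者: 海野十三",
    width_in_pixels=1000,
    metadata=df["filename"],
)
Path("scattertext_analysis.html").write_text(html, encoding="utf-8")
```

The notebook additionally chunks each tokenized text into segments of up to 2,000 tokens and optionally samples a fraction of the chunks before building the corpus, which keeps the explorer responsive on full-length novels.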
development.md
CHANGED
@@ -3,6 +3,6 @@
 ## Testing your Dockerfile locally
 
 ```bash
-docker build -t
-docker run -it --rm -p 7860:7860
+docker build -t scattertext-ja-novels .
+docker run -it --rm -p 7860:7860 scattertext-ja-novels
 ```
pyproject.toml
ADDED
@@ -0,0 +1,17 @@
+[project]
+name = "scattertext-ja-novels"
+version = "0.1.0"
+description = "Scattertext on Japanese novels"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "altair>=5.5.0",
+    "fugashi-plus>=1.4.0.post1",
+    "marimo>=0.13.15",
+    "numpy>=2.2.6",
+    "pandas>=2.3.0",
+    "pyarrow>=20.0.0",
+    "scattertext==0.2.2",
+    "scikit-learn==1.7.0",
+    "scipy==1.13.1",
+]
requirements.txt
DELETED
@@ -1,5 +0,0 @@
-marimo
-# Or a specific version
-# marimo>=0.9.0
-
-# Add other dependencies as needed
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff