etrotta committed on
Commit
5aaa673
·
1 Parent(s): 15c9aa4

First draft - Querying with SQL

Browse files
Files changed (1) hide show
  1. polars/07-querying-with-sql.py +328 -0
polars/07-querying-with-sql.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.12"
3
+ # dependencies = [
4
+ # "duckdb==1.4.3",
5
+ # "kagglehub==0.3.13",
6
+ # "polars==1.36.1",
7
+ # "pyarrow==22.0.0",
8
+ # "sqlalchemy==2.0.45",
9
+ # "sqlglot==28.3.0",
10
+ # ]
11
+ # ///
12
+
13
+ import marimo
14
+
15
+ __generated_with = "0.18.4"
16
+ app = marimo.App(width="medium")
17
+
18
+
19
+ @app.cell(hide_code=True)
20
+ def _(mo):
21
+ mo.md(r"""
22
+ ## SQL Features in Marimo and Polars
23
+
24
+ _By [etrotta](https://github.com/etrotta)_
25
+
26
+ For this Notebook, we'll be using a [hotel booking analytics](https://www.kaggle.com/datasets/alperenmyung/international-hotel-booking-analytics) dataset.
27
+
28
+ We will see many ways in which you can use SQL inside of marimo and how each feature interacts with polars, including:
29
+ - How to read data from a SQLite file (or any Database connection)
30
+ - What are SQL Cells in Marimo
31
+ - How to load an SQL query into a DataFrame
32
+ - How to query DataFrames using SQL
33
+ """)
34
+ return
35
+
36
+
37
+ @app.cell
38
+ def _(mo, reviews, sqlite_engine):
39
+ _df = mo.sql(
40
+ f"""
41
+ SELECT * FROM reviews LIMIT 100
42
+ """,
43
+ engine=sqlite_engine,
44
+ )
45
+ return
46
+
47
+
48
+ @app.cell(hide_code=True)
49
+ def _(mo):
50
+ mo.md(r"""
51
+ We will start by using `kagglehub` to download a `.sqlite` file, then create an `SQLAlchemy` engine to let marimo know about the database.
52
+ """)
53
+ return
54
+
55
+
56
+ @app.cell
57
+ def _(kagglehub):
58
+ dataset_id = "alperenmyung/international-hotel-booking-analytics"
59
+ cached_file = kagglehub.dataset_download(dataset_id, "booking_db.sqlite")
60
+ return (cached_file,)
61
+
62
+
63
+ @app.cell
64
+ def _(cached_file):
65
+ cached_file
66
+ return
67
+
68
+
69
+ @app.cell(hide_code=True)
70
+ def _(mo):
71
+ mo.md(r"""
72
+ ### Using Marimo's SQL Cells
73
+ """)
74
+ return
75
+
76
+
77
+ @app.cell
78
+ def _(cached_file, sqlalchemy):
79
+ sqlite_engine = sqlalchemy.create_engine("sqlite:///" + cached_file)
80
+ return (sqlite_engine,)
81
+
82
+
83
+ @app.cell(hide_code=True)
84
+ def _(mo):
85
+ mo.md(r"""
86
+ After creating the Engine, you should be able to see it in the **Data Sources** panel in the sidebar. Whenever you create an SQLAlchemy engine as a global variable, Marimo picks up on it and makes it available for use in SQL Cells.
87
+
88
+ You can use it to browse all tables and their columns, or click "Add table to notebook" to generate the code for querying a table, creating our first SQL Cell:
89
+ """)
90
+ return
91
+
92
+
93
+ @app.cell
94
+ def _(hotels, mo, sqlite_engine):
95
+ _df = mo.sql(
96
+ f"""
97
+ SELECT * FROM hotels LIMIT 10
98
+ """,
99
+ engine=sqlite_engine,
100
+ )
101
+ return
102
+
103
+
104
+ @app.cell(hide_code=True)
105
+ def _(mo):
106
+ mo.md(r"""
107
+ The `Output variable:` field can be used to save the result as a polars DataFrame that you can access later.
108
+
109
+ For example, fetching all scores and then performing a group by in polars:
110
+ """)
111
+ return
112
+
113
+
114
+ @app.cell
115
+ def _(mo, reviews, sqlite_engine, users):
116
+ polars_age_groups = mo.sql(
117
+ f"""
118
+ SELECT reviews.*, age_group FROM reviews JOIN users ON reviews.user_id = users.user_id LIMIT 1000
119
+ """,
120
+ engine=sqlite_engine,
121
+ )
122
+ return (polars_age_groups,)
123
+
124
+
125
+ @app.cell
126
+ def _(pl, polars_age_groups):
127
+ _mean_scores = pl.col("^score_.*$").mean()
128
+ _age_group_start = pl.col("age_group").str.slice(0, 2).cast(int)
129
+ polars_age_groups.group_by("age_group").agg(_mean_scores).sort(_age_group_start)
130
+ return
131
+
132
+
133
+ @app.cell(hide_code=True)
134
+ def _(mo):
135
+ mo.md(r"""
136
+ Although you could also calculate it directly in SQL, this gives you the flexibility to use polars for operations that are harder to describe in SQL
137
+ """)
138
+ return
139
+
140
+
141
+ @app.cell
142
+ def _(mo, reviews, sqlite_engine, users):
143
+ _df = mo.sql(
144
+ f"""
145
+ SELECT age_group, AVG(reviews.score_overall) FROM reviews JOIN users ON reviews.user_id = users.user_id GROUP BY age_group
146
+ """,
147
+ engine=sqlite_engine,
148
+ )
149
+ return
150
+
151
+
152
+ @app.cell(hide_code=True)
153
+ def _(mo):
154
+ mo.md(r"""
155
+ You can also use SQL Cells to query DataFrames via DuckDB, but remember to change the Engine from the SQLite engine to the DuckDB Memory engine when doing so.
156
+ """)
157
+ return
158
+
159
+
160
+ @app.cell
161
+ def _(mo, polars_age_groups):
162
+ _df = mo.sql(
163
+ f"""
164
+ SELECT * FROM polars_age_groups LIMIT 10
165
+ """
166
+ )
167
+ return
168
+
169
+
170
+ @app.cell(hide_code=True)
171
+ def _(mo):
172
+ mo.md(r"""
173
+ ### Using Polars directly
174
+ """)
175
+ return
176
+
177
+
178
+ @app.cell(hide_code=True)
179
+ def _(mo):
180
+ mo.md(r"""
181
+ Polars also offers some methods to interact with databases and to query DataFrames using SQL directly, which work the same way inside and outside of marimo.
182
+ """)
183
+ return
184
+
185
+
186
+ @app.cell(hide_code=True)
187
+ def _(mo):
188
+ mo.md(r"""
189
+ Reading data from Databases:
190
+ """)
191
+ return
192
+
193
+
194
+ @app.cell
195
+ def _(pl, sqlite_engine):
196
+ hotels = pl.read_database("SELECT * FROM hotels LIMIT 10", sqlite_engine)
197
+ hotels
198
+ return (hotels,)
199
+
200
+
201
+ @app.cell(hide_code=True)
202
+ def _(mo):
203
+ mo.md(r"""
204
+ Querying DataFrames with SQL:
205
+ """)
206
+ return
207
+
208
+
209
+ @app.cell
210
+ def _(hotels):
211
+ hotels.sql("SELECT * from self ORDER BY cleanliness_base DESC LIMIT 5")
212
+ return
213
+
214
+
215
+ @app.cell(hide_code=True)
216
+ def _(mo):
217
+ mo.md(r"""
218
+ ### Using DuckDB
219
+ """)
220
+ return
221
+
222
+
223
+ @app.cell(hide_code=True)
224
+ def _(mo):
225
+ mo.md(r"""
226
+ While marimo's SQL Cells are very practical and polars's direct methods are about as portable as it gets using polars, you can also use other libraries that integrate with polars via Arrow tables or input plugins.
227
+
228
+ One example of such integrations is DuckDB, which can be used with polars's Lazy mode as of 1.4.0
229
+ """)
230
+ return
231
+
232
+
233
+ @app.cell
234
+ def _(cached_file, duckdb):
235
+ duckdb_conn = duckdb.connect(cached_file)
236
+ return (duckdb_conn,)
237
+
238
+
239
+ @app.cell
240
+ def _(duckdb_conn):
241
+ # Loading into a normal DataFrame:
242
+ duckdb_conn.sql("SELECT * FROM hotels LIMIT 10").pl()
243
+ return
244
+
245
+
246
+ @app.cell
247
+ def _(duckdb_conn):
248
+ # Loading into a LazyFrame:
249
+ duckdb_conn.sql("SELECT * FROM hotels").pl(lazy=True).limit(10).collect()
250
+ return
251
+
252
+
253
+ @app.cell(hide_code=True)
254
+ def _(mo):
255
+ mo.md(r"""
256
+ Note that this is very similar to SQL cells backed by DuckDB, with the biggest difference being that you can control how the result is consumed as opposed to it always being loaded into memory.
257
+
258
+ Many features such as querying from DataFrames work the same using DuckDB directly as they do in DuckDB-backed SQL Cells, and vice-versa
259
+ """)
260
+ return
261
+
262
+
263
+ @app.cell
264
+ def _(duckdb, hotels):
265
+ duckdb.sql("SELECT * FROM hotels").pl(lazy=True).sort("cleanliness_base", descending=True).limit(5).collect()
266
+ return
267
+
268
+
269
+ @app.cell(hide_code=True)
270
+ def _(mo):
271
+ mo.md(r"""
272
+ ### Utilities
273
+ """)
274
+ return
275
+
276
+
277
+ @app.cell(hide_code=True)
278
+ def _(mo):
279
+ delete_file_button = mo.ui.run_button(label="Delete cached file", kind="warn")
280
+ mo.vstack([mo.md("If you want to delete the downloaded file from your cache"), delete_file_button])
281
+ return (delete_file_button,)
282
+
283
+
284
+ @app.cell
285
+ def _(cached_file, delete_file_button, pathlib):
286
+ if delete_file_button.value:
287
+ pathlib.Path(cached_file).unlink()
288
+ return
289
+
290
+
291
+ @app.cell
292
+ def _():
293
+ import marimo as mo
294
+ return (mo,)
295
+
296
+
297
+ @app.cell
298
+ def _():
299
+ import polars as pl
300
+ return (pl,)
301
+
302
+
303
+ @app.cell
304
+ def _():
305
+ import duckdb
306
+ return (duckdb,)
307
+
308
+
309
+ @app.cell
310
+ def _():
311
+ import sqlalchemy
312
+ return (sqlalchemy,)
313
+
314
+
315
+ @app.cell
316
+ def _():
317
+ import kagglehub
318
+ return (kagglehub,)
319
+
320
+
321
+ @app.cell
322
+ def _():
323
+ import pathlib
324
+ return (pathlib,)
325
+
326
+
327
+ if __name__ == "__main__":
328
+ app.run()