mt3842ml committed on
Commit
f1fc0cc
·
verified ·
1 Parent(s): 5607bc4

Upload praw.ipynb

Browse files
Files changed (1) hide show
  1. praw.ipynb +1152 -0
praw.ipynb ADDED
@@ -0,0 +1,1152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU",
17
+ "widgets": {
18
+ "application/vnd.jupyter.widget-state+json": {
19
+ "6a04ce68a3fd4e8a85eab7ef95b460bf": {
20
+ "model_module": "@jupyter-widgets/controls",
21
+ "model_name": "HBoxModel",
22
+ "model_module_version": "1.5.0",
23
+ "state": {
24
+ "_dom_classes": [],
25
+ "_model_module": "@jupyter-widgets/controls",
26
+ "_model_module_version": "1.5.0",
27
+ "_model_name": "HBoxModel",
28
+ "_view_count": null,
29
+ "_view_module": "@jupyter-widgets/controls",
30
+ "_view_module_version": "1.5.0",
31
+ "_view_name": "HBoxView",
32
+ "box_style": "",
33
+ "children": [
34
+ "IPY_MODEL_8d334cffae1e43d3965926cf47915d87",
35
+ "IPY_MODEL_9c4084b9c03c4b7d9fb6351d67e51181",
36
+ "IPY_MODEL_1af047bf93ef458ab60dec0419e162c0"
37
+ ],
38
+ "layout": "IPY_MODEL_ad9c688d0f8b4a94819008130cedbada"
39
+ }
40
+ },
41
+ "8d334cffae1e43d3965926cf47915d87": {
42
+ "model_module": "@jupyter-widgets/controls",
43
+ "model_name": "HTMLModel",
44
+ "model_module_version": "1.5.0",
45
+ "state": {
46
+ "_dom_classes": [],
47
+ "_model_module": "@jupyter-widgets/controls",
48
+ "_model_module_version": "1.5.0",
49
+ "_model_name": "HTMLModel",
50
+ "_view_count": null,
51
+ "_view_module": "@jupyter-widgets/controls",
52
+ "_view_module_version": "1.5.0",
53
+ "_view_name": "HTMLView",
54
+ "description": "",
55
+ "description_tooltip": null,
56
+ "layout": "IPY_MODEL_7fd0861c22a94f639cae337d38815752",
57
+ "placeholder": "​",
58
+ "style": "IPY_MODEL_13d05e0960ac4220a3874135a233b5b2",
59
+ "value": "100%"
60
+ }
61
+ },
62
+ "9c4084b9c03c4b7d9fb6351d67e51181": {
63
+ "model_module": "@jupyter-widgets/controls",
64
+ "model_name": "FloatProgressModel",
65
+ "model_module_version": "1.5.0",
66
+ "state": {
67
+ "_dom_classes": [],
68
+ "_model_module": "@jupyter-widgets/controls",
69
+ "_model_module_version": "1.5.0",
70
+ "_model_name": "FloatProgressModel",
71
+ "_view_count": null,
72
+ "_view_module": "@jupyter-widgets/controls",
73
+ "_view_module_version": "1.5.0",
74
+ "_view_name": "ProgressView",
75
+ "bar_style": "success",
76
+ "description": "",
77
+ "description_tooltip": null,
78
+ "layout": "IPY_MODEL_46b40383a6ce4ca2ae35faa715c87f87",
79
+ "max": 7,
80
+ "min": 0,
81
+ "orientation": "horizontal",
82
+ "style": "IPY_MODEL_2e2ec919da9240e48f7f85464bd8376b",
83
+ "value": 7
84
+ }
85
+ },
86
+ "1af047bf93ef458ab60dec0419e162c0": {
87
+ "model_module": "@jupyter-widgets/controls",
88
+ "model_name": "HTMLModel",
89
+ "model_module_version": "1.5.0",
90
+ "state": {
91
+ "_dom_classes": [],
92
+ "_model_module": "@jupyter-widgets/controls",
93
+ "_model_module_version": "1.5.0",
94
+ "_model_name": "HTMLModel",
95
+ "_view_count": null,
96
+ "_view_module": "@jupyter-widgets/controls",
97
+ "_view_module_version": "1.5.0",
98
+ "_view_name": "HTMLView",
99
+ "description": "",
100
+ "description_tooltip": null,
101
+ "layout": "IPY_MODEL_190b28c62a7548599de3cf1fe701c9e3",
102
+ "placeholder": "​",
103
+ "style": "IPY_MODEL_50c3e507389643ceaf02095a60d63045",
104
+ "value": " 7/7 [00:32<00:00,  4.32s/it]"
105
+ }
106
+ },
107
+ "ad9c688d0f8b4a94819008130cedbada": {
108
+ "model_module": "@jupyter-widgets/base",
109
+ "model_name": "LayoutModel",
110
+ "model_module_version": "1.2.0",
111
+ "state": {
112
+ "_model_module": "@jupyter-widgets/base",
113
+ "_model_module_version": "1.2.0",
114
+ "_model_name": "LayoutModel",
115
+ "_view_count": null,
116
+ "_view_module": "@jupyter-widgets/base",
117
+ "_view_module_version": "1.2.0",
118
+ "_view_name": "LayoutView",
119
+ "align_content": null,
120
+ "align_items": null,
121
+ "align_self": null,
122
+ "border": null,
123
+ "bottom": null,
124
+ "display": null,
125
+ "flex": null,
126
+ "flex_flow": null,
127
+ "grid_area": null,
128
+ "grid_auto_columns": null,
129
+ "grid_auto_flow": null,
130
+ "grid_auto_rows": null,
131
+ "grid_column": null,
132
+ "grid_gap": null,
133
+ "grid_row": null,
134
+ "grid_template_areas": null,
135
+ "grid_template_columns": null,
136
+ "grid_template_rows": null,
137
+ "height": null,
138
+ "justify_content": null,
139
+ "justify_items": null,
140
+ "left": null,
141
+ "margin": null,
142
+ "max_height": null,
143
+ "max_width": null,
144
+ "min_height": null,
145
+ "min_width": null,
146
+ "object_fit": null,
147
+ "object_position": null,
148
+ "order": null,
149
+ "overflow": null,
150
+ "overflow_x": null,
151
+ "overflow_y": null,
152
+ "padding": null,
153
+ "right": null,
154
+ "top": null,
155
+ "visibility": null,
156
+ "width": null
157
+ }
158
+ },
159
+ "7fd0861c22a94f639cae337d38815752": {
160
+ "model_module": "@jupyter-widgets/base",
161
+ "model_name": "LayoutModel",
162
+ "model_module_version": "1.2.0",
163
+ "state": {
164
+ "_model_module": "@jupyter-widgets/base",
165
+ "_model_module_version": "1.2.0",
166
+ "_model_name": "LayoutModel",
167
+ "_view_count": null,
168
+ "_view_module": "@jupyter-widgets/base",
169
+ "_view_module_version": "1.2.0",
170
+ "_view_name": "LayoutView",
171
+ "align_content": null,
172
+ "align_items": null,
173
+ "align_self": null,
174
+ "border": null,
175
+ "bottom": null,
176
+ "display": null,
177
+ "flex": null,
178
+ "flex_flow": null,
179
+ "grid_area": null,
180
+ "grid_auto_columns": null,
181
+ "grid_auto_flow": null,
182
+ "grid_auto_rows": null,
183
+ "grid_column": null,
184
+ "grid_gap": null,
185
+ "grid_row": null,
186
+ "grid_template_areas": null,
187
+ "grid_template_columns": null,
188
+ "grid_template_rows": null,
189
+ "height": null,
190
+ "justify_content": null,
191
+ "justify_items": null,
192
+ "left": null,
193
+ "margin": null,
194
+ "max_height": null,
195
+ "max_width": null,
196
+ "min_height": null,
197
+ "min_width": null,
198
+ "object_fit": null,
199
+ "object_position": null,
200
+ "order": null,
201
+ "overflow": null,
202
+ "overflow_x": null,
203
+ "overflow_y": null,
204
+ "padding": null,
205
+ "right": null,
206
+ "top": null,
207
+ "visibility": null,
208
+ "width": null
209
+ }
210
+ },
211
+ "13d05e0960ac4220a3874135a233b5b2": {
212
+ "model_module": "@jupyter-widgets/controls",
213
+ "model_name": "DescriptionStyleModel",
214
+ "model_module_version": "1.5.0",
215
+ "state": {
216
+ "_model_module": "@jupyter-widgets/controls",
217
+ "_model_module_version": "1.5.0",
218
+ "_model_name": "DescriptionStyleModel",
219
+ "_view_count": null,
220
+ "_view_module": "@jupyter-widgets/base",
221
+ "_view_module_version": "1.2.0",
222
+ "_view_name": "StyleView",
223
+ "description_width": ""
224
+ }
225
+ },
226
+ "46b40383a6ce4ca2ae35faa715c87f87": {
227
+ "model_module": "@jupyter-widgets/base",
228
+ "model_name": "LayoutModel",
229
+ "model_module_version": "1.2.0",
230
+ "state": {
231
+ "_model_module": "@jupyter-widgets/base",
232
+ "_model_module_version": "1.2.0",
233
+ "_model_name": "LayoutModel",
234
+ "_view_count": null,
235
+ "_view_module": "@jupyter-widgets/base",
236
+ "_view_module_version": "1.2.0",
237
+ "_view_name": "LayoutView",
238
+ "align_content": null,
239
+ "align_items": null,
240
+ "align_self": null,
241
+ "border": null,
242
+ "bottom": null,
243
+ "display": null,
244
+ "flex": null,
245
+ "flex_flow": null,
246
+ "grid_area": null,
247
+ "grid_auto_columns": null,
248
+ "grid_auto_flow": null,
249
+ "grid_auto_rows": null,
250
+ "grid_column": null,
251
+ "grid_gap": null,
252
+ "grid_row": null,
253
+ "grid_template_areas": null,
254
+ "grid_template_columns": null,
255
+ "grid_template_rows": null,
256
+ "height": null,
257
+ "justify_content": null,
258
+ "justify_items": null,
259
+ "left": null,
260
+ "margin": null,
261
+ "max_height": null,
262
+ "max_width": null,
263
+ "min_height": null,
264
+ "min_width": null,
265
+ "object_fit": null,
266
+ "object_position": null,
267
+ "order": null,
268
+ "overflow": null,
269
+ "overflow_x": null,
270
+ "overflow_y": null,
271
+ "padding": null,
272
+ "right": null,
273
+ "top": null,
274
+ "visibility": null,
275
+ "width": null
276
+ }
277
+ },
278
+ "2e2ec919da9240e48f7f85464bd8376b": {
279
+ "model_module": "@jupyter-widgets/controls",
280
+ "model_name": "ProgressStyleModel",
281
+ "model_module_version": "1.5.0",
282
+ "state": {
283
+ "_model_module": "@jupyter-widgets/controls",
284
+ "_model_module_version": "1.5.0",
285
+ "_model_name": "ProgressStyleModel",
286
+ "_view_count": null,
287
+ "_view_module": "@jupyter-widgets/base",
288
+ "_view_module_version": "1.2.0",
289
+ "_view_name": "StyleView",
290
+ "bar_color": null,
291
+ "description_width": ""
292
+ }
293
+ },
294
+ "190b28c62a7548599de3cf1fe701c9e3": {
295
+ "model_module": "@jupyter-widgets/base",
296
+ "model_name": "LayoutModel",
297
+ "model_module_version": "1.2.0",
298
+ "state": {
299
+ "_model_module": "@jupyter-widgets/base",
300
+ "_model_module_version": "1.2.0",
301
+ "_model_name": "LayoutModel",
302
+ "_view_count": null,
303
+ "_view_module": "@jupyter-widgets/base",
304
+ "_view_module_version": "1.2.0",
305
+ "_view_name": "LayoutView",
306
+ "align_content": null,
307
+ "align_items": null,
308
+ "align_self": null,
309
+ "border": null,
310
+ "bottom": null,
311
+ "display": null,
312
+ "flex": null,
313
+ "flex_flow": null,
314
+ "grid_area": null,
315
+ "grid_auto_columns": null,
316
+ "grid_auto_flow": null,
317
+ "grid_auto_rows": null,
318
+ "grid_column": null,
319
+ "grid_gap": null,
320
+ "grid_row": null,
321
+ "grid_template_areas": null,
322
+ "grid_template_columns": null,
323
+ "grid_template_rows": null,
324
+ "height": null,
325
+ "justify_content": null,
326
+ "justify_items": null,
327
+ "left": null,
328
+ "margin": null,
329
+ "max_height": null,
330
+ "max_width": null,
331
+ "min_height": null,
332
+ "min_width": null,
333
+ "object_fit": null,
334
+ "object_position": null,
335
+ "order": null,
336
+ "overflow": null,
337
+ "overflow_x": null,
338
+ "overflow_y": null,
339
+ "padding": null,
340
+ "right": null,
341
+ "top": null,
342
+ "visibility": null,
343
+ "width": null
344
+ }
345
+ },
346
+ "50c3e507389643ceaf02095a60d63045": {
347
+ "model_module": "@jupyter-widgets/controls",
348
+ "model_name": "DescriptionStyleModel",
349
+ "model_module_version": "1.5.0",
350
+ "state": {
351
+ "_model_module": "@jupyter-widgets/controls",
352
+ "_model_module_version": "1.5.0",
353
+ "_model_name": "DescriptionStyleModel",
354
+ "_view_count": null,
355
+ "_view_module": "@jupyter-widgets/base",
356
+ "_view_module_version": "1.2.0",
357
+ "_view_name": "StyleView",
358
+ "description_width": ""
359
+ }
360
+ },
361
+ "d67f5fd0199a41bb949a974fbd3ffab2": {
362
+ "model_module": "@jupyter-widgets/controls",
363
+ "model_name": "HBoxModel",
364
+ "model_module_version": "1.5.0",
365
+ "state": {
366
+ "_dom_classes": [],
367
+ "_model_module": "@jupyter-widgets/controls",
368
+ "_model_module_version": "1.5.0",
369
+ "_model_name": "HBoxModel",
370
+ "_view_count": null,
371
+ "_view_module": "@jupyter-widgets/controls",
372
+ "_view_module_version": "1.5.0",
373
+ "_view_name": "HBoxView",
374
+ "box_style": "",
375
+ "children": [
376
+ "IPY_MODEL_2113f2a1b6ab40fd8232aeec27a18443",
377
+ "IPY_MODEL_a9d7ad46be6747ffbab4b2c8614ddce8",
378
+ "IPY_MODEL_a6ff8a0f04944a7ea4c310d21d6ffd0b"
379
+ ],
380
+ "layout": "IPY_MODEL_05325e3c286d49d1a2f4b07ca83f4932"
381
+ }
382
+ },
383
+ "2113f2a1b6ab40fd8232aeec27a18443": {
384
+ "model_module": "@jupyter-widgets/controls",
385
+ "model_name": "HTMLModel",
386
+ "model_module_version": "1.5.0",
387
+ "state": {
388
+ "_dom_classes": [],
389
+ "_model_module": "@jupyter-widgets/controls",
390
+ "_model_module_version": "1.5.0",
391
+ "_model_name": "HTMLModel",
392
+ "_view_count": null,
393
+ "_view_module": "@jupyter-widgets/controls",
394
+ "_view_module_version": "1.5.0",
395
+ "_view_name": "HTMLView",
396
+ "description": "",
397
+ "description_tooltip": null,
398
+ "layout": "IPY_MODEL_fbd9cc97dc474c7d99a7e2a9c4c9e598",
399
+ "placeholder": "​",
400
+ "style": "IPY_MODEL_1ac94815802c4d60b605e5a6bf375349",
401
+ "value": "Map: 100%"
402
+ }
403
+ },
404
+ "a9d7ad46be6747ffbab4b2c8614ddce8": {
405
+ "model_module": "@jupyter-widgets/controls",
406
+ "model_name": "FloatProgressModel",
407
+ "model_module_version": "1.5.0",
408
+ "state": {
409
+ "_dom_classes": [],
410
+ "_model_module": "@jupyter-widgets/controls",
411
+ "_model_module_version": "1.5.0",
412
+ "_model_name": "FloatProgressModel",
413
+ "_view_count": null,
414
+ "_view_module": "@jupyter-widgets/controls",
415
+ "_view_module_version": "1.5.0",
416
+ "_view_name": "ProgressView",
417
+ "bar_style": "success",
418
+ "description": "",
419
+ "description_tooltip": null,
420
+ "layout": "IPY_MODEL_bf3fd415224e4c379013624bf4701a80",
421
+ "max": 10,
422
+ "min": 0,
423
+ "orientation": "horizontal",
424
+ "style": "IPY_MODEL_4d35d12cf5774b3ba9958bced27b2444",
425
+ "value": 10
426
+ }
427
+ },
428
+ "a6ff8a0f04944a7ea4c310d21d6ffd0b": {
429
+ "model_module": "@jupyter-widgets/controls",
430
+ "model_name": "HTMLModel",
431
+ "model_module_version": "1.5.0",
432
+ "state": {
433
+ "_dom_classes": [],
434
+ "_model_module": "@jupyter-widgets/controls",
435
+ "_model_module_version": "1.5.0",
436
+ "_model_name": "HTMLModel",
437
+ "_view_count": null,
438
+ "_view_module": "@jupyter-widgets/controls",
439
+ "_view_module_version": "1.5.0",
440
+ "_view_name": "HTMLView",
441
+ "description": "",
442
+ "description_tooltip": null,
443
+ "layout": "IPY_MODEL_c2fd01b3bf494abdbae47574a8cdb367",
444
+ "placeholder": "​",
445
+ "style": "IPY_MODEL_01248f0d5d33408b8038c1aefcf52888",
446
+ "value": " 10/10 [00:00<00:00, 290.58 examples/s]"
447
+ }
448
+ },
449
+ "05325e3c286d49d1a2f4b07ca83f4932": {
450
+ "model_module": "@jupyter-widgets/base",
451
+ "model_name": "LayoutModel",
452
+ "model_module_version": "1.2.0",
453
+ "state": {
454
+ "_model_module": "@jupyter-widgets/base",
455
+ "_model_module_version": "1.2.0",
456
+ "_model_name": "LayoutModel",
457
+ "_view_count": null,
458
+ "_view_module": "@jupyter-widgets/base",
459
+ "_view_module_version": "1.2.0",
460
+ "_view_name": "LayoutView",
461
+ "align_content": null,
462
+ "align_items": null,
463
+ "align_self": null,
464
+ "border": null,
465
+ "bottom": null,
466
+ "display": null,
467
+ "flex": null,
468
+ "flex_flow": null,
469
+ "grid_area": null,
470
+ "grid_auto_columns": null,
471
+ "grid_auto_flow": null,
472
+ "grid_auto_rows": null,
473
+ "grid_column": null,
474
+ "grid_gap": null,
475
+ "grid_row": null,
476
+ "grid_template_areas": null,
477
+ "grid_template_columns": null,
478
+ "grid_template_rows": null,
479
+ "height": null,
480
+ "justify_content": null,
481
+ "justify_items": null,
482
+ "left": null,
483
+ "margin": null,
484
+ "max_height": null,
485
+ "max_width": null,
486
+ "min_height": null,
487
+ "min_width": null,
488
+ "object_fit": null,
489
+ "object_position": null,
490
+ "order": null,
491
+ "overflow": null,
492
+ "overflow_x": null,
493
+ "overflow_y": null,
494
+ "padding": null,
495
+ "right": null,
496
+ "top": null,
497
+ "visibility": null,
498
+ "width": null
499
+ }
500
+ },
501
+ "fbd9cc97dc474c7d99a7e2a9c4c9e598": {
502
+ "model_module": "@jupyter-widgets/base",
503
+ "model_name": "LayoutModel",
504
+ "model_module_version": "1.2.0",
505
+ "state": {
506
+ "_model_module": "@jupyter-widgets/base",
507
+ "_model_module_version": "1.2.0",
508
+ "_model_name": "LayoutModel",
509
+ "_view_count": null,
510
+ "_view_module": "@jupyter-widgets/base",
511
+ "_view_module_version": "1.2.0",
512
+ "_view_name": "LayoutView",
513
+ "align_content": null,
514
+ "align_items": null,
515
+ "align_self": null,
516
+ "border": null,
517
+ "bottom": null,
518
+ "display": null,
519
+ "flex": null,
520
+ "flex_flow": null,
521
+ "grid_area": null,
522
+ "grid_auto_columns": null,
523
+ "grid_auto_flow": null,
524
+ "grid_auto_rows": null,
525
+ "grid_column": null,
526
+ "grid_gap": null,
527
+ "grid_row": null,
528
+ "grid_template_areas": null,
529
+ "grid_template_columns": null,
530
+ "grid_template_rows": null,
531
+ "height": null,
532
+ "justify_content": null,
533
+ "justify_items": null,
534
+ "left": null,
535
+ "margin": null,
536
+ "max_height": null,
537
+ "max_width": null,
538
+ "min_height": null,
539
+ "min_width": null,
540
+ "object_fit": null,
541
+ "object_position": null,
542
+ "order": null,
543
+ "overflow": null,
544
+ "overflow_x": null,
545
+ "overflow_y": null,
546
+ "padding": null,
547
+ "right": null,
548
+ "top": null,
549
+ "visibility": null,
550
+ "width": null
551
+ }
552
+ },
553
+ "1ac94815802c4d60b605e5a6bf375349": {
554
+ "model_module": "@jupyter-widgets/controls",
555
+ "model_name": "DescriptionStyleModel",
556
+ "model_module_version": "1.5.0",
557
+ "state": {
558
+ "_model_module": "@jupyter-widgets/controls",
559
+ "_model_module_version": "1.5.0",
560
+ "_model_name": "DescriptionStyleModel",
561
+ "_view_count": null,
562
+ "_view_module": "@jupyter-widgets/base",
563
+ "_view_module_version": "1.2.0",
564
+ "_view_name": "StyleView",
565
+ "description_width": ""
566
+ }
567
+ },
568
+ "bf3fd415224e4c379013624bf4701a80": {
569
+ "model_module": "@jupyter-widgets/base",
570
+ "model_name": "LayoutModel",
571
+ "model_module_version": "1.2.0",
572
+ "state": {
573
+ "_model_module": "@jupyter-widgets/base",
574
+ "_model_module_version": "1.2.0",
575
+ "_model_name": "LayoutModel",
576
+ "_view_count": null,
577
+ "_view_module": "@jupyter-widgets/base",
578
+ "_view_module_version": "1.2.0",
579
+ "_view_name": "LayoutView",
580
+ "align_content": null,
581
+ "align_items": null,
582
+ "align_self": null,
583
+ "border": null,
584
+ "bottom": null,
585
+ "display": null,
586
+ "flex": null,
587
+ "flex_flow": null,
588
+ "grid_area": null,
589
+ "grid_auto_columns": null,
590
+ "grid_auto_flow": null,
591
+ "grid_auto_rows": null,
592
+ "grid_column": null,
593
+ "grid_gap": null,
594
+ "grid_row": null,
595
+ "grid_template_areas": null,
596
+ "grid_template_columns": null,
597
+ "grid_template_rows": null,
598
+ "height": null,
599
+ "justify_content": null,
600
+ "justify_items": null,
601
+ "left": null,
602
+ "margin": null,
603
+ "max_height": null,
604
+ "max_width": null,
605
+ "min_height": null,
606
+ "min_width": null,
607
+ "object_fit": null,
608
+ "object_position": null,
609
+ "order": null,
610
+ "overflow": null,
611
+ "overflow_x": null,
612
+ "overflow_y": null,
613
+ "padding": null,
614
+ "right": null,
615
+ "top": null,
616
+ "visibility": null,
617
+ "width": null
618
+ }
619
+ },
620
+ "4d35d12cf5774b3ba9958bced27b2444": {
621
+ "model_module": "@jupyter-widgets/controls",
622
+ "model_name": "ProgressStyleModel",
623
+ "model_module_version": "1.5.0",
624
+ "state": {
625
+ "_model_module": "@jupyter-widgets/controls",
626
+ "_model_module_version": "1.5.0",
627
+ "_model_name": "ProgressStyleModel",
628
+ "_view_count": null,
629
+ "_view_module": "@jupyter-widgets/base",
630
+ "_view_module_version": "1.2.0",
631
+ "_view_name": "StyleView",
632
+ "bar_color": null,
633
+ "description_width": ""
634
+ }
635
+ },
636
+ "c2fd01b3bf494abdbae47574a8cdb367": {
637
+ "model_module": "@jupyter-widgets/base",
638
+ "model_name": "LayoutModel",
639
+ "model_module_version": "1.2.0",
640
+ "state": {
641
+ "_model_module": "@jupyter-widgets/base",
642
+ "_model_module_version": "1.2.0",
643
+ "_model_name": "LayoutModel",
644
+ "_view_count": null,
645
+ "_view_module": "@jupyter-widgets/base",
646
+ "_view_module_version": "1.2.0",
647
+ "_view_name": "LayoutView",
648
+ "align_content": null,
649
+ "align_items": null,
650
+ "align_self": null,
651
+ "border": null,
652
+ "bottom": null,
653
+ "display": null,
654
+ "flex": null,
655
+ "flex_flow": null,
656
+ "grid_area": null,
657
+ "grid_auto_columns": null,
658
+ "grid_auto_flow": null,
659
+ "grid_auto_rows": null,
660
+ "grid_column": null,
661
+ "grid_gap": null,
662
+ "grid_row": null,
663
+ "grid_template_areas": null,
664
+ "grid_template_columns": null,
665
+ "grid_template_rows": null,
666
+ "height": null,
667
+ "justify_content": null,
668
+ "justify_items": null,
669
+ "left": null,
670
+ "margin": null,
671
+ "max_height": null,
672
+ "max_width": null,
673
+ "min_height": null,
674
+ "min_width": null,
675
+ "object_fit": null,
676
+ "object_position": null,
677
+ "order": null,
678
+ "overflow": null,
679
+ "overflow_x": null,
680
+ "overflow_y": null,
681
+ "padding": null,
682
+ "right": null,
683
+ "top": null,
684
+ "visibility": null,
685
+ "width": null
686
+ }
687
+ },
688
+ "01248f0d5d33408b8038c1aefcf52888": {
689
+ "model_module": "@jupyter-widgets/controls",
690
+ "model_name": "DescriptionStyleModel",
691
+ "model_module_version": "1.5.0",
692
+ "state": {
693
+ "_model_module": "@jupyter-widgets/controls",
694
+ "_model_module_version": "1.5.0",
695
+ "_model_name": "DescriptionStyleModel",
696
+ "_view_count": null,
697
+ "_view_module": "@jupyter-widgets/base",
698
+ "_view_module_version": "1.2.0",
699
+ "_view_name": "StyleView",
700
+ "description_width": ""
701
+ }
702
+ }
703
+ }
704
+ }
705
+ },
706
+ "cells": [
707
+ {
708
+ "cell_type": "code",
709
+ "execution_count": null,
710
+ "metadata": {
711
+ "id": "BiY1TYKVyZnF"
712
+ },
713
+ "outputs": [],
714
+ "source": [
715
+ "!pip install praw\n",
716
+ "!pip install pinecone\n",
717
+ "!pip install semantic-router\n",
718
+ "!pip install datasets"
719
+ ]
720
+ },
721
+ {
722
+ "cell_type": "code",
723
+ "source": [
724
+ "!pip install numpy==1.26.0"
725
+ ],
726
+ "metadata": {
727
+ "id": "k_b_EgJVTnwQ"
728
+ },
729
+ "execution_count": null,
730
+ "outputs": []
731
+ },
732
+ {
733
+ "cell_type": "code",
734
+ "source": [
735
+ "!pip install semantic-router[local]"
736
+ ],
737
+ "metadata": {
738
+ "id": "yb9c4QahkH6p"
739
+ },
740
+ "execution_count": null,
741
+ "outputs": []
742
+ },
743
+ {
744
+ "cell_type": "markdown",
745
+ "source": [
746
+ "Collecting Data"
747
+ ],
748
+ "metadata": {
749
+ "id": "GkgPDkwzHwHX"
750
+ }
751
+ },
752
+ {
753
+ "cell_type": "code",
754
+ "source": [
755
+ "import praw\n",
756
+ "from google.colab import userdata\n",
757
+ "\n",
758
+ "reddit = praw.Reddit(\n",
759
+ " client_id=userdata.get('REDDIT_CLIENT_ID'),\n",
760
+ " client_secret=userdata.get('REDDIT_CLIENT_SECRET'),\n",
761
+ " user_agent=userdata.get('REDDIT_USER_AGENT'),\n",
762
+ ")"
763
+ ],
764
+ "metadata": {
765
+ "id": "bmzp0IKW04XI"
766
+ },
767
+ "execution_count": null,
768
+ "outputs": []
769
+ },
770
+ {
771
+ "cell_type": "code",
772
+ "source": [
773
+ "def submissionToDict(submission):\n",
774
+ " submissionAsDict = {}\n",
775
+ " submissionAsDict['id'] = submission.id\n",
776
+ " # Metadata is stored directly as a dictionary with 'title' and 'content' keys\n",
777
+ " submissionAsDict['metadata'] = {\n",
778
+ " 'title': submission.title,\n",
779
+ " 'content': '\\n'.join([comment.body for comment in submission.comments.list() if isinstance(comment, praw.models.Comment)]) # Join comments into a single string, but only if it's a Comment object\n",
780
+ " }\n",
781
+ " return submissionAsDict"
782
+ ],
783
+ "metadata": {
784
+ "id": "D-n4R8Gh7JQH"
785
+ },
786
+ "execution_count": null,
787
+ "outputs": []
788
+ },
789
+ {
790
+ "cell_type": "code",
791
+ "source": [
792
+ "from IPython.display import clear_output\n",
793
+ "\n",
794
+ "data = []\n",
795
+ "subreddit = reddit.subreddit(\"AskNYC\")\n",
796
+ "for submission in subreddit.hot(limit=10):\n",
797
+ " data.append(submissionToDict(submission)) # Convert the submission to a dict record (synchronous call)\n",
798
+ "\n",
799
+ "clear_output()"
800
+ ],
801
+ "metadata": {
802
+ "id": "PrZKgKJO7Ygh"
803
+ },
804
+ "execution_count": null,
805
+ "outputs": []
806
+ },
807
+ {
808
+ "cell_type": "code",
809
+ "source": [
810
+ "from datasets import Dataset\n",
811
+ "\n",
812
+ "# Convert your existing 'data' list into a Dataset object\n",
813
+ "data = Dataset.from_list(data)\n",
814
+ "\n",
815
+ "# Apply the mapping function to structure the data\n",
816
+ "data = data.map(lambda x: {\n",
817
+ " \"id\": x[\"id\"],\n",
818
+ " \"metadata\": {\n",
819
+ " \"title\": x[\"metadata\"][\"title\"], # Access title from metadata\n",
820
+ " \"content\": x[\"metadata\"][\"content\"], # Access content from metadata\n",
821
+ " }\n",
822
+ "})\n",
823
+ "\n",
824
+ "# Since you don't have the extra columns in your original data\n",
825
+ "# you can skip the remove_columns step\n",
826
+ "\n",
827
+ "# Now 'data' is a Dataset object\n",
828
+ "print(data)\n"
829
+ ],
830
+ "metadata": {
831
+ "colab": {
832
+ "base_uri": "https://localhost:8080/",
833
+ "height": 120,
834
+ "referenced_widgets": [
835
+ "d67f5fd0199a41bb949a974fbd3ffab2",
836
+ "2113f2a1b6ab40fd8232aeec27a18443",
837
+ "a9d7ad46be6747ffbab4b2c8614ddce8",
838
+ "a6ff8a0f04944a7ea4c310d21d6ffd0b",
839
+ "05325e3c286d49d1a2f4b07ca83f4932",
840
+ "fbd9cc97dc474c7d99a7e2a9c4c9e598",
841
+ "1ac94815802c4d60b605e5a6bf375349",
842
+ "bf3fd415224e4c379013624bf4701a80",
843
+ "4d35d12cf5774b3ba9958bced27b2444",
844
+ "c2fd01b3bf494abdbae47574a8cdb367",
845
+ "01248f0d5d33408b8038c1aefcf52888"
846
+ ]
847
+ },
848
+ "id": "Aw2zNQW0MRgv",
849
+ "outputId": "655bf508-a1a2-4ef3-fc46-0db101321d0c"
850
+ },
851
+ "execution_count": null,
852
+ "outputs": [
853
+ {
854
+ "output_type": "display_data",
855
+ "data": {
856
+ "text/plain": [
857
+ "Map: 0%| | 0/10 [00:00<?, ? examples/s]"
858
+ ],
859
+ "application/vnd.jupyter.widget-view+json": {
860
+ "version_major": 2,
861
+ "version_minor": 0,
862
+ "model_id": "d67f5fd0199a41bb949a974fbd3ffab2"
863
+ }
864
+ },
865
+ "metadata": {}
866
+ },
867
+ {
868
+ "output_type": "stream",
869
+ "name": "stdout",
870
+ "text": [
871
+ "Dataset({\n",
872
+ " features: ['id', 'metadata'],\n",
873
+ " num_rows: 10\n",
874
+ "})\n"
875
+ ]
876
+ }
877
+ ]
878
+ },
879
+ {
880
+ "cell_type": "markdown",
881
+ "source": [
882
+ "Load the embedding model and connect to Pinecone"
883
+ ],
884
+ "metadata": {
885
+ "id": "9CFcqKNtHx7P"
886
+ }
887
+ },
888
+ {
889
+ "cell_type": "code",
890
+ "source": [
891
+ "from semantic_router.encoders import HuggingFaceEncoder\n",
892
+ "\n",
893
+ "encoder = HuggingFaceEncoder(name=\"dwzhu/e5-base-4k\")\n",
894
+ "embeds = encoder([\"this is a test\"])\n",
895
+ "dims = len(embeds[0])"
896
+ ],
897
+ "metadata": {
898
+ "id": "7ANnOvOAH5EP"
899
+ },
900
+ "execution_count": null,
901
+ "outputs": []
902
+ },
903
+ {
904
+ "cell_type": "code",
905
+ "source": [
906
+ "import os\n",
907
+ "import getpass\n",
908
+ "from pinecone import Pinecone\n",
909
+ "\n",
910
+ "# initialize connection to pinecone (get API key at app.pinecone.io)\n",
911
+ "api_key = userdata.get('PINECONE_API_KEY')\n",
912
+ "\n",
913
+ "# configure client\n",
914
+ "pc = Pinecone(api_key=api_key)\n"
915
+ ],
916
+ "metadata": {
917
+ "id": "9Vob3-UwJd3q"
918
+ },
919
+ "execution_count": null,
920
+ "outputs": []
921
+ },
922
+ {
923
+ "cell_type": "code",
924
+ "source": [
925
+ "# configure client\n",
926
+ "pc = Pinecone(api_key=api_key)\n",
927
+ "\n",
928
+ "from pinecone import ServerlessSpec\n",
929
+ "\n",
930
+ "spec = ServerlessSpec(\n",
931
+ " cloud=\"aws\", region=\"us-east-1\"\n",
932
+ ")\n"
933
+ ],
934
+ "metadata": {
935
+ "id": "U4EjLI9HJU9U"
936
+ },
937
+ "execution_count": null,
938
+ "outputs": []
939
+ },
940
+ {
941
+ "cell_type": "code",
942
+ "source": [
943
+ "import time\n",
944
+ "\n",
945
+ "index_name = \"groq-llama-3-rag\"\n",
946
+ "existing_indexes = [\n",
947
+ " index_info[\"name\"] for index_info in pc.list_indexes()\n",
948
+ "]\n",
949
+ "\n",
950
+ "# check if index already exists (it shouldn't if this is first time)\n",
951
+ "if index_name not in existing_indexes:\n",
952
+ " # if does not exist, create index\n",
953
+ " pc.create_index(\n",
954
+ " index_name,\n",
955
+ " dimension=dims,\n",
956
+ " metric='cosine',\n",
957
+ " spec=spec\n",
958
+ " )\n",
959
+ " # wait for index to be initialized\n",
960
+ " while not pc.describe_index(index_name).status['ready']:\n",
961
+ " time.sleep(1)\n",
962
+ "\n",
963
+ "# connect to index\n",
964
+ "index = pc.Index(index_name)\n",
965
+ "time.sleep(1)\n",
966
+ "# view index stats\n",
967
+ "index.describe_index_stats()"
968
+ ],
969
+ "metadata": {
970
+ "id": "1HISfuQBFKHD"
971
+ },
972
+ "execution_count": null,
973
+ "outputs": []
974
+ },
975
+ {
976
+ "cell_type": "code",
977
+ "source": [
978
+ "from tqdm.auto import tqdm\n",
979
+ "\n",
980
+ "batch_size = 128 # how many embeddings we create and insert at once\n",
981
+ "\n",
982
+ "for i in tqdm(range(0, len(data), batch_size)):\n",
983
+ " # find end of batch\n",
984
+ " i_end = min(len(data), i+batch_size)\n",
985
+ " # create batch\n",
986
+ " batch = data[i:i_end]\n",
987
+ " # create embeddings\n",
988
+ " chunks = [f'{x[\"title\"]}: {x[\"content\"]}' for x in batch[\"metadata\"]]\n",
989
+ " embeds = encoder(chunks)\n",
990
+ " assert len(embeds) == (i_end-i)\n",
991
+ " to_upsert = list(zip(batch[\"id\"], embeds, batch[\"metadata\"]))\n",
992
+ " # upsert to Pinecone\n",
993
+ " index.upsert(vectors=to_upsert)"
994
+ ],
995
+ "metadata": {
996
+ "id": "W0EVhOpfLzlf"
997
+ },
998
+ "execution_count": null,
999
+ "outputs": []
1000
+ },
1001
+ {
1002
+ "cell_type": "markdown",
1003
+ "source": [
1004
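+ "Now apply the same pipeline at a larger scale (the next cell pulls up to 1000 hot posts from AskNYC; change the subreddit name to cover different subreddits)"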
1005
+ ],
1006
+ "metadata": {
1007
+ "id": "BzQi9RKKQQN4"
1008
+ }
1009
+ },
1010
+ {
1011
+ "cell_type": "code",
1012
+ "source": [
1013
+ "\n",
1014
+ "data = []\n",
1015
+ "subreddit = reddit.subreddit(\"AskNYC\")\n",
1016
+ "for submission in subreddit.hot(limit=1000):\n",
1017
+ " data.append(submissionToDict(submission)) # collect the record built by submissionToDict (called directly, no await)\n",
1018
+ "\n",
1019
+ "# Convert your existing 'data' list into a Dataset object\n",
1020
+ "data = Dataset.from_list(data)\n",
1021
+ "\n",
1022
+ "# Apply the mapping function to structure the data\n",
1023
+ "data = data.map(lambda x: {\n",
1024
+ " \"id\": x[\"id\"],\n",
1025
+ " \"metadata\": {\n",
1026
+ " \"title\": x[\"metadata\"][\"title\"], # Access title from metadata\n",
1027
+ " \"content\": x[\"metadata\"][\"content\"], # Access content from metadata\n",
1028
+ " }\n",
1029
+ "})\n"
1030
+ ],
1031
+ "metadata": {
1032
+ "id": "PxDIYKlHQMkf"
1033
+ },
1034
+ "execution_count": null,
1035
+ "outputs": []
1036
+ },
1037
+ {
1038
+ "cell_type": "code",
1039
+ "source": [
1040
+ "from tqdm.auto import tqdm\n",
1041
+ "\n",
1042
+ "batch_size = 128 # how many embeddings we create and insert at once\n",
1043
+ "\n",
1044
+ "for i in tqdm(range(0, len(data), batch_size)):\n",
1045
+ " # find end of batch\n",
1046
+ " i_end = min(len(data), i + batch_size)\n",
1047
+ " # create batch\n",
1048
+ " batch = data[i:i_end]\n",
1049
+ " # create embeddings\n",
1050
+ " chunks = [f'{x[\"title\"]}: {x[\"content\"][:1000]}' for x in batch[\"metadata\"]] # Truncate content to 1000 characters\n",
1051
+ " embeds = encoder(chunks)\n",
1052
+ " assert len(embeds) == (i_end - i)\n",
1053
+ " # Reduce metadata size before upserting\n",
1054
+ " metadata_to_upsert = [{'title': x['title'], 'content_snippet': x['content'][:2000]} for x in batch['metadata']] # Truncate content snippet to 2000 characters\n",
1055
+ " to_upsert = list(zip(batch[\"id\"], embeds, metadata_to_upsert)) # Use the reduced metadata\n",
1056
+ " # upsert to Pinecone\n",
1057
+ " index.upsert(vectors=to_upsert)"
1058
+ ],
1059
+ "metadata": {
1060
+ "colab": {
1061
+ "base_uri": "https://localhost:8080/",
1062
+ "height": 49,
1063
+ "referenced_widgets": [
1064
+ "6a04ce68a3fd4e8a85eab7ef95b460bf",
1065
+ "8d334cffae1e43d3965926cf47915d87",
1066
+ "9c4084b9c03c4b7d9fb6351d67e51181",
1067
+ "1af047bf93ef458ab60dec0419e162c0",
1068
+ "ad9c688d0f8b4a94819008130cedbada",
1069
+ "7fd0861c22a94f639cae337d38815752",
1070
+ "13d05e0960ac4220a3874135a233b5b2",
1071
+ "46b40383a6ce4ca2ae35faa715c87f87",
1072
+ "2e2ec919da9240e48f7f85464bd8376b",
1073
+ "190b28c62a7548599de3cf1fe701c9e3",
1074
+ "50c3e507389643ceaf02095a60d63045"
1075
+ ]
1076
+ },
1077
+ "id": "jRR61RObW3aM",
1078
+ "outputId": "a2e5ba63-25de-4a94-9c4e-873003835eb5"
1079
+ },
1080
+ "execution_count": null,
1081
+ "outputs": [
1082
+ {
1083
+ "output_type": "display_data",
1084
+ "data": {
1085
+ "text/plain": [
1086
+ " 0%| | 0/7 [00:00<?, ?it/s]"
1087
+ ],
1088
+ "application/vnd.jupyter.widget-view+json": {
1089
+ "version_major": 2,
1090
+ "version_minor": 0,
1091
+ "model_id": "6a04ce68a3fd4e8a85eab7ef95b460bf"
1092
+ }
1093
+ },
1094
+ "metadata": {}
1095
+ }
1096
+ ]
1097
+ },
1098
+ {
1099
+ "cell_type": "markdown",
1100
+ "source": [
1101
+ "Repeat the same steps for the borough subreddits (Manhattan, Bronx, Brooklyn, Queens, StatenIsland)"
1102
+ ],
1103
+ "metadata": {
1104
+ "id": "MzEZK65_Xeyo"
1105
+ }
1106
+ },
1107
+ {
1108
+ "cell_type": "code",
1109
+ "source": [
1110
+ "for item in ['Manhattan','Bronx', 'Brooklyn', 'Queens', 'StatenIsland']:\n",
1111
+ " data = []\n",
1112
+ " subreddit = reddit.subreddit(item)\n",
1113
+ " for submission in subreddit.hot(limit=256):\n",
1114
+ " data.append(submissionToDict(submission)) # collect the record built by submissionToDict (called directly, no await)\n",
1115
+ "\n",
1116
+ " # Convert your existing 'data' list into a Dataset object\n",
1117
+ " data = Dataset.from_list(data)\n",
1118
+ "\n",
1119
+ " # Apply the mapping function to structure the data\n",
1120
+ " data = data.map(lambda x: {\n",
1121
+ " \"id\": x[\"id\"],\n",
1122
+ " \"metadata\": {\n",
1123
+ " \"title\": x[\"metadata\"][\"title\"], # Access title from metadata\n",
1124
+ " \"content\": x[\"metadata\"][\"content\"], # Access content from metadata\n",
1125
+ " }\n",
1126
+ " })\n",
1127
+ "\n",
1128
+ " batch_size = 128 # how many embeddings we create and insert at once\n",
1129
+ "\n",
1130
+ " for i in tqdm(range(0, len(data), batch_size)):\n",
1131
+ " # find end of batch\n",
1132
+ " i_end = min(len(data), i + batch_size)\n",
1133
+ " # create batch\n",
1134
+ " batch = data[i:i_end]\n",
1135
+ " # create embeddings\n",
1136
+ " chunks = [f'{x[\"title\"]}: {x[\"content\"][:1000]}' for x in batch[\"metadata\"]] # Truncate content to 1000 characters\n",
1137
+ " embeds = encoder(chunks)\n",
1138
+ " assert len(embeds) == (i_end - i)\n",
1139
+ " # Reduce metadata size before upserting\n",
1140
+ " metadata_to_upsert = [{'title': x['title'], 'content_snippet': x['content'][:2000]} for x in batch['metadata']] # Truncate content snippet to 2000 characters\n",
1141
+ " to_upsert = list(zip(batch[\"id\"], embeds, metadata_to_upsert)) # Use the reduced metadata\n",
1142
+ " # upsert to Pinecone\n",
1143
+ " index.upsert(vectors=to_upsert)\n"
1144
+ ],
1145
+ "metadata": {
1146
+ "id": "QD0xto7KYHCv"
1147
+ },
1148
+ "execution_count": null,
1149
+ "outputs": []
1150
+ }
1151
+ ]
1152
+ }