MVPilgrim committed on
Commit
909f4bf
·
1 Parent(s): ec89ccb
Files changed (1) hide show
  1. semsearch.py +105 -2
semsearch.py CHANGED
@@ -198,8 +198,111 @@ logger.debug(f"### webpageDocNames: {webpageDocNames}")
198
 
199
  ######################################################
200
  # Create database webpage and chunks collections.
201
- wpCollection = createWebpageCollection()
202
- wpChunkCollection = createChunksCollection()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  ###########################################################
205
  # Create document and chunks objects in the database.
 
198
 
199
  ######################################################
200
  # Create database webpage and chunks collections.
201
+ #wpCollection = createWebpageCollection()
202
+ #wpChunkCollection = createChunksCollection()
203
+
204
+ logger.info("#### createChunksCollection() entered.")
205
+ if client.collections.exists("Chunks"):
206
+ client.collections.delete("Chunks")
207
+
208
+ class_obj = {
209
+ "class": "Chunks",
210
+ "description": "Collection for document chunks.",
211
+ "vectorizer": "text2vec-transformers",
212
+ "moduleConfig": {
213
+ "text2vec-transformers": {
214
+ "vectorizeClassName": True
215
+ }
216
+ },
217
+ "vectorIndexType": "hnsw",
218
+ "vectorIndexConfig": {
219
+ "distance": "cosine",
220
+ },
221
+ "properties": [
222
+ {
223
+ "name": "chunk",
224
+ "dataType": ["text"],
225
+ "description": "Single webpage chunk.",
226
+ "vectorizer": "text2vec-transformers",
227
+ "moduleConfig": {
228
+ "text2vec-transformers": {
229
+ "vectorizePropertyName": False,
230
+ "skip": False,
231
+ "tokenization": "lowercase"
232
+ }
233
+ }
234
+ },
235
+ {
236
+ "name": "chunk_index",
237
+ "dataType": ["int"]
238
+ },
239
+ {
240
+ "name": "webpage",
241
+ "dataType": ["Documents"],
242
+ "description": "Webpage content chunks.",
243
+
244
+ "invertedIndexConfig": {
245
+ "bm25": {
246
+ "b": 0.75,
247
+ "k1": 1.2
248
+ }
249
+ }
250
+ }
251
+ ]
252
+ }
253
+ return(client.collections.create_from_dict(class_obj))
254
+
255
+ logger.info("#### createWebpageCollection() entered.")
256
+ if client.collections.exists("Documents"):
257
+ client.collections.delete("Documents")
258
+
259
+ class_obj = {
260
+ "class": "Documents",
261
+ "description": "For first attempt at loading a Weviate database.",
262
+ "vectorizer": "text2vec-transformers",
263
+ "moduleConfig": {
264
+ "text2vec-transformers": {
265
+ "vectorizeClassName": False
266
+ }
267
+ },
268
+ "vectorIndexType": "hnsw",
269
+ "vectorIndexConfig": {
270
+ "distance": "cosine",
271
+ },
272
+ "properties": [
273
+ {
274
+ "name": "title",
275
+ "dataType": ["text"],
276
+ "description": "HTML doc title.",
277
+ "vectorizer": "text2vec-transformers",
278
+ "moduleConfig": {
279
+ "text2vec-transformers": {
280
+ "vectorizePropertyName": True,
281
+ "skip": False,
282
+ "tokenization": "lowercase"
283
+ }
284
+ },
285
+ "invertedIndexConfig": {
286
+ "bm25": {
287
+ "b": 0.75,
288
+ "k1": 1.2
289
+ },
290
+ }
291
+ },
292
+ {
293
+ "name": "content",
294
+ "dataType": ["text"],
295
+ "description": "HTML page content.",
296
+ "moduleConfig": {
297
+ "text2vec-transformers": {
298
+ "vectorizePropertyName": True,
299
+ "tokenization": "whitespace"
300
+ }
301
+ }
302
+ }
303
+ ]
304
+ }
305
+ return(client.collections.create_from_dict(class_obj))
306
 
307
  ###########################################################
308
  # Create document and chunks objects in the database.