MVPilgrim committed on
Commit
17fe29d
·
1 Parent(s): e5190ef
Files changed (1) hide show
  1. semsearch.py +51 -52
semsearch.py CHANGED
@@ -200,58 +200,6 @@ logger.debug(f"### webpageDocNames: {webpageDocNames}")
200
  # Create database webpage and chunks collections.
201
  #wpCollection = createWebpageCollection()
202
  #wpChunkCollection = createChunksCollection()
203
-
204
- logger.info("#### createChunksCollection() entered.")
205
- if client.collections.exists("Chunks"):
206
- client.collections.delete("Chunks")
207
-
208
- class_obj = {
209
- "class": "Chunks",
210
- "description": "Collection for document chunks.",
211
- "vectorizer": "text2vec-transformers",
212
- "moduleConfig": {
213
- "text2vec-transformers": {
214
- "vectorizeClassName": True
215
- }
216
- },
217
- "vectorIndexType": "hnsw",
218
- "vectorIndexConfig": {
219
- "distance": "cosine",
220
- },
221
- "properties": [
222
- {
223
- "name": "chunk",
224
- "dataType": ["text"],
225
- "description": "Single webpage chunk.",
226
- "vectorizer": "text2vec-transformers",
227
- "moduleConfig": {
228
- "text2vec-transformers": {
229
- "vectorizePropertyName": False,
230
- "skip": False,
231
- "tokenization": "lowercase"
232
- }
233
- }
234
- },
235
- {
236
- "name": "chunk_index",
237
- "dataType": ["int"]
238
- },
239
- {
240
- "name": "webpage",
241
- "dataType": ["Documents"],
242
- "description": "Webpage content chunks.",
243
-
244
- "invertedIndexConfig": {
245
- "bm25": {
246
- "b": 0.75,
247
- "k1": 1.2
248
- }
249
- }
250
- }
251
- ]
252
- }
253
- wpChunkCollection = client.collections.create_from_dict(class_obj)
254
-
255
  logger.info("#### createWebpageCollection() entered.")
256
  if client.collections.exists("Documents"):
257
  client.collections.delete("Documents")
@@ -304,6 +252,57 @@ class_obj = {
304
  }
305
  wpCollection = client.collections.create_from_dict(class_obj)
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
  ###########################################################
309
  # Create document and chunks objects in the database.
 
200
  # Create database webpage and chunks collections.
201
  #wpCollection = createWebpageCollection()
202
  #wpChunkCollection = createChunksCollection()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  logger.info("#### createWebpageCollection() entered.")
204
  if client.collections.exists("Documents"):
205
  client.collections.delete("Documents")
 
252
  }
253
  wpCollection = client.collections.create_from_dict(class_obj)
254
 
255
+ logger.info("#### createChunksCollection() entered.")
256
+ if client.collections.exists("Chunks"):
257
+ client.collections.delete("Chunks")
258
+
259
+ class_obj = {
260
+ "class": "Chunks",
261
+ "description": "Collection for document chunks.",
262
+ "vectorizer": "text2vec-transformers",
263
+ "moduleConfig": {
264
+ "text2vec-transformers": {
265
+ "vectorizeClassName": True
266
+ }
267
+ },
268
+ "vectorIndexType": "hnsw",
269
+ "vectorIndexConfig": {
270
+ "distance": "cosine",
271
+ },
272
+ "properties": [
273
+ {
274
+ "name": "chunk",
275
+ "dataType": ["text"],
276
+ "description": "Single webpage chunk.",
277
+ "vectorizer": "text2vec-transformers",
278
+ "moduleConfig": {
279
+ "text2vec-transformers": {
280
+ "vectorizePropertyName": False,
281
+ "skip": False,
282
+ "tokenization": "lowercase"
283
+ }
284
+ }
285
+ },
286
+ {
287
+ "name": "chunk_index",
288
+ "dataType": ["int"]
289
+ },
290
+ {
291
+ "name": "webpage",
292
+ "dataType": ["Documents"],
293
+ "description": "Webpage content chunks.",
294
+
295
+ "invertedIndexConfig": {
296
+ "bm25": {
297
+ "b": 0.75,
298
+ "k1": 1.2
299
+ }
300
+ }
301
+ }
302
+ ]
303
+ }
304
+ wpChunkCollection = client.collections.create_from_dict(class_obj)
305
+
306
 
307
  ###########################################################
308
  # Create document and chunks objects in the database.