MVPilgrim committed on
Commit
29a8f89
·
1 Parent(s): a7f7eb0
Files changed (1) hide show
  1. semsearch.py +30 -2
semsearch.py CHANGED
@@ -72,7 +72,7 @@ def readParseChunkFiles():
72
  page_content = re.sub(r'\n+', '\n',page_content)
73
 
74
  page_contentArray.append(page_content);
75
- webpageTitles.append(title)
76
  max_tokens = 1000
77
  tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
78
  logger.debug(f"### tokenizer: {tokenizer}")
@@ -329,8 +329,36 @@ client.connect()
329
  readParseChunkFiles()
330
  wpCollection = createWebpageCollection()
331
  wpChunkCollection = createChunksCollection()
332
- createDatabaseObjects()
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
  ###############################################################################
336
  # text contains prompt for vector DB.
 
72
  page_content = re.sub(r'\n+', '\n',page_content)
73
 
74
  page_contentArray.append(page_content);
75
+ webpageTitles.append (title)
76
  max_tokens = 1000
77
  tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
78
  logger.debug(f"### tokenizer: {tokenizer}")
 
329
  readParseChunkFiles()
330
  wpCollection = createWebpageCollection()
331
  wpChunkCollection = createChunksCollection()
 
332
 
333
+ #createDatabaseObjects()
334
+ logger.info("#### Create page/doc and chunk db objects.")
335
+ for i, className in enumerate(webpageDocNames):
336
+ title = webpageTitles[i]
337
+ logger.debug(f"## className, title: {className}, {title}")
338
+ # Create Webpage Object
339
+ page_content = page_contentArray[i]
340
+ # Insert the document.
341
+ wpCollectionObj_uuid = wpCollection.data.insert(
342
+ {
343
+ "name": className,
344
+ "title": title,
345
+ "content": page_content
346
+ }
347
+ )
348
+
349
+ # Insert the chunks for the document.
350
+ for i2, chunk in enumerate(webpageChunks[i]):
351
+ chunk_uuid = wpChunkCollection.data.insert(
352
+ {
353
+ "title": title,
354
+ "chunk": chunk,
355
+ "chunk_index": i2,
356
+ "references":
357
+ {
358
+ "webpage": wpCollectionObj_uuid
359
+ }
360
+ }
361
+ )
362
 
363
  ###############################################################################
364
  # text contains prompt for vector DB.