MVPilgrim committed on
Commit
4d1c68b
·
1 Parent(s): cee962e
Files changed (1) hide show
  1. semsearch.py +66 -79
semsearch.py CHANGED
@@ -21,7 +21,7 @@ from IPython.display import display, clear_output
21
 
22
 
23
  weaviate_logger = logging.getLogger("httpx")
24
- weaviate_logger.setLevel(logging.INFO)
25
 
26
  logger = logging.getLogger(__name__)
27
  logging.basicConfig(level=logging.INFO)
@@ -95,60 +95,9 @@ submitButton = widgets.Button(
95
  )
96
 
97
 
98
- #######################################################
99
- # Read each text input file, parse it into a document,
100
- # chunk it, collect chunks and document name.
101
- logger.info("#### Read and chunk input text files.")
102
- for filename in os.listdir(pathString):
103
- logger.info(filename)
104
- path = Path(pathString + "/" + filename)
105
- filename = filename.rstrip(".html")
106
- webpageDocNames.append(filename)
107
- htmlLoader = BSHTMLLoader(path,"utf-8")
108
- htmlData = htmlLoader.load()
109
-
110
- title = htmlData[0].metadata['title']
111
- page_content = htmlData[0].page_content
112
-
113
- # Clean data. Remove multiple newlines, etc.
114
- page_content = re.sub(r'\n+', '\n',page_content)
115
-
116
- page_contentArray.append(page_content);
117
- webpageTitles.append(title)
118
- max_tokens = 1000
119
- tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
120
- logger.debug(f"### tokenizer: {tokenizer}")
121
- splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
122
- chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
123
-
124
- chunks = []
125
- for chnk in chunksOnePage:
126
- logger.debug(f"#### chnk in file: {chnk}")
127
- chunks.append(chnk)
128
- logger.debug(f"chunks: {chunks}")
129
- webpageChunks.append(chunks)
130
- webpageChunksDocNames.append(filename + "Chunks")
131
-
132
- logger.debug(f"### filename, title: {filename}, {title}")
133
-
134
- logger.debug(f"### webpageDocNames: {webpageDocNames}")
135
-
136
-
137
  ######################################################
138
  # Connect to the Weaviate vector database.
139
  logger.info("#### Create Weaviate db client connection.")
140
- #client = weaviate.connect_to_custom(
141
- # http_host="127.0.0.1",
142
- # http_port=8080,
143
- # http_secure=False,
144
- # grpc_host="127.0.0.1",
145
- # grpc_port=50051,
146
- # grpc_secure=False,
147
- # timeout=[600,600]
148
- # #read_timeout=600,
149
- # #write_timeout=90
150
- #)
151
-
152
  client = weaviate.WeaviateClient(
153
  connection_params=ConnectionParams.from_params(
154
  http_host="localhost",
@@ -156,20 +105,56 @@ client = weaviate.WeaviateClient(
156
  http_secure=False,
157
  grpc_host="localhost",
158
  grpc_port="50051",
159
- grpc_secure=False,
160
- log_level="WARNING"
161
  ),
162
- # auth_client_secret=weaviate.auth.AuthApiKey("secr3tk3y"),
163
- # additional_headers={
164
- # "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")
165
- # },
166
- additional_config=AdditionalConfig(
167
- timeout=Timeout(init=60, query=1800, insert=1800), # Values in seconds
168
- )
169
  )
170
  client.connect()
171
 
172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  ######################################################
174
  # Create database webpage and chunks collections.
175
  #wpCollection = createWebpageCollection()
@@ -278,20 +263,23 @@ if not client.collections.exists("Chunks"):
278
 
279
  ###########################################################
280
  # Create document and chunks objects in the database.
281
- logger.info("#### Create page/doc db objects.")
282
- for i, className in enumerate(webpageDocNames):
283
- title = webpageTitles[i]
284
- logger.debug(f"## className, title: {className}, {title}")
285
- # Create Webpage Object
286
- page_content = page_contentArray[i]
287
- # Insert the document.
288
- wpCollectionObj_uuid = wpCollection.data.insert(
289
- {
290
- "name": className,
291
- "title": title,
292
- "content": page_content
293
- }
294
- )
 
 
 
295
  logger.info("#### Create chunk db objects.")
296
  # Insert the chunks for the document.
297
  for i2, chunk in enumerate(webpageChunks[i]):
@@ -436,7 +424,6 @@ submitButton.on_click(on_submitButton_clicked)
436
  display(output_widget)
437
 
438
 
439
- logger.info("#### Closing client db connection.")
440
- client.close()
441
-
442
- logger.info("#### Program terminating.")
 
21
 
22
 
23
  weaviate_logger = logging.getLogger("httpx")
24
+ weaviate_logger.setLevel(logging.WARNING)
25
 
26
  logger = logging.getLogger(__name__)
27
  logging.basicConfig(level=logging.INFO)
 
95
  )
96
 
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  ######################################################
99
  # Connect to the Weaviate vector database.
100
  logger.info("#### Create Weaviate db client connection.")
 
 
 
 
 
 
 
 
 
 
 
 
101
  client = weaviate.WeaviateClient(
102
  connection_params=ConnectionParams.from_params(
103
  http_host="localhost",
 
105
  http_secure=False,
106
  grpc_host="localhost",
107
  grpc_port="50051",
108
+ grpc_secure=False
 
109
  ),
110
+ },
111
+ additional_config=AdditionalConfig(
112
+ timeout=Timeout(init=60, query=1800, insert=1800), # Values in seconds
113
+ )
 
 
 
114
  )
115
  client.connect()
116
 
117
 
118
+ #######################################################
119
+ # Read each text input file, parse it into a document,
120
+ # chunk it, collect chunks and document name.
121
+ logger.info("#### Read and chunk input text files.")
122
+ if not client.collections.exists("Documents") || not client.collections.exists("Documents") :
123
+ for filename in os.listdir(pathString):
124
+ logger.info(filename)
125
+ path = Path(pathString + "/" + filename)
126
+ filename = filename.rstrip(".html")
127
+ webpageDocNames.append(filename)
128
+ htmlLoader = BSHTMLLoader(path,"utf-8")
129
+ htmlData = htmlLoader.load()
130
+
131
+ title = htmlData[0].metadata['title']
132
+ page_content = htmlData[0].page_content
133
+
134
+ # Clean data. Remove multiple newlines, etc.
135
+ page_content = re.sub(r'\n+', '\n',page_content)
136
+
137
+ page_contentArray.append(page_content);
138
+ webpageTitles.append(title)
139
+ max_tokens = 1000
140
+ tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
141
+ logger.debug(f"### tokenizer: {tokenizer}")
142
+ splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
143
+ chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
144
+
145
+ chunks = []
146
+ for chnk in chunksOnePage:
147
+ logger.debug(f"#### chnk in file: {chnk}")
148
+ chunks.append(chnk)
149
+ logger.debug(f"chunks: {chunks}")
150
+ webpageChunks.append(chunks)
151
+ webpageChunksDocNames.append(filename + "Chunks")
152
+
153
+ logger.debug(f"### filename, title: {filename}, {title}")
154
+ logger.debug(f"### webpageDocNames: {webpageDocNames}")
155
+
156
+
157
+
158
  ######################################################
159
  # Create database webpage and chunks collections.
160
  #wpCollection = createWebpageCollection()
 
263
 
264
  ###########################################################
265
  # Create document and chunks objects in the database.
266
+ if not client.collections.exists("Documents") :
267
+ logger.info("#### Create page/doc db objects.")
268
+ for i, className in enumerate(webpageDocNames):
269
+ title = webpageTitles[i]
270
+ logger.debug(f"## className, title: {className}, {title}")
271
+ # Create Webpage Object
272
+ page_content = page_contentArray[i]
273
+ # Insert the document.
274
+ wpCollectionObj_uuid = wpCollection.data.insert(
275
+ {
276
+ "name": className,
277
+ "title": title,
278
+ "content": page_content
279
+ }
280
+ )
281
+
282
+ if not client.collections.exists("Chunks") :
283
  logger.info("#### Create chunk db objects.")
284
  # Insert the chunks for the document.
285
  for i2, chunk in enumerate(webpageChunks[i]):
 
424
  display(output_widget)
425
 
426
 
427
+ #logger.info("#### Closing client db connection.")
428
+ #client.close()
429
+ #logger.info("#### Program terminating.")