MVPilgrim committed on
Commit
d3b316d
·
1 Parent(s): b67784b
Files changed (1) hide show
  1. semsearchDbgUI.py +1 -297
semsearchDbgUI.py CHANGED
@@ -21,9 +21,6 @@ import ipywidgets as widgets
21
  from IPython.display import display, clear_output
22
 
23
 
24
- weaviate_logger = logging.getLogger("httpx")
25
- weaviate_logger.setLevel(logging.WARNING)
26
-
27
  logger = logging.getLogger(__name__)
28
  logging.basicConfig(level=logging.INFO)
29
 
@@ -34,15 +31,6 @@ logging.basicConfig(level=logging.INFO)
34
  #
35
  logger.info("#### MAINLINE ENTERED.")
36
 
37
- #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
38
- pathString = "/app/inputDocs"
39
- chunks = []
40
- webpageDocNames = []
41
- page_contentArray = []
42
- webpageChunks = []
43
- webpageTitles = []
44
- webpageChunksDocNames = []
45
-
46
  #####################################################################
47
  # Create UI widgets.
48
  output_widget = widgets.Output()
@@ -96,288 +84,8 @@ submitButton = widgets.Button(
96
  )
97
 
98
 
99
- ######################################################
100
- # Connect to the Weaviate vector database.
101
- logger.info("#### Create Weaviate db client connection.")
102
- client = weaviate.WeaviateClient(
103
- connection_params=ConnectionParams.from_params(
104
- http_host="localhost",
105
- http_port="8080",
106
- http_secure=False,
107
- grpc_host="localhost",
108
- grpc_port="50051",
109
- grpc_secure=False
110
- ),
111
- additional_config=AdditionalConfig(
112
- timeout=Timeout(init=60, query=1800, insert=1800), # Values in seconds
113
- )
114
- )
115
- client.connect()
116
-
117
-
118
- #######################################################
119
- # Read each text input file, parse it into a document,
120
- # chunk it, collect chunks and document name.
121
- logger.info("#### Read and chunk input text files.")
122
- if not client.collections.exists("Documents") or not client.collections.exists("Documents") :
123
- for filename in os.listdir(pathString):
124
- logger.info(filename)
125
- path = Path(pathString + "/" + filename)
126
- filename = filename.rstrip(".html")
127
- webpageDocNames.append(filename)
128
- htmlLoader = BSHTMLLoader(path,"utf-8")
129
- htmlData = htmlLoader.load()
130
-
131
- title = htmlData[0].metadata['title']
132
- page_content = htmlData[0].page_content
133
-
134
- # Clean data. Remove multiple newlines, etc.
135
- page_content = re.sub(r'\n+', '\n',page_content)
136
-
137
- page_contentArray.append(page_content);
138
- webpageTitles.append(title)
139
- max_tokens = 1000
140
- tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
141
- logger.debug(f"### tokenizer: {tokenizer}")
142
- splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
143
- chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
144
-
145
- chunks = []
146
- for chnk in chunksOnePage:
147
- logger.debug(f"#### chnk in file: {chnk}")
148
- chunks.append(chnk)
149
- logger.debug(f"chunks: {chunks}")
150
- webpageChunks.append(chunks)
151
- webpageChunksDocNames.append(filename + "Chunks")
152
-
153
- logger.debug(f"### filename, title: {filename}, {title}")
154
- logger.debug(f"### webpageDocNames: {webpageDocNames}")
155
-
156
-
157
-
158
- ######################################################
159
- # Create database webpage and chunks collections.
160
- #wpCollection = createWebpageCollection()
161
- #wpChunkCollection = createChunksCollection()
162
- logger.info("#### createWebpageCollection() entered.")
163
- if not client.collections.exists("Documents"):
164
- #client.collections.delete("Documents")
165
- class_obj = {
166
- "class": "Documents",
167
- "description": "For first attempt at loading a Weviate database.",
168
- "vectorizer": "text2vec-transformers",
169
- "moduleConfig": {
170
- "text2vec-transformers": {
171
- "vectorizeClassName": False
172
- }
173
- },
174
- "vectorIndexType": "hnsw",
175
- "vectorIndexConfig": {
176
- "distance": "cosine",
177
- },
178
- "properties": [
179
- {
180
- "name": "title",
181
- "dataType": ["text"],
182
- "description": "HTML doc title.",
183
- "vectorizer": "text2vec-transformers",
184
- "moduleConfig": {
185
- "text2vec-transformers": {
186
- "vectorizePropertyName": True,
187
- "skip": False,
188
- "tokenization": "lowercase"
189
- }
190
- },
191
- "invertedIndexConfig": {
192
- "bm25": {
193
- "b": 0.75,
194
- "k1": 1.2
195
- },
196
- }
197
- },
198
- {
199
- "name": "content",
200
- "dataType": ["text"],
201
- "description": "HTML page content.",
202
- "moduleConfig": {
203
- "text2vec-transformers": {
204
- "vectorizePropertyName": True,
205
- "tokenization": "whitespace"
206
- }
207
- }
208
- }
209
- ]
210
- }
211
- wpCollection = client.collections.create_from_dict(class_obj)
212
-
213
- logger.info("#### createChunksCollection() entered.")
214
- if not client.collections.exists("Chunks"):
215
- #client.collections.delete("Chunks")
216
- class_obj = {
217
- "class": "Chunks",
218
- "description": "Collection for document chunks.",
219
- "vectorizer": "text2vec-transformers",
220
- "moduleConfig": {
221
- "text2vec-transformers": {
222
- "vectorizeClassName": True
223
- }
224
- },
225
- "vectorIndexType": "hnsw",
226
- "vectorIndexConfig": {
227
- "distance": "cosine",
228
- },
229
- "properties": [
230
- {
231
- "name": "chunk",
232
- "dataType": ["text"],
233
- "description": "Single webpage chunk.",
234
- "vectorizer": "text2vec-transformers",
235
- "moduleConfig": {
236
- "text2vec-transformers": {
237
- "vectorizePropertyName": False,
238
- "skip": False,
239
- "tokenization": "lowercase"
240
- }
241
- }
242
- },
243
- {
244
- "name": "chunk_index",
245
- "dataType": ["int"]
246
- },
247
- {
248
- "name": "webpage",
249
- "dataType": ["Documents"],
250
- "description": "Webpage content chunks.",
251
-
252
- "invertedIndexConfig": {
253
- "bm25": {
254
- "b": 0.75,
255
- "k1": 1.2
256
- }
257
- }
258
- }
259
- ]
260
- }
261
- wpChunkCollection = client.collections.create_from_dict(class_obj)
262
-
263
-
264
- ###########################################################
265
- # Create document and chunks objects in the database.
266
- if not client.collections.exists("Documents") :
267
- logger.info("#### Create page/doc db objects.")
268
- for i, className in enumerate(webpageDocNames):
269
- title = webpageTitles[i]
270
- logger.debug(f"## className, title: {className}, {title}")
271
- # Create Webpage Object
272
- page_content = page_contentArray[i]
273
- # Insert the document.
274
- wpCollectionObj_uuid = wpCollection.data.insert(
275
- {
276
- "name": className,
277
- "title": title,
278
- "content": page_content
279
- }
280
- )
281
-
282
- if not client.collections.exists("Chunks") :
283
- logger.info("#### Create chunk db objects.")
284
- # Insert the chunks for the document.
285
- for i2, chunk in enumerate(webpageChunks[i]):
286
- chunk_uuid = wpChunkCollection.data.insert(
287
- {
288
- "title": title,
289
- "chunk": chunk,
290
- "chunk_index": i2,
291
- "references":
292
- {
293
- "webpage": wpCollectionObj_uuid
294
- }
295
- }
296
- )
297
-
298
-
299
- #################################################################
300
- # Initialize the LLM.
301
- model_path = "/app/llama-2-7b-chat.Q4_0.gguf"
302
- llm = Llama(model_path,
303
- #*,
304
- n_gpu_layers=0,
305
- split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
306
- main_gpu=0,
307
- tensor_split=None,
308
- vocab_only=False,
309
- use_mmap=True,
310
- use_mlock=False,
311
- kv_overrides=None,
312
- seed=llama_cpp.LLAMA_DEFAULT_SEED,
313
- n_ctx=512,
314
- n_batch=512,
315
- n_threads=8,
316
- n_threads_batch=16,
317
- rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
318
- pooling_type=llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
319
- rope_freq_base=0.0,
320
- rope_freq_scale=0.0,
321
- yarn_ext_factor=-1.0,
322
- yarn_attn_factor=1.0,
323
- yarn_beta_fast=32.0,
324
- yarn_beta_slow=1.0,
325
- yarn_orig_ctx=0,
326
- logits_all=False,
327
- embedding=False,
328
- offload_kqv=True,
329
- last_n_tokens_size=64,
330
- lora_base=None,
331
- lora_scale=1.0,
332
- lora_path=None,
333
- numa=False,
334
- chat_format=None,
335
- chat_handler=None,
336
- draft_model=None,
337
- tokenizer=None,
338
- type_k=None,
339
- type_v=None,
340
- verbose=True
341
- )
342
-
343
-
344
- def getRagData(promptText):
345
- ###############################################################################
346
- # Initialize the sentence transformer and encode the query prompt.
347
- logger.info(f"#### Encode text query prompt to create vectors. {text}")
348
- model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')
349
-
350
- vector = model.encode(promptText)
351
- vectorList = []
352
-
353
- logger.debug("#### Print vectors.")
354
- for vec in vector:
355
- vectorList.append(vec)
356
- logger.debug(f"vectorList: {vectorList[2]}")
357
-
358
- # Fetch chunks and print chunks.
359
- logger.info("#### Retrieve semchunks from db using vectors from prompt.")
360
- semChunks = wpChunkCollection.query.near_vector(
361
- near_vector=vectorList,
362
- distance=0.7,
363
- limit=3
364
- )
365
- logger.debug(f"### semChunks[0]: {semChunks}")
366
-
367
- # Print chunks, corresponding document and document title.
368
- ragData = ""
369
- logger.info("#### Print individual retrieved chunks.")
370
- for chunk in enumerate(semChunks.objects):
371
- logger.info(f"#### chunk: {chunk}")
372
- ragData = ragData + "\n" + chunk[0]
373
- webpage_uuid = chunk[1].properties['references']['webpage']
374
- logger.info(f"webpage_uuid: {webpage_uuid}")
375
- wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
376
- logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
377
- #collection = client.collections.get("Chunks")
378
- return ragData
379
-
380
  # Display UI
 
381
  display(systemTextArea)
382
  display(userTextArea)
383
  display(ragPromptTextArea)
@@ -431,7 +139,3 @@ def on_submitButton_clicked(b):
431
  submitButton.on_click(on_submitButton_clicked)
432
  display(output_widget)
433
 
434
-
435
- #logger.info("#### Closing client db connection.")
436
- #client.close()
437
- #logger.info("#### Program terminating.")
 
21
  from IPython.display import display, clear_output
22
 
23
 
 
 
 
24
  logger = logging.getLogger(__name__)
25
  logging.basicConfig(level=logging.INFO)
26
 
 
31
  #
32
  logger.info("#### MAINLINE ENTERED.")
33
 
 
 
 
 
 
 
 
 
 
34
  #####################################################################
35
  # Create UI widgets.
36
  output_widget = widgets.Output()
 
84
  )
85
 
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  # Display UI
88
+ log.debug("### : ")
89
  display(systemTextArea)
90
  display(userTextArea)
91
  display(ragPromptTextArea)
 
139
  submitButton.on_click(on_submitButton_clicked)
140
  display(output_widget)
141