MVPilgrim committed on
Commit
a7f7eb0
·
1 Parent(s): d2928dd
Files changed (1) hide show
  1. semsearch_Hld02.py +402 -0
semsearch_Hld02.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import weaviate
2
+
3
+ from sentence_transformers import SentenceTransformer
4
+ from langchain_community.document_loaders import BSHTMLLoader
5
+ from pathlib import Path
6
+ from lxml import html
7
+ import logging
8
+ from semantic_text_splitter import HuggingFaceTextSplitter
9
+ from tokenizers import Tokenizer
10
+ import json
11
+ import os
12
+ import re
13
+ import logging
14
+
15
+ import llama_cpp
16
+ from llama_cpp import Llama
17
+ import ipywidgets as widgets
18
+ import time
19
+ from IPython.display import display, clear_output
20
+
21
# Root logging configuration and this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Raise the "httpx" logger to WARNING so its INFO records are not emitted
# (presumably the HTTP client used by the Weaviate client -- verify).
weaviate_logger = logging.getLogger("httpx")
weaviate_logger.setLevel(logging.WARNING)
26
+
27
+
28
+
29
+ #################################################################
30
+ # Connect to Weaviate vector database.
31
+ #################################################################
32
client = ""


def connectToWeaviateDB():
    """Connect to the local Weaviate instance and publish the client.

    Connects over HTTP to 127.0.0.1:8080 and over gRPC to 127.0.0.1:50051,
    both without TLS. The client is stored in the module-level ``client``
    (read by the collection helpers in this file) and also returned.

    Returns:
        The client object produced by ``weaviate.connect_to_custom``.
    """
    # Without this declaration the assignment below only creates a local
    # name, leaving the module-level ``client`` as "" and the new
    # connection unreachable once the function returns.
    global client

    logger.info("#### Create Weaviate db client connection.")
    client = weaviate.connect_to_custom(
        http_host="127.0.0.1",
        http_port=8080,
        http_secure=False,
        grpc_host="127.0.0.1",
        grpc_port=50051,
        grpc_secure=False,
    )
    # NOTE(review): connect_to_custom may already return a connected client,
    # in which case this call is redundant -- confirm against the client docs.
    client.connect()
    return client
46
+
47
+
48
+ #######################################################
49
+ # Read each text input file, parse it into a document,
50
+ # chunk it, collect chunks and document name.
51
+ #######################################################
52
# Parallel lists filled by readParseChunkFiles(); index i in each list
# refers to the same input file.
webpageDocNames = []
page_contentArray = []
webpageTitles = []
webpageChunks = []
webpageChunksDocNames = []


def readParseChunkFiles():
    """Load every file in ``pathString``, clean it and split it into chunks.

    For each directory entry the HTML is loaded with ``BSHTMLLoader``, runs
    of newlines are collapsed to a single newline, and the text is split
    with a ``HuggingFaceTextSplitter`` (chunk capacity 50). Results are
    appended to the module-level lists ``webpageDocNames``,
    ``page_contentArray``, ``webpageTitles``, ``webpageChunks`` and
    ``webpageChunksDocNames``.

    Reads the module-level ``pathString``; returns nothing.
    """
    logger.info("#### Read and chunk input text files.")

    filenames = os.listdir(pathString)

    # The tokenizer and splitter do not depend on the file being processed,
    # so build them once instead of once per file. Skipped entirely when
    # there is nothing to process.
    splitter = None
    if filenames:
        tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
        logger.debug(f"### tokenizer: {tokenizer}")
        splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

    for filename in filenames:
        logger.info(filename)
        path = Path(pathString) / filename

        # Remove the ".html" extension only. str.rstrip(".html") would strip
        # any trailing run of the characters '.', 'h', 't', 'm', 'l'
        # (e.g. "math.html" -> "ma").
        if filename.endswith(".html"):
            filename = filename[:-len(".html")]
        webpageDocNames.append(filename)

        htmlData = BSHTMLLoader(path, "utf-8").load()
        title = htmlData[0].metadata['title']
        page_content = htmlData[0].page_content

        # Clean data. Remove multiple newlines, etc.
        page_content = re.sub(r'\n+', '\n', page_content)

        page_contentArray.append(page_content)
        webpageTitles.append(title)

        chunks = list(splitter.chunks(page_content, chunk_capacity=50))
        for chnk in chunks:
            logger.debug(f"#### chnk in file: {chnk}")
        logger.debug(f"chunks: {chunks}")
        webpageChunks.append(chunks)
        webpageChunksDocNames.append(filename + "Chunks")

        logger.debug(f"### filename, title: {filename}, {title}")

    logger.debug(f"### webpageDocNames: {webpageDocNames}")
93
+
94
+ #################################################################
95
+ # Create the chunks collection for the Weaviate database.
96
+ #################################################################
97
def createChunksCollection():
    """Recreate the "Chunks" collection and return it.

    Any existing "Chunks" collection is deleted first. The schema is passed
    to ``client.collections.create_from_dict``; its "webpage" property has
    data type "Documents".

    Returns:
        Whatever ``client.collections.create_from_dict`` returns.
    """
    logger.info("#### createChunksCollection() entered.")

    collections = client.collections
    if collections.exists("Chunks"):
        collections.delete("Chunks")

    # Text of a single chunk.
    chunkProperty = {
        "name": "chunk",
        "dataType": ["text"],
        "description": "Single webpage chunk.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": False,
                "skip": False,
                "tokenization": "lowercase",
            },
        },
    }

    # Position of the chunk within its page.
    chunkIndexProperty = {
        "name": "chunk_index",
        "dataType": ["int"],
    }

    # Property typed as the "Documents" collection.
    webpageProperty = {
        "name": "webpage",
        "dataType": ["Documents"],
        "description": "Webpage content chunks.",
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2,
            },
        },
    }

    schema = {
        "class": "Chunks",
        "description": "Collection for document chunks.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizeClassName": True,
            },
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [chunkProperty, chunkIndexProperty, webpageProperty],
    }
    return collections.create_from_dict(schema)
148
+
149
+
150
+ #####################################################################
151
+ # Create the document collection for the Weaviate database.
152
+ #####################################################################
153
def createWebpageCollection():
    """Recreate the "Documents" collection and return it.

    Any existing "Documents" collection is deleted first. The schema
    declares two text properties, "title" and "content", and is passed to
    ``client.collections.create_from_dict``.

    Returns:
        Whatever ``client.collections.create_from_dict`` returns.
    """
    logger.info("#### createWebpageCollection() entered.")

    collections = client.collections
    if collections.exists("Documents"):
        collections.delete("Documents")

    # Page title.
    titleProperty = {
        "name": "title",
        "dataType": ["text"],
        "description": "HTML doc title.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": True,
                "skip": False,
                "tokenization": "lowercase",
            },
        },
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2,
            },
        },
    }

    # Full cleaned page text.
    contentProperty = {
        "name": "content",
        "dataType": ["text"],
        "description": "HTML page content.",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": True,
                "tokenization": "whitespace",
            },
        },
    }

    schema = {
        "class": "Documents",
        "description": "For first attempt at loading a Weviate database.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizeClassName": False,
            },
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [titleProperty, contentProperty],
    }
    return collections.create_from_dict(schema)
205
+
206
+
207
+ #################################################################
208
+ # Create document and chunk objects in database.
209
+ #################################################################
210
def createDatabaseObjects():
    """Insert one page object per document and one object per chunk.

    Walks the module-level parallel lists ``webpageDocNames``,
    ``webpageTitles``, ``page_contentArray`` and ``webpageChunks``. Each
    page is inserted into ``wpCollection``; each of its chunks is then
    inserted into ``wpChunkCollection`` carrying the value returned by the
    page insert under ``references.webpage``.
    """
    logger.info("#### Create page/doc and chunk db objects.")

    for docPos, docName in enumerate(webpageDocNames):
        docTitle = webpageTitles[docPos]
        logger.debug(f"## className, title: {docName}, {docTitle}")

        # Insert the page itself and keep the value the insert returns.
        pagePayload = {
            "name": docName,
            "title": docTitle,
            "content": page_contentArray[docPos],
        }
        pageUuid = wpCollection.data.insert(pagePayload)

        # Insert every chunk of this page, linked back to the page.
        for chunkPos, chunkText in enumerate(webpageChunks[docPos]):
            chunkPayload = {
                "title": docTitle,
                "chunk": chunkText,
                "chunk_index": chunkPos,
                "references": {
                    "webpage": pageUuid,
                },
            }
            wpChunkCollection.data.insert(chunkPayload)
239
+
240
+
241
+ #################################################################
242
+ # Create display widgets.
243
+ #################################################################
244
# Module-level placeholders; createWidgets() replaces them with widgets.
output_widget = ""
systemTextArea = ""
userTextArea = ""
ragPromptTextArea = ""
responseTextArea = ""
selectRag = ""
submitButton = ""


def createWidgets():
    """Create the notebook widgets and bind them to the module-level names.

    Builds an output area, four text areas (system prompt, user prompt,
    RAG prompt, LLM response), a "Use RAG" checkbox and a "Run Model."
    button. The widgets are only constructed here; this function does not
    display them or attach any event handlers.
    """
    # Without this declaration every assignment below would create a local
    # that is discarded on return, leaving the module-level names as "".
    global output_widget, systemTextArea, userTextArea, ragPromptTextArea
    global responseTextArea, selectRag, submitButton

    output_widget = widgets.Output()
    with output_widget:
        print("### Create widgets entered.")

    systemTextArea = widgets.Textarea(
        value='',
        placeholder='Enter System Prompt.',
        description='Sys Prompt: ',
        disabled=False,
        layout=widgets.Layout(width='300px', height='80px'),
    )

    userTextArea = widgets.Textarea(
        value='',
        placeholder='Enter User Prompt.',
        description='User Prompt: ',
        disabled=False,
        layout=widgets.Layout(width='435px', height='110px'),
    )

    ragPromptTextArea = widgets.Textarea(
        value='',
        placeholder='App generated prompt with RAG information.',
        description='RAG Prompt: ',
        disabled=False,
        layout=widgets.Layout(width='580px', height='180px'),
    )

    responseTextArea = widgets.Textarea(
        value='',
        placeholder='LLM generated response.',
        description='LLM Resp: ',
        disabled=False,
        layout=widgets.Layout(width='780px', height='200px'),
    )

    selectRag = widgets.Checkbox(
        value=False,
        description='Use RAG',
        disabled=False,
    )

    submitButton = widgets.Button(
        description='Run Model.',
        disabled=False,
        button_style='',  # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click',
        icon='check',  # (FontAwesome names without the `fa-` prefix)
    )
301
+
302
+
303
+ ######################################################################
304
+ # MAINLINE
305
+ ######################################################################
306
logger.info("#### MAINLINE ENTERED.")

#pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
pathString = "/app/inputDocs"

# Start from empty result lists; readParseChunkFiles() appends to these.
chunks = []
webpageDocNames = []
page_contentArray = []
webpageChunks = []
webpageTitles = []
webpageChunksDocNames = []

# The connection is made inline here; the connectToWeaviateDB() call is
# disabled.
logger.info("#### Create Weaviate db client connection.")
client = weaviate.connect_to_custom(
    http_host="127.0.0.1",
    http_port=8080,
    http_secure=False,
    grpc_host="127.0.0.1",
    grpc_port=50051,
    grpc_secure=False,
)

# Everything that uses the connection runs inside try/finally so the client
# is closed even when loading, inserting or querying raises.
try:
    client.connect()

    readParseChunkFiles()
    # "Documents" is created first: the "Chunks" schema has a property
    # whose data type is "Documents".
    wpCollection = createWebpageCollection()
    wpChunkCollection = createChunksCollection()

    # Object creation is done inline; the createDatabaseObjects() call is
    # disabled.
    logger.info("#### Create page/doc and chunk db objects.")
    for i, className in enumerate(webpageDocNames):
        title = webpageTitles[i]
        logger.debug(f"## className, title: {className}, {title}")
        # Create Webpage Object
        page_content = page_contentArray[i]
        # Insert the document.
        wpCollectionObj_uuid = wpCollection.data.insert(
            {
                "name": className,
                "title": title,
                "content": page_content,
            }
        )

        # Insert the chunks for the document.
        for i2, chunk in enumerate(webpageChunks[i]):
            chunk_uuid = wpChunkCollection.data.insert(
                {
                    "title": title,
                    "chunk": chunk,
                    "chunk_index": i2,
                    "references": {
                        "webpage": wpCollectionObj_uuid,
                    },
                }
            )

    ###########################################################################
    # text contains prompt for vector DB.
    text = "human-made computer cognitive ability"

    ###########################################################################
    # Initialize the sentence transformer and encode the query prompt.
    logger.info(f"#### Encode text query prompt to create vectors. {text}")
    model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')

    vector = model.encode(text)
    # Same elements, in order, as appending them one at a time.
    vectorList = list(vector)

    logger.debug("#### Print vectors.")
    logger.debug(f"vectorList: {vectorList[2]}")

    # Fetch chunks and print chunks.
    logger.info("#### Retrieve semchunks from db using vectors from prompt.")
    semChunks = wpChunkCollection.query.near_vector(
        near_vector=vectorList,
        distance=0.7,
        limit=3,
    )
    logger.debug(f"### semChunks[0]: {semChunks}")

    # Print chunks, corresponding document and document title.
    logger.info("#### Print individual retrieved chunks.")
    for chunkPos, chunkObj in enumerate(semChunks.objects):
        # Logged as an (index, object) pair.
        logger.info(f"#### chunk: {(chunkPos, chunkObj)}")
        webpage_uuid = chunkObj.properties['references']['webpage']
        logger.info(f"webpage_uuid: {webpage_uuid}")
        wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
        logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
finally:
    logger.info("#### Closing client db connection.")
    client.close()

logger.info("#### Program terminating.")