MVPilgrim committed on
Commit
ec89ccb
·
1 Parent(s): 2aab62b
Files changed (2) hide show
  1. semsearch._Hld03py +402 -0
  2. semsearch.py +274 -402
semsearch._Hld03py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import weaviate
2
+
3
+ from sentence_transformers import SentenceTransformer
4
+ from langchain_community.document_loaders import BSHTMLLoader
5
+ from pathlib import Path
6
+ from lxml import html
7
+ import logging
8
+ from semantic_text_splitter import HuggingFaceTextSplitter
9
+ from tokenizers import Tokenizer
10
+ import json
11
+ import os
12
+ import re
13
+ import logging
14
+
15
+ import llama_cpp
16
+ from llama_cpp import Llama
17
+ import ipywidgets as widgets
18
+ import time
19
+ from IPython.display import display, clear_output
20
+
21
+ weaviate_logger = logging.getLogger("httpx")
22
+ weaviate_logger.setLevel(logging.WARNING)
23
+
24
+ logger = logging.getLogger(__name__)
25
+ logging.basicConfig(level=logging.INFO)
26
+
27
+
28
+
29
+ #################################################################
30
+ # Connect to Weaviate vector database.
31
+ #################################################################
32
+ client = ""
33
def connectToWeaviateDB():
    """Connect to the local Weaviate instance and publish it as ``client``.

    The connection is stored in the module-level ``client`` so that the
    collection helpers, which read that global, see it.  Without the
    ``global`` declaration the assignment below would only create a
    function-local name and the module-level placeholder would be left
    unchanged.

    Returns:
        The client object returned by ``weaviate.connect_to_custom``.
    """
    global client

    ######################################################
    # Connect to the Weaviate vector database.
    logger.info("#### Create Weaviate db client connection.")
    client = weaviate.connect_to_custom(
        http_host="127.0.0.1",
        http_port=8080,
        http_secure=False,
        grpc_host="127.0.0.1",
        grpc_port=50051,
        grpc_secure=False,
    )
    client.connect()
    return client
46
+
47
+
48
+ #######################################################
49
+ # Read each text input file, parse it into a document,
50
+ # chunk it, collect chunks and document name.
51
+ #######################################################
52
+ webpageDocNames = []
53
+ page_contentArray = []
54
+ webpageTitles = []
55
+ webpageChunks = []
56
+ webpageChunksDocNames = []
57
+
58
def readParseChunkFiles():
    """Load every file under ``pathString`` as HTML and split its text into chunks.

    For each directory entry the function appends, in step, to the
    module-level lists: the document name to ``webpageDocNames``, the cleaned
    page text to ``page_contentArray``, the HTML title to ``webpageTitles``,
    the list of chunks to ``webpageChunks`` and ``<name>Chunks`` to
    ``webpageChunksDocNames``.
    """
    logger.info("#### Read and chunk input text files.")

    # Neither the tokenizer nor the splitter depends on the file being read,
    # so they are built once here instead of once per file.
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
    logger.debug(f"### tokenizer: {tokenizer}")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

    htmlSuffix = ".html"
    for filename in os.listdir(pathString):
        logger.info(filename)
        path = Path(pathString) / filename

        # str.rstrip(".html") removes any trailing run of the characters
        # '.', 'h', 't', 'm', 'l' (so "math.html" became "ma").  Remove the
        # literal extension only.
        if filename.endswith(htmlSuffix):
            filename = filename[:-len(htmlSuffix)]
        webpageDocNames.append(filename)

        htmlLoader = BSHTMLLoader(path, "utf-8")
        htmlData = htmlLoader.load()

        title = htmlData[0].metadata['title']
        page_content = htmlData[0].page_content

        # Clean data. Collapse runs of newlines into a single newline.
        page_content = re.sub(r'\n+', '\n', page_content)

        page_contentArray.append(page_content)
        webpageTitles.append(title)

        chunks = []
        for chnk in splitter.chunks(page_content, chunk_capacity=50):
            logger.debug(f"#### chnk in file: {chnk}")
            chunks.append(chnk)
        logger.debug(f"chunks: {chunks}")
        webpageChunks.append(chunks)
        webpageChunksDocNames.append(filename + "Chunks")

        logger.debug(f"### filename, title: {filename}, {title}")

    logger.debug(f"### webpageDocNames: {webpageDocNames}")
93
+
94
+ #################################################################
95
+ # Create the chunks collection for the Weaviate database.
96
+ #################################################################
97
def createChunksCollection():
    """Drop any existing "Chunks" collection and create it again from a schema dict.

    Reads the module-level ``client``.  The schema declares three properties:
    ``chunk`` (text), ``chunk_index`` (int) and ``webpage`` (data type
    ``Documents``).

    Returns:
        Whatever ``client.collections.create_from_dict`` returns for the schema.
    """
    logger.info("#### createChunksCollection() entered.")

    collectionName = "Chunks"
    vectorizerName = "text2vec-transformers"

    # Start from a clean slate: remove a collection left over from a prior run.
    if client.collections.exists(collectionName):
        client.collections.delete(collectionName)

    # Text of a single chunk.
    chunkTextProperty = {
        "name": "chunk",
        "dataType": ["text"],
        "description": "Single webpage chunk.",
        "vectorizer": vectorizerName,
        "moduleConfig": {
            vectorizerName: {
                "vectorizePropertyName": False,
                "skip": False,
                "tokenization": "lowercase",
            }
        },
    }

    # Position of the chunk within its page.
    chunkIndexProperty = {
        "name": "chunk_index",
        "dataType": ["int"],
    }

    # Property typed as the "Documents" collection.
    webpageProperty = {
        "name": "webpage",
        "dataType": ["Documents"],
        "description": "Webpage content chunks.",
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2,
            }
        },
    }

    schema = {
        "class": collectionName,
        "description": "Collection for document chunks.",
        "vectorizer": vectorizerName,
        "moduleConfig": {
            vectorizerName: {
                "vectorizeClassName": True,
            }
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [chunkTextProperty, chunkIndexProperty, webpageProperty],
    }
    return client.collections.create_from_dict(schema)
148
+
149
+
150
+ #####################################################################
151
+ # Create the document collection for the Weaviate database.
152
+ #####################################################################
153
def createWebpageCollection():
    """Drop any existing "Documents" collection and create it again from a schema dict.

    Reads the module-level ``client``.  The schema declares two text
    properties, ``title`` and ``content``.

    Returns:
        Whatever ``client.collections.create_from_dict`` returns for the schema.
    """
    logger.info("#### createWebpageCollection() entered.")

    collectionName = "Documents"
    vectorizerName = "text2vec-transformers"

    # Start from a clean slate: remove a collection left over from a prior run.
    if client.collections.exists(collectionName):
        client.collections.delete(collectionName)

    # Title taken from the HTML document.
    titleProperty = {
        "name": "title",
        "dataType": ["text"],
        "description": "HTML doc title.",
        "vectorizer": vectorizerName,
        "moduleConfig": {
            vectorizerName: {
                "vectorizePropertyName": True,
                "skip": False,
                "tokenization": "lowercase",
            }
        },
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2,
            },
        },
    }

    # Full text of the page.
    contentProperty = {
        "name": "content",
        "dataType": ["text"],
        "description": "HTML page content.",
        "moduleConfig": {
            vectorizerName: {
                "vectorizePropertyName": True,
                "tokenization": "whitespace",
            }
        },
    }

    schema = {
        "class": collectionName,
        "description": "For first attempt at loading a Weviate database.",
        "vectorizer": vectorizerName,
        "moduleConfig": {
            vectorizerName: {
                "vectorizeClassName": False,
            }
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [titleProperty, contentProperty],
    }
    return client.collections.create_from_dict(schema)
205
+
206
+
207
+ #################################################################
208
+ # Create document and chunk objects in database.
209
+ #################################################################
210
def createDatabaseObjects():
    """Insert one object per page into ``wpCollection`` and one per chunk into ``wpChunkCollection``.

    Walks ``webpageDocNames`` and uses the same position to look up the title
    in ``webpageTitles``, the text in ``page_contentArray`` and the chunk list
    in ``webpageChunks``.  Each chunk object carries the value returned by the
    page insert under ``references -> webpage``.
    """
    logger.info("#### Create page/doc and chunk db objects.")

    for docPos, docName in enumerate(webpageDocNames):
        docTitle = webpageTitles[docPos]
        logger.debug(f"## className, title: {docName}, {docTitle}")

        # Insert the page itself and remember what the insert returned so the
        # chunks can point back at it.
        docProperties = {
            "name": docName,
            "title": docTitle,
            "content": page_contentArray[docPos],
        }
        wpCollectionObj_uuid = wpCollection.data.insert(docProperties)

        # Insert the chunks that belong to this page.
        for chunkPos, chunkText in enumerate(webpageChunks[docPos]):
            chunkProperties = {
                "title": docTitle,
                "chunk": chunkText,
                "chunk_index": chunkPos,
                "references": {
                    "webpage": wpCollectionObj_uuid,
                },
            }
            wpChunkCollection.data.insert(chunkProperties)
239
+
240
+
241
+ #################################################################
242
+ # Create display widgets.
243
+ #################################################################
244
+ output_widget = ""
245
+ systemTextArea = ""
246
+ userTextArea = ""
247
+ ragPromptTextArea = ""
248
+ responseTextArea = ""
249
+ selectRag = ""
250
+ submitButton = ""
251
def createWidgets():
    """Build the notebook widgets and bind them to the module-level names.

    The module declares placeholders (``output_widget``, ``systemTextArea``,
    ``userTextArea``, ``ragPromptTextArea``, ``responseTextArea``,
    ``selectRag``, ``submitButton``).  Without the ``global`` declarations the
    assignments below would create function-local names that are discarded on
    return, leaving the placeholders untouched.
    """
    global output_widget, systemTextArea, userTextArea
    global ragPromptTextArea, responseTextArea, selectRag, submitButton

    output_widget = widgets.Output()
    with output_widget:
        print("### Create widgets entered.")

    systemTextArea = widgets.Textarea(
        value='',
        placeholder='Enter System Prompt.',
        description='Sys Prompt: ',
        disabled=False,
        layout=widgets.Layout(width='300px', height='80px'),
    )

    userTextArea = widgets.Textarea(
        value='',
        placeholder='Enter User Prompt.',
        description='User Prompt: ',
        disabled=False,
        layout=widgets.Layout(width='435px', height='110px'),
    )

    ragPromptTextArea = widgets.Textarea(
        value='',
        placeholder='App generated prompt with RAG information.',
        description='RAG Prompt: ',
        disabled=False,
        layout=widgets.Layout(width='580px', height='180px'),
    )

    responseTextArea = widgets.Textarea(
        value='',
        placeholder='LLM generated response.',
        description='LLM Resp: ',
        disabled=False,
        layout=widgets.Layout(width='780px', height='200px'),
    )

    selectRag = widgets.Checkbox(
        value=False,
        description='Use RAG',
        disabled=False,
    )

    submitButton = widgets.Button(
        description='Run Model.',
        disabled=False,
        button_style='',  # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click',
        icon='check',  # (FontAwesome names without the `fa-` prefix)
    )
301
+
302
+
303
+ ######################################################################
304
+ # MAINLINE
305
+ ######################################################################
306
# Script body: connect to Weaviate, load and chunk the input pages, rebuild
# the two collections, insert pages and chunks, then run one near-vector
# query and log the title of the page each hit came from.
logger.info("#### MAINLINE ENTERED.")

#pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
pathString = "/app/inputDocs"
chunks = []
webpageDocNames = []
page_contentArray = []
webpageChunks = []
webpageTitles = []
webpageChunksDocNames = []

# Same steps as connectToWeaviateDB(), performed inline at module level.
logger.info("#### Create Weaviate db client connection.")
client = weaviate.connect_to_custom(
    http_host="127.0.0.1",
    http_port=8080,
    http_secure=False,
    grpc_host="127.0.0.1",
    grpc_port=50051,
    grpc_secure=False,
)

# Everything that uses the connection runs under try/finally so the client is
# closed even when loading, inserting or querying raises.
try:
    client.connect()

    readParseChunkFiles()
    wpCollection = createWebpageCollection()
    wpChunkCollection = createChunksCollection()

    # Same steps as createDatabaseObjects(), performed inline at module level.
    logger.info("#### Create page/doc and chunk db objects.")
    for i, className in enumerate(webpageDocNames):
        title = webpageTitles[i]
        logger.debug(f"## className, title: {className}, {title}")
        # Create Webpage Object
        page_content = page_contentArray[i]
        # Insert the document.
        wpCollectionObj_uuid = wpCollection.data.insert(
            {
                "name": className,
                "title": title,
                "content": page_content,
            }
        )

        # Insert the chunks for the document.
        for i2, chunk in enumerate(webpageChunks[i]):
            chunk_uuid = wpChunkCollection.data.insert(
                {
                    "title": title,
                    "chunk": chunk,
                    "chunk_index": i2,
                    "references": {
                        "webpage": wpCollectionObj_uuid,
                    },
                }
            )

    ###############################################################################
    # text contains prompt for vector DB.
    text = "human-made computer cognitive ability"

    ###############################################################################
    # Initialize the sentence transformer and encode the query prompt.
    logger.info(f"#### Encode text query prompt to create vectors. {text}")
    model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')

    vector = model.encode(text)
    # Element-by-element copy of the encoded vector into a plain list.
    vectorList = list(vector)

    logger.debug("#### Print vectors.")
    logger.debug(f"vectorList: {vectorList[2]}")

    # Fetch chunks and print chunks.
    logger.info("#### Retrieve semchunks from db using vectors from prompt.")
    semChunks = wpChunkCollection.query.near_vector(
        near_vector=vectorList,
        distance=0.7,
        limit=3,
    )
    logger.debug(f"### semChunks[0]: {semChunks}")

    # Print chunks, corresponding document and document title.
    logger.info("#### Print individual retrieved chunks.")
    for chunkPos, chunkObj in enumerate(semChunks.objects):
        # Logged as an (index, object) pair, as before.
        logger.info(f"#### chunk: {(chunkPos, chunkObj)}")
        webpage_uuid = chunkObj.properties['references']['webpage']
        logger.info(f"webpage_uuid: {webpage_uuid}")
        wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
        logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
finally:
    logger.info("#### Closing client db connection.")
    client.close()

logger.info("#### Program terminating.")
semsearch.py CHANGED
@@ -1,402 +1,274 @@
1
- import weaviate
2
-
3
- from sentence_transformers import SentenceTransformer
4
- from langchain_community.document_loaders import BSHTMLLoader
5
- from pathlib import Path
6
- from lxml import html
7
- import logging
8
- from semantic_text_splitter import HuggingFaceTextSplitter
9
- from tokenizers import Tokenizer
10
- import json
11
- import os
12
- import re
13
- import logging
14
-
15
- import llama_cpp
16
- from llama_cpp import Llama
17
- import ipywidgets as widgets
18
- import time
19
- from IPython.display import display, clear_output
20
-
21
- weaviate_logger = logging.getLogger("httpx")
22
- weaviate_logger.setLevel(logging.WARNING)
23
-
24
- logger = logging.getLogger(__name__)
25
- logging.basicConfig(level=logging.INFO)
26
-
27
-
28
-
29
- #################################################################
30
- # Connect to Weaviate vector database.
31
- #################################################################
32
- client = ""
33
- def connectToWeaviateDB():
34
- ######################################################
35
- # Connect to the Weaviate vector database.
36
- logger.info("#### Create Weaviate db client connection.")
37
- client = weaviate.connect_to_custom(
38
- http_host="127.0.0.1",
39
- http_port=8080,
40
- http_secure=False,
41
- grpc_host="127.0.0.1",
42
- grpc_port=50051,
43
- grpc_secure=False
44
- )
45
- client.connect()
46
-
47
-
48
- #######################################################
49
- # Read each text input file, parse it into a document,
50
- # chunk it, collect chunks and document name.
51
- #######################################################
52
- webpageDocNames = []
53
- page_contentArray = []
54
- webpageTitles = []
55
- webpageChunks = []
56
- webpageChunksDocNames = []
57
-
58
- def readParseChunkFiles():
59
- logger.info("#### Read and chunk input text files.")
60
- for filename in os.listdir(pathString):
61
- logger.info(filename)
62
- path = Path(pathString + "/" + filename)
63
- filename = filename.rstrip(".html")
64
- webpageDocNames.append(filename)
65
- htmlLoader = BSHTMLLoader(path,"utf-8")
66
- htmlData = htmlLoader.load()
67
-
68
- title = htmlData[0].metadata['title']
69
- page_content = htmlData[0].page_content
70
-
71
- # Clean data. Remove multiple newlines, etc.
72
- page_content = re.sub(r'\n+', '\n',page_content)
73
-
74
- page_contentArray.append(page_content);
75
- webpageTitles.append (title)
76
- max_tokens = 1000
77
- tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
78
- logger.debug(f"### tokenizer: {tokenizer}")
79
- splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
80
- chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
81
-
82
- chunks = []
83
- for chnk in chunksOnePage:
84
- logger.debug(f"#### chnk in file: {chnk}")
85
- chunks.append(chnk)
86
- logger.debug(f"chunks: {chunks}")
87
- webpageChunks.append(chunks)
88
- webpageChunksDocNames.append(filename + "Chunks")
89
-
90
- logger.debug(f"### filename, title: {filename}, {title}")
91
-
92
- logger.debug(f"### webpageDocNames: {webpageDocNames}")
93
-
94
- #################################################################
95
- # Create the chunks collection for the Weaviate database.
96
- #################################################################
97
- def createChunksCollection():
98
- logger.info("#### createChunksCollection() entered.")
99
- if client.collections.exists("Chunks"):
100
- client.collections.delete("Chunks")
101
-
102
- class_obj = {
103
- "class": "Chunks",
104
- "description": "Collection for document chunks.",
105
- "vectorizer": "text2vec-transformers",
106
- "moduleConfig": {
107
- "text2vec-transformers": {
108
- "vectorizeClassName": True
109
- }
110
- },
111
- "vectorIndexType": "hnsw",
112
- "vectorIndexConfig": {
113
- "distance": "cosine",
114
- },
115
- "properties": [
116
- {
117
- "name": "chunk",
118
- "dataType": ["text"],
119
- "description": "Single webpage chunk.",
120
- "vectorizer": "text2vec-transformers",
121
- "moduleConfig": {
122
- "text2vec-transformers": {
123
- "vectorizePropertyName": False,
124
- "skip": False,
125
- "tokenization": "lowercase"
126
- }
127
- }
128
- },
129
- {
130
- "name": "chunk_index",
131
- "dataType": ["int"]
132
- },
133
- {
134
- "name": "webpage",
135
- "dataType": ["Documents"],
136
- "description": "Webpage content chunks.",
137
-
138
- "invertedIndexConfig": {
139
- "bm25": {
140
- "b": 0.75,
141
- "k1": 1.2
142
- }
143
- }
144
- }
145
- ]
146
- }
147
- return(client.collections.create_from_dict(class_obj))
148
-
149
-
150
- #####################################################################
151
- # Create the document collection for the Weaviate database.
152
- #####################################################################
153
- def createWebpageCollection():
154
- logger.info("#### createWebpageCollection() entered.")
155
- if client.collections.exists("Documents"):
156
- client.collections.delete("Documents")
157
-
158
- class_obj = {
159
- "class": "Documents",
160
- "description": "For first attempt at loading a Weviate database.",
161
- "vectorizer": "text2vec-transformers",
162
- "moduleConfig": {
163
- "text2vec-transformers": {
164
- "vectorizeClassName": False
165
- }
166
- },
167
- "vectorIndexType": "hnsw",
168
- "vectorIndexConfig": {
169
- "distance": "cosine",
170
- },
171
- "properties": [
172
- {
173
- "name": "title",
174
- "dataType": ["text"],
175
- "description": "HTML doc title.",
176
- "vectorizer": "text2vec-transformers",
177
- "moduleConfig": {
178
- "text2vec-transformers": {
179
- "vectorizePropertyName": True,
180
- "skip": False,
181
- "tokenization": "lowercase"
182
- }
183
- },
184
- "invertedIndexConfig": {
185
- "bm25": {
186
- "b": 0.75,
187
- "k1": 1.2
188
- },
189
- }
190
- },
191
- {
192
- "name": "content",
193
- "dataType": ["text"],
194
- "description": "HTML page content.",
195
- "moduleConfig": {
196
- "text2vec-transformers": {
197
- "vectorizePropertyName": True,
198
- "tokenization": "whitespace"
199
- }
200
- }
201
- }
202
- ]
203
- }
204
- return(client.collections.create_from_dict(class_obj))
205
-
206
-
207
- #################################################################
208
- # Create document and chunk objects in database.
209
- #################################################################
210
- def createDatabaseObjects():
211
- logger.info("#### Create page/doc and chunk db objects.")
212
- for i, className in enumerate(webpageDocNames):
213
- title = webpageTitles[i]
214
- logger.debug(f"## className, title: {className}, {title}")
215
- # Create Webpage Object
216
- page_content = page_contentArray[i]
217
- # Insert the document.
218
- wpCollectionObj_uuid = wpCollection.data.insert(
219
- {
220
- "name": className,
221
- "title": title,
222
- "content": page_content
223
- }
224
- )
225
-
226
- # Insert the chunks for the document.
227
- for i2, chunk in enumerate(webpageChunks[i]):
228
- chunk_uuid = wpChunkCollection.data.insert(
229
- {
230
- "title": title,
231
- "chunk": chunk,
232
- "chunk_index": i2,
233
- "references":
234
- {
235
- "webpage": wpCollectionObj_uuid
236
- }
237
- }
238
- )
239
-
240
-
241
- #################################################################
242
- # Create display widgets.
243
- #################################################################
244
- output_widget = ""
245
- systemTextArea = ""
246
- userTextArea = ""
247
- ragPromptTextArea = ""
248
- responseTextArea = ""
249
- selectRag = ""
250
- submitButton = ""
251
- def createWidgets():
252
- output_widget = widgets.Output()
253
- with output_widget:
254
- print("### Create widgets entered.")
255
-
256
- systemTextArea = widgets.Textarea(
257
- value='',
258
- placeholder='Enter System Prompt.',
259
- description='Sys Prompt: ',
260
- disabled=False,
261
- layout=widgets.Layout(width='300px', height='80px')
262
- )
263
-
264
- userTextArea = widgets.Textarea(
265
- value='',
266
- placeholder='Enter User Prompt.',
267
- description='User Prompt: ',
268
- disabled=False,
269
- layout=widgets.Layout(width='435px', height='110px')
270
- )
271
-
272
- ragPromptTextArea = widgets.Textarea(
273
- value='',
274
- placeholder='App generated prompt with RAG information.',
275
- description='RAG Prompt: ',
276
- disabled=False,
277
- layout=widgets.Layout(width='580px', height='180px')
278
- )
279
-
280
- responseTextArea = widgets.Textarea(
281
- value='',
282
- placeholder='LLM generated response.',
283
- description='LLM Resp: ',
284
- disabled=False,
285
- layout=widgets.Layout(width='780px', height='200px')
286
- )
287
-
288
- selectRag = widgets.Checkbox(
289
- value=False,
290
- description='Use RAG',
291
- disabled=False
292
- )
293
-
294
- submitButton = widgets.Button(
295
- description='Run Model.',
296
- disabled=False,
297
- button_style='', # 'success', 'info', 'warning', 'danger' or ''
298
- tooltip='Click',
299
- icon='check' # (FontAwesome names without the `fa-` prefix)
300
- )
301
-
302
-
303
- ######################################################################
304
- # MAINLINE
305
- ######################################################################
306
- logger.info("#### MAINLINE ENTERED.")
307
-
308
- #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
309
- pathString = "/app/inputDocs"
310
- chunks = []
311
- webpageDocNames = []
312
- page_contentArray = []
313
- webpageChunks = []
314
- webpageTitles = []
315
- webpageChunksDocNames = []
316
-
317
- #connectToWeaviateDB()
318
- logger.info("#### Create Weaviate db client connection.")
319
- client = weaviate.connect_to_custom(
320
- http_host="127.0.0.1",
321
- http_port=8080,
322
- http_secure=False,
323
- grpc_host="127.0.0.1",
324
- grpc_port=50051,
325
- grpc_secure=False
326
- )
327
- client.connect()
328
-
329
- readParseChunkFiles()
330
- wpCollection = createWebpageCollection()
331
- wpChunkCollection = createChunksCollection()
332
-
333
- #createDatabaseObjects()
334
- logger.info("#### Create page/doc and chunk db objects.")
335
- for i, className in enumerate(webpageDocNames):
336
- title = webpageTitles[i]
337
- logger.debug(f"## className, title: {className}, {title}")
338
- # Create Webpage Object
339
- page_content = page_contentArray[i]
340
- # Insert the document.
341
- wpCollectionObj_uuid = wpCollection.data.insert(
342
- {
343
- "name": className,
344
- "title": title,
345
- "content": page_content
346
- }
347
- )
348
-
349
- # Insert the chunks for the document.
350
- for i2, chunk in enumerate(webpageChunks[i]):
351
- chunk_uuid = wpChunkCollection.data.insert(
352
- {
353
- "title": title,
354
- "chunk": chunk,
355
- "chunk_index": i2,
356
- "references":
357
- {
358
- "webpage": wpCollectionObj_uuid
359
- }
360
- }
361
- )
362
-
363
- ###############################################################################
364
- # text contains prompt for vector DB.
365
- text = "human-made computer cognitive ability"
366
-
367
-
368
- ###############################################################################
369
- # Initial the the sentence transformer and encode the query prompt.
370
- logger.info(f"#### Encode text query prompt to create vectors. {text}")
371
- model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')
372
-
373
- vector = model.encode(text)
374
- vectorList = []
375
-
376
- logger.debug("#### Print vectors.")
377
- for vec in vector:
378
- vectorList.append(vec)
379
- logger.debug(f"vectorList: {vectorList[2]}")
380
-
381
- # Fetch chunks and print chunks.
382
- logger.info("#### Retrieve semchunks from db using vectors from prompt.")
383
- semChunks = wpChunkCollection.query.near_vector(
384
- near_vector=vectorList,
385
- distance=0.7,
386
- limit=3
387
- )
388
- logger.debug(f"### semChunks[0]: {semChunks}")
389
-
390
- # Print chunks, corresponding document and document title.
391
- logger.info("#### Print individual retrieved chunks.")
392
- for chunk in enumerate(semChunks.objects):
393
- logger.info(f"#### chunk: {chunk}")
394
- webpage_uuid = chunk[1].properties['references']['webpage']
395
- logger.info(f"webpage_uuid: {webpage_uuid}")
396
- wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
397
- logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
398
-
399
- logger.info("#### Closing client db connection.")
400
- client.close()
401
-
402
- logger.info("#### Program terminating.")
 
1
+ import weaviate
2
+
3
+ from sentence_transformers import SentenceTransformer
4
+ from langchain_community.document_loaders import BSHTMLLoader
5
+ from pathlib import Path
6
+ from lxml import html
7
+ import logging
8
+ from semantic_text_splitter import HuggingFaceTextSplitter
9
+ from tokenizers import Tokenizer
10
+ import json
11
+ import os
12
+ import re
13
+ import logging
14
+
15
+ weaviate_logger = logging.getLogger("httpx")
16
+ weaviate_logger.setLevel(logging.WARNING)
17
+
18
+ logger = logging.getLogger(__name__)
19
+ logging.basicConfig(level=logging.INFO)
20
+
21
+
22
+ #################################################################
23
+ # Create the chunks collection for the Weaviate database.
24
def createChunksCollection():
    """Drop any existing "Chunks" collection and create it again from a schema dict.

    Reads the module-level ``client``.  The schema declares three properties:
    ``chunk`` (text), ``chunk_index`` (int) and ``webpage`` (data type
    ``Documents``).

    Returns:
        Whatever ``client.collections.create_from_dict`` returns for the schema.
    """
    logger.info("#### createChunksCollection() entered.")

    collectionName = "Chunks"
    vectorizerName = "text2vec-transformers"

    # Start from a clean slate: remove a collection left over from a prior run.
    if client.collections.exists(collectionName):
        client.collections.delete(collectionName)

    # Text of a single chunk.
    chunkTextProperty = {
        "name": "chunk",
        "dataType": ["text"],
        "description": "Single webpage chunk.",
        "vectorizer": vectorizerName,
        "moduleConfig": {
            vectorizerName: {
                "vectorizePropertyName": False,
                "skip": False,
                "tokenization": "lowercase",
            }
        },
    }

    # Position of the chunk within its page.
    chunkIndexProperty = {
        "name": "chunk_index",
        "dataType": ["int"],
    }

    # Property typed as the "Documents" collection.
    webpageProperty = {
        "name": "webpage",
        "dataType": ["Documents"],
        "description": "Webpage content chunks.",
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2,
            }
        },
    }

    schema = {
        "class": collectionName,
        "description": "Collection for document chunks.",
        "vectorizer": vectorizerName,
        "moduleConfig": {
            vectorizerName: {
                "vectorizeClassName": True,
            }
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [chunkTextProperty, chunkIndexProperty, webpageProperty],
    }
    return client.collections.create_from_dict(schema)
75
+
76
+
77
+ #####################################################################
78
+ # Create the document collection for the Weaviate database.
79
def createWebpageCollection():
    """Drop any existing "Documents" collection and create it again from a schema dict.

    Reads the module-level ``client``.  The schema declares two text
    properties, ``title`` and ``content``.

    Returns:
        Whatever ``client.collections.create_from_dict`` returns for the schema.
    """
    logger.info("#### createWebpageCollection() entered.")

    collectionName = "Documents"
    vectorizerName = "text2vec-transformers"

    # Start from a clean slate: remove a collection left over from a prior run.
    if client.collections.exists(collectionName):
        client.collections.delete(collectionName)

    # Title taken from the HTML document.
    titleProperty = {
        "name": "title",
        "dataType": ["text"],
        "description": "HTML doc title.",
        "vectorizer": vectorizerName,
        "moduleConfig": {
            vectorizerName: {
                "vectorizePropertyName": True,
                "skip": False,
                "tokenization": "lowercase",
            }
        },
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2,
            },
        },
    }

    # Full text of the page.
    contentProperty = {
        "name": "content",
        "dataType": ["text"],
        "description": "HTML page content.",
        "moduleConfig": {
            vectorizerName: {
                "vectorizePropertyName": True,
                "tokenization": "whitespace",
            }
        },
    }

    schema = {
        "class": collectionName,
        "description": "For first attempt at loading a Weviate database.",
        "vectorizer": vectorizerName,
        "moduleConfig": {
            vectorizerName: {
                "vectorizeClassName": False,
            }
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [titleProperty, contentProperty],
    }
    return client.collections.create_from_dict(schema)
131
+
132
+
133
+ ######################################################################
134
+ # MAINLINE
135
+ #
136
# Script body: connect to Weaviate, load and chunk the input pages, rebuild
# the two collections, insert pages and chunks, then run one near-vector
# query and log the title of the page each hit came from.
logger.info("#### MAINLINE ENTERED.")

#pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
pathString = "/app/inputDocs"
chunks = []
webpageDocNames = []
page_contentArray = []
webpageChunks = []
webpageTitles = []
webpageChunksDocNames = []


######################################################
# Connect to the Weaviate vector database.
logger.info("#### Create Weaviate db client connection.")
client = weaviate.connect_to_custom(
    http_host="127.0.0.1",
    http_port=8080,
    http_secure=False,
    grpc_host="127.0.0.1",
    grpc_port=50051,
    grpc_secure=False,
)

# Everything that uses the connection runs under try/finally so the client is
# closed even when loading, inserting or querying raises.
try:
    client.connect()

    #######################################################
    # Read each text input file, parse it into a document,
    # chunk it, collect chunks and document name.
    logger.info("#### Read and chunk input text files.")

    # Neither the tokenizer nor the splitter depends on the file being read,
    # so they are built once here instead of once per file.
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
    logger.debug(f"### tokenizer: {tokenizer}")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

    htmlSuffix = ".html"
    for filename in os.listdir(pathString):
        logger.info(filename)
        path = Path(pathString) / filename

        # str.rstrip(".html") removes any trailing run of the characters
        # '.', 'h', 't', 'm', 'l' (so "math.html" became "ma").  Remove the
        # literal extension only.
        if filename.endswith(htmlSuffix):
            filename = filename[:-len(htmlSuffix)]
        webpageDocNames.append(filename)

        htmlLoader = BSHTMLLoader(path, "utf-8")
        htmlData = htmlLoader.load()

        title = htmlData[0].metadata['title']
        page_content = htmlData[0].page_content

        # Clean data. Collapse runs of newlines into a single newline.
        page_content = re.sub(r'\n+', '\n', page_content)

        page_contentArray.append(page_content)
        webpageTitles.append(title)

        chunks = []
        for chnk in splitter.chunks(page_content, chunk_capacity=50):
            logger.debug(f"#### chnk in file: {chnk}")
            chunks.append(chnk)
        logger.debug(f"chunks: {chunks}")
        webpageChunks.append(chunks)
        webpageChunksDocNames.append(filename + "Chunks")

        logger.debug(f"### filename, title: {filename}, {title}")

    logger.debug(f"### webpageDocNames: {webpageDocNames}")

    ######################################################
    # Create database webpage and chunks collections.
    wpCollection = createWebpageCollection()
    wpChunkCollection = createChunksCollection()

    ###########################################################
    # Create document and chunks objects in the database.
    logger.info("#### Create page/doc and chunk db objects.")
    for i, className in enumerate(webpageDocNames):
        title = webpageTitles[i]
        logger.debug(f"## className, title: {className}, {title}")
        # Create Webpage Object
        page_content = page_contentArray[i]
        # Insert the document.
        wpCollectionObj_uuid = wpCollection.data.insert(
            {
                "name": className,
                "title": title,
                "content": page_content,
            }
        )

        # Insert the chunks for the document.
        for i2, chunk in enumerate(webpageChunks[i]):
            chunk_uuid = wpChunkCollection.data.insert(
                {
                    "title": title,
                    "chunk": chunk,
                    "chunk_index": i2,
                    "references": {
                        "webpage": wpCollectionObj_uuid,
                    },
                }
            )

    ###############################################################################
    # text contains prompt for vector DB.
    text = "human-made computer cognitive ability"

    ###############################################################################
    # Initialize the sentence transformer and encode the query prompt.
    logger.info(f"#### Encode text query prompt to create vectors. {text}")
    model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')

    vector = model.encode(text)
    # Element-by-element copy of the encoded vector into a plain list.
    vectorList = list(vector)

    logger.debug("#### Print vectors.")
    logger.debug(f"vectorList: {vectorList[2]}")

    # Fetch chunks and print chunks.
    logger.info("#### Retrieve semchunks from db using vectors from prompt.")
    semChunks = wpChunkCollection.query.near_vector(
        near_vector=vectorList,
        distance=0.7,
        limit=3,
    )
    logger.debug(f"### semChunks[0]: {semChunks}")

    # Print chunks, corresponding document and document title.
    logger.info("#### Print individual retrieved chunks.")
    for chunkPos, chunkObj in enumerate(semChunks.objects):
        # Logged as an (index, object) pair, as before.
        logger.info(f"#### chunk: {(chunkPos, chunkObj)}")
        webpage_uuid = chunkObj.properties['references']['webpage']
        logger.info(f"webpage_uuid: {webpage_uuid}")
        wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
        logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
finally:
    logger.info("#### Closing client db connection.")
    client.close()

logger.info("#### Program terminating.")