MVPilgrim committed on
Commit
ab80d2d
·
1 Parent(s): c3b8950

Begin adding UI components.

Browse files
Files changed (2) hide show
  1. semsearch.py +174 -85
  2. semsearch_Orig.py +274 -0
semsearch.py CHANGED
@@ -12,6 +12,12 @@ import os
12
  import re
13
  import logging
14
 
 
 
 
 
 
 
15
  weaviate_logger = logging.getLogger("httpx")
16
  weaviate_logger.setLevel(logging.WARNING)
17
 
@@ -19,8 +25,75 @@ logger = logging.getLogger(__name__)
19
  logging.basicConfig(level=logging.INFO)
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  #################################################################
23
  # Create the chunks collection for the Weaviate database.
 
24
  def createChunksCollection():
25
  logger.info("#### createChunksCollection() entered.")
26
  if client.collections.exists("Chunks"):
@@ -76,6 +149,7 @@ def createChunksCollection():
76
 
77
  #####################################################################
78
  # Create the document collection for the Weaviate database.
 
79
  def createWebpageCollection():
80
  logger.info("#### createWebpageCollection() entered.")
81
  if client.collections.exists("Documents"):
@@ -130,9 +204,105 @@ def createWebpageCollection():
130
  return(client.collections.create_from_dict(class_obj))
131
 
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  ######################################################################
134
  # MAINLINE
135
- #
136
  logger.info("#### MAINLINE ENTERED.")
137
 
138
  #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
@@ -144,93 +314,12 @@ webpageChunks = []
144
  webpageTitles = []
145
  webpageChunksDocNames = []
146
 
147
-
148
- ######################################################
149
- # Connect to the Weaviate vector database.
150
- logger.info("#### Create Weaviate db client connection.")
151
- client = weaviate.connect_to_custom(
152
- http_host="127.0.0.1",
153
- http_port=8080,
154
- http_secure=False,
155
- grpc_host="127.0.0.1",
156
- grpc_port=50051,
157
- grpc_secure=False
158
- )
159
- client.connect()
160
-
161
- #######################################################
162
- # Read each text input file, parse it into a document,
163
- # chunk it, collect chunks and document name.
164
- logger.info("#### Read and chunk input text files.")
165
- for filename in os.listdir(pathString):
166
- logger.info(filename)
167
- path = Path(pathString + "/" + filename)
168
- filename = filename.rstrip(".html")
169
- webpageDocNames.append(filename)
170
- htmlLoader = BSHTMLLoader(path,"utf-8")
171
- htmlData = htmlLoader.load()
172
-
173
- title = htmlData[0].metadata['title']
174
- page_content = htmlData[0].page_content
175
-
176
- # Clean data. Remove multiple newlines, etc.
177
- page_content = re.sub(r'\n+', '\n',page_content)
178
-
179
- page_contentArray.append(page_content);
180
- webpageTitles.append(title)
181
- max_tokens = 1000
182
- tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
183
- logger.debug(f"### tokenizer: {tokenizer}")
184
- splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
185
- chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
186
-
187
- chunks = []
188
- for chnk in chunksOnePage:
189
- logger.debug(f"#### chnk in file: {chnk}")
190
- chunks.append(chnk)
191
- logger.debug(f"chunks: {chunks}")
192
- webpageChunks.append(chunks)
193
- webpageChunksDocNames.append(filename + "Chunks")
194
-
195
- logger.debug(f"### filename, title: {filename}, {title}")
196
-
197
- logger.debug(f"### webpageDocNames: {webpageDocNames}")
198
-
199
- ######################################################
200
- # Create database webpage and chunks collections.
201
  wpCollection = createWebpageCollection()
202
  wpChunkCollection = createChunksCollection()
 
203
 
204
- ###########################################################
205
- # Create document and chunks objects in the database.
206
- logger.info("#### Create page/doc and chunk db objects.")
207
- for i, className in enumerate(webpageDocNames):
208
- title = webpageTitles[i]
209
- logger.debug(f"## className, title: {className}, {title}")
210
- # Create Webpage Object
211
- page_content = page_contentArray[i]
212
- # Insert the document.
213
- wpCollectionObj_uuid = wpCollection.data.insert(
214
- {
215
- "name": className,
216
- "title": title,
217
- "content": page_content
218
- }
219
- )
220
-
221
- # Insert the chunks for the document.
222
- for i2, chunk in enumerate(webpageChunks[i]):
223
- chunk_uuid = wpChunkCollection.data.insert(
224
- {
225
- "title": title,
226
- "chunk": chunk,
227
- "chunk_index": i2,
228
- "references":
229
- {
230
- "webpage": wpCollectionObj_uuid
231
- }
232
- }
233
- )
234
 
235
  ###############################################################################
236
  # text contains prompt for vector DB.
 
12
  import re
13
  import logging
14
 
15
+ import llama_cpp
16
+ from llama_cpp import Llama
17
+ import ipywidgets as widgets
18
+ import time
19
+ from IPython.display import display, clear_output
20
+
21
  weaviate_logger = logging.getLogger("httpx")
22
  weaviate_logger.setLevel(logging.WARNING)
23
 
 
25
  logging.basicConfig(level=logging.INFO)
26
 
27
 
28
+
29
+ #################################################################
30
+ # Connect to Weaviate vector database.
31
+ #################################################################
32
+ client = ""
33
def connectToDatabase():
    """Connect to the Weaviate vector database and publish the client.

    The connection is stored in the module-level ``client`` variable, which
    the collection-creation functions in this file read, and is also
    returned for convenience.

    Returns:
        The Weaviate client produced by ``weaviate.connect_to_custom``.
    """
    # Without this declaration the assignment below only binds a
    # function-local name and the module-level ``client`` stays "".
    global client

    logger.info("#### Create Weaviate db client connection.")
    client = weaviate.connect_to_custom(
        http_host="127.0.0.1",
        http_port=8080,
        http_secure=False,
        grpc_host="127.0.0.1",
        grpc_port=50051,
        grpc_secure=False
    )
    client.connect()
    return client
46
+
47
+
48
+ #######################################################
49
+ # Read each text input file, parse it into a document,
50
+ # chunk it, collect chunks and document name.
51
+ #######################################################
52
+ webpageDocNames = []
53
+ page_contentArray = []
54
+ webpageTitles = []
55
+ webpageChunks = []
56
+ webpageChunksDocNames = []
57
+
58
def readParseChunkFiles():
    """Read every HTML file in ``pathString``, parse it and chunk its text.

    For each file the following module-level lists receive one entry:
    ``webpageDocNames`` (file name without the ``.html`` suffix),
    ``webpageTitles`` (title from the loaded document's metadata),
    ``page_contentArray`` (page text with runs of newlines collapsed),
    ``webpageChunks`` (list of text chunks for the page) and
    ``webpageChunksDocNames`` (document name with "Chunks" appended).
    """
    logger.info("#### Read and chunk input text files.")

    # The tokenizer and splitter do not depend on the file being processed,
    # so build them once instead of once per file.
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
    logger.debug(f"### tokenizer: {tokenizer}")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

    htmlSuffix = ".html"
    for filename in os.listdir(pathString):
        logger.info(filename)
        path = Path(pathString + "/" + filename)

        # str.rstrip(".html") strips any trailing run of the characters
        # '.', 'h', 't', 'm', 'l' (e.g. "math.html" -> "ma"), so remove the
        # literal suffix instead.
        if filename.endswith(htmlSuffix):
            filename = filename[:-len(htmlSuffix)]
        webpageDocNames.append(filename)

        htmlLoader = BSHTMLLoader(path, "utf-8")
        htmlData = htmlLoader.load()

        title = htmlData[0].metadata['title']
        page_content = htmlData[0].page_content

        # Clean data. Remove multiple newlines, etc.
        page_content = re.sub(r'\n+', '\n', page_content)

        page_contentArray.append(page_content)
        webpageTitles.append(title)

        chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)

        chunks = []
        for chnk in chunksOnePage:
            logger.debug(f"#### chnk in file: {chnk}")
            chunks.append(chnk)
        logger.debug(f"chunks: {chunks}")
        webpageChunks.append(chunks)
        webpageChunksDocNames.append(filename + "Chunks")

        logger.debug(f"### filename, title: {filename}, {title}")

    logger.debug(f"### webpageDocNames: {webpageDocNames}")
93
+
94
  #################################################################
95
  # Create the chunks collection for the Weaviate database.
96
+ #################################################################
97
  def createChunksCollection():
98
  logger.info("#### createChunksCollection() entered.")
99
  if client.collections.exists("Chunks"):
 
149
 
150
  #####################################################################
151
  # Create the document collection for the Weaviate database.
152
+ #####################################################################
153
  def createWebpageCollection():
154
  logger.info("#### createWebpageCollection() entered.")
155
  if client.collections.exists("Documents"):
 
204
  return(client.collections.create_from_dict(class_obj))
205
 
206
 
207
+ #################################################################
208
+ # Create document and chunk objects in database.
209
+ #################################################################
210
def createDatabaseObjects():
    """Insert every parsed document and its chunks into the database.

    Reads the module-level lists ``webpageDocNames``, ``webpageTitles``,
    ``page_contentArray`` and ``webpageChunks`` (index ``i`` of each list
    describes the same document). Each document is inserted into
    ``wpCollection``; each of its chunks is inserted into
    ``wpChunkCollection`` together with the uuid returned for the document.
    """
    logger.info("#### Create page/doc and chunk db objects.")
    for i, className in enumerate(webpageDocNames):
        title = webpageTitles[i]
        logger.debug(f"## className, title: {className}, {title}")
        page_content = page_contentArray[i]

        # Insert the document; the returned uuid links the chunks to it.
        wpCollectionObj_uuid = wpCollection.data.insert(
            {
                "name": className,
                "title": title,
                "content": page_content
            }
        )

        # Insert the chunks for the document.
        for i2, chunk in enumerate(webpageChunks[i]):
            wpChunkCollection.data.insert(
                {
                    "title": title,
                    "chunk": chunk,
                    "chunk_index": i2,
                    "references":
                    {
                        "webpage": wpCollectionObj_uuid
                    }
                }
            )
239
+
240
+
241
+ #################################################################
242
+ # Create display widgets.
243
+ #################################################################
244
+ output_widget = ""
245
+ systemTextArea = ""
246
+ userTextArea = ""
247
+ ragPromptTextArea = ""
248
+ responseTextArea = ""
249
+ selectRag = ""
250
+ submitButton = ""
251
def createWidgets():
    """Create the UI widgets and store them in the module-level variables.

    Populates ``output_widget``, ``systemTextArea``, ``userTextArea``,
    ``ragPromptTextArea``, ``responseTextArea``, ``selectRag`` and
    ``submitButton``, which are declared as placeholders at module level.
    """
    # Without this declaration every assignment below would create a local
    # that is discarded on return, leaving the module-level placeholders "".
    global output_widget, systemTextArea, userTextArea
    global ragPromptTextArea, responseTextArea, selectRag, submitButton

    output_widget = widgets.Output()
    with output_widget:
        print("### Create widgets entered.")

    systemTextArea = widgets.Textarea(
        value='',
        placeholder='Enter System Prompt.',
        description='Sys Prompt: ',
        disabled=False,
        layout=widgets.Layout(width='300px', height='80px')
    )

    userTextArea = widgets.Textarea(
        value='',
        placeholder='Enter User Prompt.',
        description='User Prompt: ',
        disabled=False,
        layout=widgets.Layout(width='435px', height='110px')
    )

    ragPromptTextArea = widgets.Textarea(
        value='',
        placeholder='App generated prompt with RAG information.',
        description='RAG Prompt: ',
        disabled=False,
        layout=widgets.Layout(width='580px', height='180px')
    )

    responseTextArea = widgets.Textarea(
        value='',
        placeholder='LLM generated response.',
        description='LLM Resp: ',
        disabled=False,
        layout=widgets.Layout(width='780px', height='200px')
    )

    selectRag = widgets.Checkbox(
        value=False,
        description='Use RAG',
        disabled=False
    )

    submitButton = widgets.Button(
        description='Run Model.',
        disabled=False,
        button_style='',  # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click',
        icon='check'  # (FontAwesome names without the `fa-` prefix)
    )
301
+
302
+
303
  ######################################################################
304
  # MAINLINE
305
+ ######################################################################
306
  logger.info("#### MAINLINE ENTERED.")
307
 
308
  #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
 
314
  webpageTitles = []
315
  webpageChunksDocNames = []
316
 
317
# The connection function in this file is named connectToDatabase();
# connectToWeaviateDB() is not defined and would raise NameError.
connectToDatabase()
318
+ readParseChunkFiles()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  wpCollection = createWebpageCollection()
320
  wpChunkCollection = createChunksCollection()
321
+ createDatabaseObjects()
322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
 
324
  ###############################################################################
325
  # text contains prompt for vector DB.
semsearch_Orig.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import weaviate
2
+
3
+ from sentence_transformers import SentenceTransformer
4
+ from langchain_community.document_loaders import BSHTMLLoader
5
+ from pathlib import Path
6
+ from lxml import html
7
+ import logging
8
+ from semantic_text_splitter import HuggingFaceTextSplitter
9
+ from tokenizers import Tokenizer
10
+ import json
11
+ import os
12
+ import re
13
+ import logging
14
+
15
+ weaviate_logger = logging.getLogger("httpx")
16
+ weaviate_logger.setLevel(logging.WARNING)
17
+
18
+ logger = logging.getLogger(__name__)
19
+ logging.basicConfig(level=logging.INFO)
20
+
21
+
22
+ #################################################################
23
+ # Create the chunks collection for the Weaviate database.
24
def createChunksCollection():
    """Create a fresh "Chunks" collection in the Weaviate database.

    An existing "Chunks" collection is deleted first. The schema is
    assembled from one dict per property and handed to
    ``client.collections.create_from_dict``, whose result is returned.
    """
    logger.info("#### createChunksCollection() entered.")

    # Start from a clean slate: drop the collection when it already exists.
    if client.collections.exists("Chunks"):
        client.collections.delete("Chunks")

    # Text of a single chunk.
    chunkProperty = {
        "name": "chunk",
        "dataType": ["text"],
        "description": "Single webpage chunk.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": False,
                "skip": False,
                "tokenization": "lowercase"
            }
        }
    }

    # Position of the chunk within its page.
    chunkIndexProperty = {
        "name": "chunk_index",
        "dataType": ["int"]
    }

    # Property typed as the "Documents" collection.
    webpageProperty = {
        "name": "webpage",
        "dataType": ["Documents"],
        "description": "Webpage content chunks.",
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2
            }
        }
    }

    chunksSchema = {
        "class": "Chunks",
        "description": "Collection for document chunks.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizeClassName": True
            }
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [chunkProperty, chunkIndexProperty, webpageProperty]
    }
    return client.collections.create_from_dict(chunksSchema)
75
+
76
+
77
+ #####################################################################
78
+ # Create the document collection for the Weaviate database.
79
def createWebpageCollection():
    """Create a fresh "Documents" collection in the Weaviate database.

    An existing "Documents" collection is deleted first. The schema is
    assembled from one dict per property and handed to
    ``client.collections.create_from_dict``, whose result is returned.
    """
    logger.info("#### createWebpageCollection() entered.")

    # Start from a clean slate: drop the collection when it already exists.
    if client.collections.exists("Documents"):
        client.collections.delete("Documents")

    # Title of the HTML document.
    titleProperty = {
        "name": "title",
        "dataType": ["text"],
        "description": "HTML doc title.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": True,
                "skip": False,
                "tokenization": "lowercase"
            }
        },
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2
            },
        }
    }

    # Full text content of the page.
    contentProperty = {
        "name": "content",
        "dataType": ["text"],
        "description": "HTML page content.",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": True,
                "tokenization": "whitespace"
            }
        }
    }

    documentsSchema = {
        "class": "Documents",
        "description": "For first attempt at loading a Weviate database.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizeClassName": False
            }
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [titleProperty, contentProperty]
    }
    return client.collections.create_from_dict(documentsSchema)
131
+
132
+
133
+ ######################################################################
134
+ # MAINLINE
135
+ #
136
logger.info("#### MAINLINE ENTERED.")

#pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
pathString = "/app/inputDocs"
chunks = []
webpageDocNames = []
page_contentArray = []
webpageChunks = []
webpageTitles = []
webpageChunksDocNames = []


######################################################
# Connect to the Weaviate vector database.
logger.info("#### Create Weaviate db client connection.")
client = weaviate.connect_to_custom(
    http_host="127.0.0.1",
    http_port=8080,
    http_secure=False,
    grpc_host="127.0.0.1",
    grpc_port=50051,
    grpc_secure=False
)
client.connect()

# Everything below uses the open connection. The try/finally guarantees the
# client is closed even when parsing, inserting or querying raises.
try:
    #######################################################
    # Read each text input file, parse it into a document,
    # chunk it, collect chunks and document name.
    logger.info("#### Read and chunk input text files.")

    # The tokenizer and splitter do not depend on the file being processed,
    # so build them once instead of once per file.
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
    logger.debug(f"### tokenizer: {tokenizer}")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

    htmlSuffix = ".html"
    for filename in os.listdir(pathString):
        logger.info(filename)
        path = Path(pathString + "/" + filename)

        # str.rstrip(".html") strips any trailing run of the characters
        # '.', 'h', 't', 'm', 'l' (e.g. "math.html" -> "ma"), so remove the
        # literal suffix instead.
        if filename.endswith(htmlSuffix):
            filename = filename[:-len(htmlSuffix)]
        webpageDocNames.append(filename)

        htmlLoader = BSHTMLLoader(path, "utf-8")
        htmlData = htmlLoader.load()

        title = htmlData[0].metadata['title']
        page_content = htmlData[0].page_content

        # Clean data. Remove multiple newlines, etc.
        page_content = re.sub(r'\n+', '\n', page_content)

        page_contentArray.append(page_content)
        webpageTitles.append(title)

        chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)

        chunks = []
        for chnk in chunksOnePage:
            logger.debug(f"#### chnk in file: {chnk}")
            chunks.append(chnk)
        logger.debug(f"chunks: {chunks}")
        webpageChunks.append(chunks)
        webpageChunksDocNames.append(filename + "Chunks")

        logger.debug(f"### filename, title: {filename}, {title}")

    logger.debug(f"### webpageDocNames: {webpageDocNames}")

    ######################################################
    # Create database webpage and chunks collections.
    wpCollection = createWebpageCollection()
    wpChunkCollection = createChunksCollection()

    ###########################################################
    # Create document and chunks objects in the database.
    logger.info("#### Create page/doc and chunk db objects.")
    for i, className in enumerate(webpageDocNames):
        title = webpageTitles[i]
        logger.debug(f"## className, title: {className}, {title}")
        page_content = page_contentArray[i]

        # Insert the document; the returned uuid links the chunks to it.
        wpCollectionObj_uuid = wpCollection.data.insert(
            {
                "name": className,
                "title": title,
                "content": page_content
            }
        )

        # Insert the chunks for the document.
        for i2, chunk in enumerate(webpageChunks[i]):
            wpChunkCollection.data.insert(
                {
                    "title": title,
                    "chunk": chunk,
                    "chunk_index": i2,
                    "references":
                    {
                        "webpage": wpCollectionObj_uuid
                    }
                }
            )

    ###############################################################################
    # text contains prompt for vector DB.
    text = "human-made computer cognitive ability"

    ###############################################################################
    # Initialize the sentence transformer and encode the query prompt.
    logger.info(f"#### Encode text query prompt to create vectors. {text}")
    model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')

    vector = model.encode(text)
    # near_vector is given a plain list of the vector's components.
    vectorList = list(vector)

    logger.debug("#### Print vectors.")
    logger.debug(f"vectorList: {vectorList[2]}")

    # Fetch chunks and print chunks.
    logger.info("#### Retrieve semchunks from db using vectors from prompt.")
    semChunks = wpChunkCollection.query.near_vector(
        near_vector=vectorList,
        distance=0.7,
        limit=3
    )
    logger.debug(f"### semChunks[0]: {semChunks}")

    # Print chunks, corresponding document and document title.
    logger.info("#### Print individual retrieved chunks.")
    for chunk in semChunks.objects:
        logger.info(f"#### chunk: {chunk}")
        webpage_uuid = chunk.properties['references']['webpage']
        logger.info(f"webpage_uuid: {webpage_uuid}")
        wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
        logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
finally:
    logger.info("#### Closing client db connection.")
    client.close()

logger.info("#### Program terminating.")