MVPilgrim committed on
Commit
ac5ddf6
·
1 Parent(s): e21b802
Files changed (5) hide show
  1. Dockerfile +5 -2
  2. semsearch.py +154 -3
  3. semsearch_Hld04_Working.py +271 -0
  4. semsearch_Hld05_Working.py +271 -0
  5. startup.sh +2 -0
Dockerfile CHANGED
@@ -52,6 +52,8 @@ ENV PATH="/app/text2vec-transformers:/app/text2vec-transformers/bin:${PATH}"
52
  #RUN pip install nltk==3.8.1 optimum==1.13.2 onnxruntime==1.16.1 onnx==1.14.1
53
  RUN ./custom_prerequisites.py
54
 
 
 
55
  ##############################
56
  RUN useradd -m -u 1000 user
57
 
@@ -62,5 +64,6 @@ VOLUME /data
62
  ##############################################################################
63
  # Start the weaviate vector database, text2vec-transformers and the semantic search app.
64
  #RUN /app/startup.sh
65
- #CMD ["/app/startup.sh"]
66
- RUN --mount=type=cache,target=/data,mode=777 /app/startup.sh
 
 
52
  #RUN pip install nltk==3.8.1 optimum==1.13.2 onnxruntime==1.16.1 onnx==1.14.1
53
  RUN ./custom_prerequisites.py
54
 
55
+ COPY Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q4_0.gguf /app
56
+
57
  ##############################
58
  RUN useradd -m -u 1000 user
59
 
 
64
  ##############################################################################
65
  # Start the weaviate vector database, text2vec-transformers and the semantic search app.
66
  #RUN /app/startup.sh
67
+ #RUN --mount=type=cache,target=/data,mode=777 /app/startup.sh
68
+ RUN --mount=type=cache,target=/data,mode=777 echo "### Mounting /data"
69
+ CMD ["/app/startup.sh"]
semsearch.py CHANGED
@@ -12,6 +12,12 @@ import os
12
  import re
13
  import logging
14
 
 
 
 
 
 
 
15
  weaviate_logger = logging.getLogger("httpx")
16
  weaviate_logger.setLevel(logging.WARNING)
17
 
@@ -34,6 +40,58 @@ webpageChunks = []
34
  webpageTitles = []
35
  webpageChunksDocNames = []
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  #######################################################
39
  # Read each text input file, parse it into a document,
@@ -83,9 +141,9 @@ client = weaviate.connect_to_custom(
83
  http_secure=False,
84
  grpc_host="127.0.0.1",
85
  grpc_port=50051,
86
- grpc_secure=False,
87
- read_timeout=600,
88
- write_timeout=90
89
  )
90
  client.connect()
91
 
@@ -265,6 +323,99 @@ for chunk in enumerate(semChunks.objects):
265
  wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
266
  logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  logger.info("#### Closing client db connection.")
269
  client.close()
270
 
 
12
  import re
13
  import logging
14
 
15
+ import llama_cpp
16
+ from llama_cpp import Llama
17
+ import ipywidgets as widgets
18
+ from IPython.display import display, clear_output
19
+
20
+
21
  weaviate_logger = logging.getLogger("httpx")
22
  weaviate_logger.setLevel(logging.WARNING)
23
 
 
40
  webpageTitles = []
41
  webpageChunksDocNames = []
42
 
43
+ #####################################################################
44
+ # Create UI widgets.
45
+ output_widget = widgets.Output()
46
+ with output_widget:
47
+ print("### Create widgets entered.")
48
+
49
+ systemTextArea = widgets.Textarea(
50
+ value='',
51
+ placeholder='Enter System Prompt.',
52
+ description='Sys Prompt: ',
53
+ disabled=False,
54
+ layout=widgets.Layout(width='300px', height='80px')
55
+ )
56
+
57
+ userTextArea = widgets.Textarea(
58
+ value='',
59
+ placeholder='Enter User Prompt.',
60
+ description='User Prompt: ',
61
+ disabled=False,
62
+ layout=widgets.Layout(width='435px', height='110px')
63
+ )
64
+
65
+ ragPromptTextArea = widgets.Textarea(
66
+ value='',
67
+ placeholder='App generated prompt with RAG information.',
68
+ description='RAG Prompt: ',
69
+ disabled=False,
70
+ layout=widgets.Layout(width='580px', height='180px')
71
+ )
72
+
73
+ responseTextArea = widgets.Textarea(
74
+ value='',
75
+ placeholder='LLM generated response.',
76
+ description='LLM Resp: ',
77
+ disabled=False,
78
+ layout=widgets.Layout(width='780px', height='200px')
79
+ )
80
+
81
+ selectRag = widgets.Checkbox(
82
+ value=False,
83
+ description='Use RAG',
84
+ disabled=False
85
+ )
86
+
87
+ submitButton = widgets.Button(
88
+ description='Run Model.',
89
+ disabled=False,
90
+ button_style='', # 'success', 'info', 'warning', 'danger' or ''
91
+ tooltip='Click',
92
+ icon='check' # (FontAwesome names without the `fa-` prefix)
93
+ )
94
+
95
 
96
  #######################################################
97
  # Read each text input file, parse it into a document,
 
141
  http_secure=False,
142
  grpc_host="127.0.0.1",
143
  grpc_port=50051,
144
+ grpc_secure=False
145
+ #read_timeout=600,
146
+ #write_timeout=90
147
  )
148
  client.connect()
149
 
 
323
  wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
324
  logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
325
 
326
+
327
+
328
+ ####################################################################
329
+ #
330
+ collection = client.collections.get("Chunks")
331
+ #model = SentenceTransformer('../multi-qa-MiniLM-L6-cos-v1')
332
+
333
+ #################################################################
334
+ # Initialize the LLM.
335
+ model_path = "/app/llama-2-7b-chat.Q4_0.gguf"
336
+ llm = Llama(model_path,
337
+ #*,
338
+ n_gpu_layers=0,
339
+ split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
340
+ main_gpu=0,
341
+ tensor_split=None,
342
+ vocab_only=False,
343
+ use_mmap=True,
344
+ use_mlock=False,
345
+ kv_overrides=None,
346
+ seed=llama_cpp.LLAMA_DEFAULT_SEED,
347
+ n_ctx=512,
348
+ n_batch=512,
349
+ n_threads=8,
350
+ n_threads_batch=16,
351
+ rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
352
+ pooling_type=llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
353
+ rope_freq_base=0.0,
354
+ rope_freq_scale=0.0,
355
+ yarn_ext_factor=-1.0,
356
+ yarn_attn_factor=1.0,
357
+ yarn_beta_fast=32.0,
358
+ yarn_beta_slow=1.0,
359
+ yarn_orig_ctx=0,
360
+ logits_all=False,
361
+ embedding=False,
362
+ offload_kqv=True,
363
+ last_n_tokens_size=64,
364
+ lora_base=None,
365
+ lora_scale=1.0,
366
+ lora_path=None,
367
+ numa=False,
368
+ chat_format=None,
369
+ chat_handler=None,
370
+ draft_model=None,
371
+ tokenizer=None,
372
+ type_k=None,
373
+ type_v=None,
374
+ verbose=True
375
+ )
376
+
377
+
378
+ display(systemTextArea)
379
+ display(userTextArea)
380
+ display(ragPromptTextArea)
381
+ display(responseTextArea)
382
+ display(selectRag)
383
+ display(submitButton)
384
+
385
def setPrompt(pprompt,ragFlag):
    """Assemble the Llama-2 chat prompt sent to the model.

    Args:
        pprompt: Text taken from the user prompt widget.
        ragFlag: When truthy, the text returned by ``setRagPrompt(pprompt)``
            is appended to the user text on a new line.

    Returns:
        The prompt string: the user text wrapped in the ``[INST]`` /
        ``<<SYS>>`` template, with ``systemTextArea.value`` as system prompt.
    """
    print("\n### setPrompt() entered. ragFlag: ",ragFlag)
    if ragFlag:
        # NOTE(review): setRagPrompt is not defined in the visible part of
        # this file; confirm it exists before enabling the RAG checkbox,
        # otherwise this branch raises NameError.
        ragPrompt = setRagPrompt(pprompt)
        userPrompt = pprompt + "\n" + ragPrompt
    else:
        userPrompt = pprompt
    # NOTE(review): the template is applied to both branches here; the
    # flattened source does not show whether it was meant for the non-RAG
    # branch only -- confirm.
    # The system section must be closed with "<</SYS>>" (two opening angle
    # brackets); a lone "</SYS>>" is not a tag the Llama-2 chat format knows.
    return f""" <s> [INST] <<SYS>> {systemTextArea.value} <</SYS>> Q: {userPrompt} A: [/INST]"""
395
+
396
def runModel(prompt):
    """Run one completion on the module-level ``llm`` and display the result.

    Args:
        prompt: Fully assembled prompt text handed to
            ``llm.create_completion``.

    The text of the first returned choice is written to
    ``responseTextArea.value``; nothing is returned.
    """
    completion = llm.create_completion(
        prompt,
        max_tokens=4096,  # upper bound on the number of generated tokens
        # stop=["Q:", "\n"] would end generation before a new question starts
        echo=False,  # do not repeat the prompt in the returned text
    )
    first_choice = completion["choices"][0]
    responseTextArea.value = first_choice["text"]
404
+
405
def on_submitButton_clicked(b):
    """Handle a click on the submit button: build the prompt and run the model.

    Args:
        b: Argument supplied by ``submitButton.on_click``; not used here.

    Clears the output widget and the RAG/response text areas, builds the
    prompt from ``userTextArea`` and ``selectRag``, then calls ``runModel``.
    """
    with output_widget:
        clear_output(wait=True)
        ragPromptTextArea.value = ""
        responseTextArea.value = ""
        # Use the module's ``logger``; no name ``log`` is defined in this file,
        # so calling it would raise NameError on every click.
        logger.debug("### selectRag: %s", selectRag.value)
        prompt = setPrompt(userTextArea.value,selectRag.value)
        logger.debug("### prompt: %s", prompt)
        runModel(prompt)
414
+
415
+ submitButton.on_click(on_submitButton_clicked)
416
+ display(output_widget)
417
+
418
+
419
  logger.info("#### Closing client db connection.")
420
  client.close()
421
 
semsearch_Hld04_Working.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import weaviate
2
+
3
+ from sentence_transformers import SentenceTransformer
4
+ from langchain_community.document_loaders import BSHTMLLoader
5
+ from pathlib import Path
6
+ from lxml import html
7
+ import logging
8
+ from semantic_text_splitter import HuggingFaceTextSplitter
9
+ from tokenizers import Tokenizer
10
+ import json
11
+ import os
12
+ import re
13
+ import logging
14
+
15
+ weaviate_logger = logging.getLogger("httpx")
16
+ weaviate_logger.setLevel(logging.WARNING)
17
+
18
+ logger = logging.getLogger(__name__)
19
+ logging.basicConfig(level=logging.INFO)
20
+
21
+
22
+
23
+ ######################################################################
24
+ # MAINLINE
25
+ #
26
+ logger.info("#### MAINLINE ENTERED.")
27
+
28
+ #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
29
+ pathString = "/app/inputDocs"
30
+ chunks = []
31
+ webpageDocNames = []
32
+ page_contentArray = []
33
+ webpageChunks = []
34
+ webpageTitles = []
35
+ webpageChunksDocNames = []
36
+
37
+
38
+ #######################################################
39
+ # Read each text input file, parse it into a document,
40
+ # chunk it, collect chunks and document name.
41
+ logger.info("#### Read and chunk input text files.")
42
# The tokenizer and splitter do not depend on the file being processed, so
# build them once instead of reloading the tokenizer for every document.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
logger.debug(f"### tokenizer: {tokenizer}")
splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

# For each input file: load the HTML, record its name, title and cleaned
# text, split the text into chunks and record the chunk list.
for filename in os.listdir(pathString):
    logger.info(filename)
    path = Path(pathString) / filename
    # Remove only a trailing ".html" extension. str.rstrip(".html") strips any
    # run of the characters '.', 'h', 't', 'm', 'l' from the end and would
    # turn "math.html" into "ma".
    filename = re.sub(r"\.html$", "", filename)
    webpageDocNames.append(filename)
    htmlLoader = BSHTMLLoader(path,"utf-8")
    htmlData = htmlLoader.load()

    title = htmlData[0].metadata['title']
    page_content = htmlData[0].page_content

    # Clean data: collapse runs of newlines into a single newline.
    page_content = re.sub(r'\n+', '\n',page_content)

    page_contentArray.append(page_content)
    webpageTitles.append(title)

    # Materialize the chunks for this page so they can be logged and stored.
    chunks = list(splitter.chunks(page_content, chunk_capacity=50))
    for chnk in chunks:
        logger.debug(f"#### chnk in file: {chnk}")
    logger.debug(f"chunks: {chunks}")
    webpageChunks.append(chunks)
    webpageChunksDocNames.append(filename + "Chunks")

    logger.debug(f"### filename, title: {filename}, {title}")
73
+
74
+ logger.debug(f"### webpageDocNames: {webpageDocNames}")
75
+
76
+
77
+ ######################################################
78
+ # Connect to the Weaviate vector database.
79
+ logger.info("#### Create Weaviate db client connection.")
80
+ client = weaviate.connect_to_custom(
81
+ http_host="127.0.0.1",
82
+ http_port=8080,
83
+ http_secure=False,
84
+ grpc_host="127.0.0.1",
85
+ grpc_port=50051,
86
+ grpc_secure=False
87
+ #read_timeout=600,
88
+ #write_timeout=90
89
+ )
90
+ client.connect()
91
+
92
+
93
+ ######################################################
94
+ # Create database webpage and chunks collections.
95
+ #wpCollection = createWebpageCollection()
96
+ #wpChunkCollection = createChunksCollection()
97
+ logger.info("#### createWebpageCollection() entered.")
98
+ if client.collections.exists("Documents"):
99
+ client.collections.delete("Documents")
100
+
101
+ class_obj = {
102
+ "class": "Documents",
103
+ "description": "For first attempt at loading a Weviate database.",
104
+ "vectorizer": "text2vec-transformers",
105
+ "moduleConfig": {
106
+ "text2vec-transformers": {
107
+ "vectorizeClassName": False
108
+ }
109
+ },
110
+ "vectorIndexType": "hnsw",
111
+ "vectorIndexConfig": {
112
+ "distance": "cosine",
113
+ },
114
+ "properties": [
115
+ {
116
+ "name": "title",
117
+ "dataType": ["text"],
118
+ "description": "HTML doc title.",
119
+ "vectorizer": "text2vec-transformers",
120
+ "moduleConfig": {
121
+ "text2vec-transformers": {
122
+ "vectorizePropertyName": True,
123
+ "skip": False,
124
+ "tokenization": "lowercase"
125
+ }
126
+ },
127
+ "invertedIndexConfig": {
128
+ "bm25": {
129
+ "b": 0.75,
130
+ "k1": 1.2
131
+ },
132
+ }
133
+ },
134
+ {
135
+ "name": "content",
136
+ "dataType": ["text"],
137
+ "description": "HTML page content.",
138
+ "moduleConfig": {
139
+ "text2vec-transformers": {
140
+ "vectorizePropertyName": True,
141
+ "tokenization": "whitespace"
142
+ }
143
+ }
144
+ }
145
+ ]
146
+ }
147
+ wpCollection = client.collections.create_from_dict(class_obj)
148
+
149
+ logger.info("#### createChunksCollection() entered.")
150
+ if client.collections.exists("Chunks"):
151
+ client.collections.delete("Chunks")
152
+
153
+ class_obj = {
154
+ "class": "Chunks",
155
+ "description": "Collection for document chunks.",
156
+ "vectorizer": "text2vec-transformers",
157
+ "moduleConfig": {
158
+ "text2vec-transformers": {
159
+ "vectorizeClassName": True
160
+ }
161
+ },
162
+ "vectorIndexType": "hnsw",
163
+ "vectorIndexConfig": {
164
+ "distance": "cosine",
165
+ },
166
+ "properties": [
167
+ {
168
+ "name": "chunk",
169
+ "dataType": ["text"],
170
+ "description": "Single webpage chunk.",
171
+ "vectorizer": "text2vec-transformers",
172
+ "moduleConfig": {
173
+ "text2vec-transformers": {
174
+ "vectorizePropertyName": False,
175
+ "skip": False,
176
+ "tokenization": "lowercase"
177
+ }
178
+ }
179
+ },
180
+ {
181
+ "name": "chunk_index",
182
+ "dataType": ["int"]
183
+ },
184
+ {
185
+ "name": "webpage",
186
+ "dataType": ["Documents"],
187
+ "description": "Webpage content chunks.",
188
+
189
+ "invertedIndexConfig": {
190
+ "bm25": {
191
+ "b": 0.75,
192
+ "k1": 1.2
193
+ }
194
+ }
195
+ }
196
+ ]
197
+ }
198
+ wpChunkCollection = client.collections.create_from_dict(class_obj)
199
+
200
+
201
+ ###########################################################
202
+ # Create document and chunks objects in the database.
203
+ logger.info("#### Create page/doc and chunk db objects.")
204
+ for i, className in enumerate(webpageDocNames):
205
+ title = webpageTitles[i]
206
+ logger.debug(f"## className, title: {className}, {title}")
207
+ # Create Webpage Object
208
+ page_content = page_contentArray[i]
209
+ # Insert the document.
210
+ wpCollectionObj_uuid = wpCollection.data.insert(
211
+ {
212
+ "name": className,
213
+ "title": title,
214
+ "content": page_content
215
+ }
216
+ )
217
+
218
+ # Insert the chunks for the document.
219
+ for i2, chunk in enumerate(webpageChunks[i]):
220
+ chunk_uuid = wpChunkCollection.data.insert(
221
+ {
222
+ "title": title,
223
+ "chunk": chunk,
224
+ "chunk_index": i2,
225
+ "references":
226
+ {
227
+ "webpage": wpCollectionObj_uuid
228
+ }
229
+ }
230
+ )
231
+
232
+ ###############################################################################
233
+ # text contains prompt for vector DB.
234
+ text = "human-made computer cognitive ability"
235
+
236
+
237
+ ###############################################################################
238
+ # Initialize the sentence transformer and encode the query prompt.
239
+ logger.info(f"#### Encode text query prompt to create vectors. {text}")
240
+ model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')
241
+
242
+ vector = model.encode(text)
243
+ vectorList = []
244
+
245
+ logger.debug("#### Print vectors.")
246
+ for vec in vector:
247
+ vectorList.append(vec)
248
+ logger.debug(f"vectorList: {vectorList[2]}")
249
+
250
+ # Fetch chunks and print chunks.
251
+ logger.info("#### Retrieve semchunks from db using vectors from prompt.")
252
+ semChunks = wpChunkCollection.query.near_vector(
253
+ near_vector=vectorList,
254
+ distance=0.7,
255
+ limit=3
256
+ )
257
+ logger.debug(f"### semChunks[0]: {semChunks}")
258
+
259
+ # Print chunks, corresponding document and document title.
260
+ logger.info("#### Print individual retrieved chunks.")
261
+ for chunk in enumerate(semChunks.objects):
262
+ logger.info(f"#### chunk: {chunk}")
263
+ webpage_uuid = chunk[1].properties['references']['webpage']
264
+ logger.info(f"webpage_uuid: {webpage_uuid}")
265
+ wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
266
+ logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
267
+
268
+ logger.info("#### Closing client db connection.")
269
+ client.close()
270
+
271
+ logger.info("#### Program terminating.")
semsearch_Hld05_Working.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import weaviate
2
+
3
+ from sentence_transformers import SentenceTransformer
4
+ from langchain_community.document_loaders import BSHTMLLoader
5
+ from pathlib import Path
6
+ from lxml import html
7
+ import logging
8
+ from semantic_text_splitter import HuggingFaceTextSplitter
9
+ from tokenizers import Tokenizer
10
+ import json
11
+ import os
12
+ import re
13
+ import logging
14
+
15
+ weaviate_logger = logging.getLogger("httpx")
16
+ weaviate_logger.setLevel(logging.WARNING)
17
+
18
+ logger = logging.getLogger(__name__)
19
+ logging.basicConfig(level=logging.INFO)
20
+
21
+
22
+
23
+ ######################################################################
24
+ # MAINLINE
25
+ #
26
+ logger.info("#### MAINLINE ENTERED.")
27
+
28
+ #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
29
+ pathString = "/app/inputDocs"
30
+ chunks = []
31
+ webpageDocNames = []
32
+ page_contentArray = []
33
+ webpageChunks = []
34
+ webpageTitles = []
35
+ webpageChunksDocNames = []
36
+
37
+
38
+ #######################################################
39
+ # Read each text input file, parse it into a document,
40
+ # chunk it, collect chunks and document name.
41
+ logger.info("#### Read and chunk input text files.")
42
# The tokenizer and splitter do not depend on the file being processed, so
# build them once instead of reloading the tokenizer for every document.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
logger.debug(f"### tokenizer: {tokenizer}")
splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

# For each input file: load the HTML, record its name, title and cleaned
# text, split the text into chunks and record the chunk list.
for filename in os.listdir(pathString):
    logger.info(filename)
    path = Path(pathString) / filename
    # Remove only a trailing ".html" extension. str.rstrip(".html") strips any
    # run of the characters '.', 'h', 't', 'm', 'l' from the end and would
    # turn "math.html" into "ma".
    filename = re.sub(r"\.html$", "", filename)
    webpageDocNames.append(filename)
    htmlLoader = BSHTMLLoader(path,"utf-8")
    htmlData = htmlLoader.load()

    title = htmlData[0].metadata['title']
    page_content = htmlData[0].page_content

    # Clean data: collapse runs of newlines into a single newline.
    page_content = re.sub(r'\n+', '\n',page_content)

    page_contentArray.append(page_content)
    webpageTitles.append(title)

    # Materialize the chunks for this page so they can be logged and stored.
    chunks = list(splitter.chunks(page_content, chunk_capacity=50))
    for chnk in chunks:
        logger.debug(f"#### chnk in file: {chnk}")
    logger.debug(f"chunks: {chunks}")
    webpageChunks.append(chunks)
    webpageChunksDocNames.append(filename + "Chunks")

    logger.debug(f"### filename, title: {filename}, {title}")
73
+
74
+ logger.debug(f"### webpageDocNames: {webpageDocNames}")
75
+
76
+
77
+ ######################################################
78
+ # Connect to the Weaviate vector database.
79
+ logger.info("#### Create Weaviate db client connection.")
80
+ client = weaviate.connect_to_custom(
81
+ http_host="127.0.0.1",
82
+ http_port=8080,
83
+ http_secure=False,
84
+ grpc_host="127.0.0.1",
85
+ grpc_port=50051,
86
+ grpc_secure=False
87
+ #read_timeout=600,
88
+ #write_timeout=90
89
+ )
90
+ client.connect()
91
+
92
+
93
+ ######################################################
94
+ # Create database webpage and chunks collections.
95
+ #wpCollection = createWebpageCollection()
96
+ #wpChunkCollection = createChunksCollection()
97
+ logger.info("#### createWebpageCollection() entered.")
98
+ if client.collections.exists("Documents"):
99
+ client.collections.delete("Documents")
100
+
101
+ class_obj = {
102
+ "class": "Documents",
103
+ "description": "For first attempt at loading a Weviate database.",
104
+ "vectorizer": "text2vec-transformers",
105
+ "moduleConfig": {
106
+ "text2vec-transformers": {
107
+ "vectorizeClassName": False
108
+ }
109
+ },
110
+ "vectorIndexType": "hnsw",
111
+ "vectorIndexConfig": {
112
+ "distance": "cosine",
113
+ },
114
+ "properties": [
115
+ {
116
+ "name": "title",
117
+ "dataType": ["text"],
118
+ "description": "HTML doc title.",
119
+ "vectorizer": "text2vec-transformers",
120
+ "moduleConfig": {
121
+ "text2vec-transformers": {
122
+ "vectorizePropertyName": True,
123
+ "skip": False,
124
+ "tokenization": "lowercase"
125
+ }
126
+ },
127
+ "invertedIndexConfig": {
128
+ "bm25": {
129
+ "b": 0.75,
130
+ "k1": 1.2
131
+ },
132
+ }
133
+ },
134
+ {
135
+ "name": "content",
136
+ "dataType": ["text"],
137
+ "description": "HTML page content.",
138
+ "moduleConfig": {
139
+ "text2vec-transformers": {
140
+ "vectorizePropertyName": True,
141
+ "tokenization": "whitespace"
142
+ }
143
+ }
144
+ }
145
+ ]
146
+ }
147
+ wpCollection = client.collections.create_from_dict(class_obj)
148
+
149
+ logger.info("#### createChunksCollection() entered.")
150
+ if client.collections.exists("Chunks"):
151
+ client.collections.delete("Chunks")
152
+
153
+ class_obj = {
154
+ "class": "Chunks",
155
+ "description": "Collection for document chunks.",
156
+ "vectorizer": "text2vec-transformers",
157
+ "moduleConfig": {
158
+ "text2vec-transformers": {
159
+ "vectorizeClassName": True
160
+ }
161
+ },
162
+ "vectorIndexType": "hnsw",
163
+ "vectorIndexConfig": {
164
+ "distance": "cosine",
165
+ },
166
+ "properties": [
167
+ {
168
+ "name": "chunk",
169
+ "dataType": ["text"],
170
+ "description": "Single webpage chunk.",
171
+ "vectorizer": "text2vec-transformers",
172
+ "moduleConfig": {
173
+ "text2vec-transformers": {
174
+ "vectorizePropertyName": False,
175
+ "skip": False,
176
+ "tokenization": "lowercase"
177
+ }
178
+ }
179
+ },
180
+ {
181
+ "name": "chunk_index",
182
+ "dataType": ["int"]
183
+ },
184
+ {
185
+ "name": "webpage",
186
+ "dataType": ["Documents"],
187
+ "description": "Webpage content chunks.",
188
+
189
+ "invertedIndexConfig": {
190
+ "bm25": {
191
+ "b": 0.75,
192
+ "k1": 1.2
193
+ }
194
+ }
195
+ }
196
+ ]
197
+ }
198
+ wpChunkCollection = client.collections.create_from_dict(class_obj)
199
+
200
+
201
+ ###########################################################
202
+ # Create document and chunks objects in the database.
203
+ logger.info("#### Create page/doc and chunk db objects.")
204
+ for i, className in enumerate(webpageDocNames):
205
+ title = webpageTitles[i]
206
+ logger.debug(f"## className, title: {className}, {title}")
207
+ # Create Webpage Object
208
+ page_content = page_contentArray[i]
209
+ # Insert the document.
210
+ wpCollectionObj_uuid = wpCollection.data.insert(
211
+ {
212
+ "name": className,
213
+ "title": title,
214
+ "content": page_content
215
+ }
216
+ )
217
+
218
+ # Insert the chunks for the document.
219
+ for i2, chunk in enumerate(webpageChunks[i]):
220
+ chunk_uuid = wpChunkCollection.data.insert(
221
+ {
222
+ "title": title,
223
+ "chunk": chunk,
224
+ "chunk_index": i2,
225
+ "references":
226
+ {
227
+ "webpage": wpCollectionObj_uuid
228
+ }
229
+ }
230
+ )
231
+
232
+ ###############################################################################
233
+ # text contains prompt for vector DB.
234
+ text = "human-made computer cognitive ability"
235
+
236
+
237
+ ###############################################################################
238
+ # Initialize the sentence transformer and encode the query prompt.
239
+ logger.info(f"#### Encode text query prompt to create vectors. {text}")
240
+ model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')
241
+
242
+ vector = model.encode(text)
243
+ vectorList = []
244
+
245
+ logger.debug("#### Print vectors.")
246
+ for vec in vector:
247
+ vectorList.append(vec)
248
+ logger.debug(f"vectorList: {vectorList[2]}")
249
+
250
+ # Fetch chunks and print chunks.
251
+ logger.info("#### Retrieve semchunks from db using vectors from prompt.")
252
+ semChunks = wpChunkCollection.query.near_vector(
253
+ near_vector=vectorList,
254
+ distance=0.7,
255
+ limit=3
256
+ )
257
+ logger.debug(f"### semChunks[0]: {semChunks}")
258
+
259
+ # Print chunks, corresponding document and document title.
260
+ logger.info("#### Print individual retrieved chunks.")
261
+ for chunk in enumerate(semChunks.objects):
262
+ logger.info(f"#### chunk: {chunk}")
263
+ webpage_uuid = chunk[1].properties['references']['webpage']
264
+ logger.info(f"webpage_uuid: {webpage_uuid}")
265
+ wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
266
+ logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
267
+
268
+ logger.info("#### Closing client db connection.")
269
+ client.close()
270
+
271
+ logger.info("#### Program terminating.")
startup.sh CHANGED
@@ -4,6 +4,7 @@ echo "#### startup.sh entered."
4
  #echo "### ls -l /app"; ls -l /app
5
  #echo "### ls -l /app/weaviate"; ls -l /app/weaviate
6
  #echo "### ls -l /app/text2vec-transformers"; ls -l /app/text2vec-transformers
 
7
 
8
  ################################################
9
  # Start text2vec-transformers
@@ -21,6 +22,7 @@ ln -s /data/var/lib/weaviate /var/lib/weaviate
21
  echo "### ls -l /var/lib/weaviate"; ls -l /var/lib/weaviate
22
  echo "### ls -l /data"; ls -l /data
23
  echo "### ls -l /data/var/lib/weaviate"; ls -l /data/var/lib/weaviate
 
24
  export AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true \
25
  PERSISTENCE_DATA_PATH=/var/lib/weaviate \
26
  DEFAULT_VECTORIZER_MODULE=text2vec-transformers \
 
4
  #echo "### ls -l /app"; ls -l /app
5
  #echo "### ls -l /app/weaviate"; ls -l /app/weaviate
6
  #echo "### ls -l /app/text2vec-transformers"; ls -l /app/text2vec-transformers
7
+ echo "### ls -l /data"; ls -l /data
8
 
9
  ################################################
10
  # Start text2vec-transformers
 
22
  echo "### ls -l /var/lib/weaviate"; ls -l /var/lib/weaviate
23
  echo "### ls -l /data"; ls -l /data
24
  echo "### ls -l /data/var/lib/weaviate"; ls -l /data/var/lib/weaviate
25
+
26
  export AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true \
27
  PERSISTENCE_DATA_PATH=/var/lib/weaviate \
28
  DEFAULT_VECTORIZER_MODULE=text2vec-transformers \