MVPilgrim committed on
Commit
6161c48
·
1 Parent(s): 1dbdbf0
Files changed (3) hide show
  1. Dockerfile +2 -1
  2. semsearchDbgUI.py +437 -0
  3. startupDbgUI.sh +7 -0
Dockerfile CHANGED
@@ -68,4 +68,5 @@ RUN useradd -m -u 1000 user
68
  #RUN /app/startup.sh
69
  #RUN --mount=type=cache,target=/data,mode=777 /app/startup.sh
70
  #RUN --mount=type=cache,target=/data,mode=777 echo "### Mounting /data"
71
- CMD ["/app/startup.sh"]
 
 
68
  #RUN /app/startup.sh
69
  #RUN --mount=type=cache,target=/data,mode=777 /app/startup.sh
70
  #RUN --mount=type=cache,target=/data,mode=777 echo "### Mounting /data"
71
+ #CMD ["/app/startup.sh"]
72
+ CMD ["/app/startupDbgUI.sh"]
semsearchDbgUI.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import weaviate
2
+ from weaviate.connect import ConnectionParams
3
+ from weaviate.classes.init import AdditionalConfig, Timeout
4
+
5
+ from sentence_transformers import SentenceTransformer
6
+ from langchain_community.document_loaders import BSHTMLLoader
7
+ from pathlib import Path
8
+ from lxml import html
9
+ import logging
10
+ from semantic_text_splitter import HuggingFaceTextSplitter
11
+ from tokenizers import Tokenizer
12
+ import json
13
+ import os
14
+ import re
15
+ import logging
16
+
17
+ import llama_cpp
18
+ from llama_cpp import Llama
19
+
20
+ import ipywidgets as widgets
21
+ from IPython.display import display, clear_output
22
+
23
+
24
# Root logging config first: INFO so the pipeline's progress messages show up.
logging.basicConfig(level=logging.INFO)

# Module-level logger used throughout this script.
logger = logging.getLogger(__name__)

# The weaviate client chats over httpx; silence its per-request INFO noise.
weaviate_logger = logging.getLogger("httpx")
weaviate_logger.setLevel(logging.WARNING)
29
+
30
+
31
+
32
######################################################################
# MAINLINE
#
logger.info("#### MAINLINE ENTERED.")

# Input directory of downloaded HTML pages.
# (Original dev location: /Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/
#  DownloadedWebSites/LLMPOC_HTML)
pathString = "/app/inputDocs"

# Parallel per-document accumulators, all filled by the ingest loop below:
#   webpageDocNames       document names (filename minus extension)
#   page_contentArray     cleaned page text
#   webpageChunks         list of chunk lists, one per document
#   webpageTitles         HTML <title> values
#   webpageChunksDocNames derived "<name>Chunks" labels
(chunks, webpageDocNames, page_contentArray,
 webpageChunks, webpageTitles, webpageChunksDocNames) = ([], [], [], [], [], [])
45
+
46
#####################################################################
# Create UI widgets.
output_widget = widgets.Output()
with output_widget:
    print("### Create widgets entered.")


def _make_textarea(desc, hint, width, height):
    # Shared builder: the four text areas differ only in label/placeholder/size.
    return widgets.Textarea(
        value='',
        placeholder=hint,
        description=desc,
        disabled=False,
        layout=widgets.Layout(width=width, height=height)
    )


systemTextArea = _make_textarea('Sys Prompt: ', 'Enter System Prompt.',
                                '300px', '80px')
userTextArea = _make_textarea('User Prompt: ', 'Enter User Prompt.',
                              '435px', '110px')
ragPromptTextArea = _make_textarea('RAG Prompt: ',
                                   'App generated prompt with RAG information.',
                                   '580px', '180px')
responseTextArea = _make_textarea('LLM Resp: ', 'LLM generated response.',
                                  '780px', '200px')

selectRag = widgets.Checkbox(
    value=False,
    description='Use RAG',
    disabled=False
)

submitButton = widgets.Button(
    description='Run Model.',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click',
    icon='check'      # FontAwesome name without the `fa-` prefix
)
97
+
98
+
99
######################################################
# Connect to the Weaviate vector database.
logger.info("#### Create Weaviate db client connection.")

# Local Weaviate instance: REST on 8080, gRPC on 50051, no TLS on either.
_conn_params = ConnectionParams.from_params(
    http_host="localhost",
    http_port="8080",
    http_secure=False,
    grpc_host="localhost",
    grpc_port="50051",
    grpc_secure=False
)

# Generous query/insert timeouts (seconds): bulk inserts and vector queries
# against a cold instance can be slow.
_extra_config = AdditionalConfig(
    timeout=Timeout(init=60, query=1800, insert=1800)
)

client = weaviate.WeaviateClient(
    connection_params=_conn_params,
    additional_config=_extra_config
)
client.connect()
116
+
117
+
118
#######################################################
# Read each text input file, parse it into a document,
# chunk it, collect chunks and document name.
# Only runs when either collection is missing (i.e. the db is not yet loaded).
logger.info("#### Read and chunk input text files.")
# BUGFIX: the second test previously repeated "Documents"; the code below
# creates/loads both a "Documents" and a "Chunks" collection, so check both.
if not client.collections.exists("Documents") or not client.collections.exists("Chunks"):
    # Tokenizer and splitter are loop-invariant — build them once, not per file.
    max_tokens = 1000
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
    logger.debug(f"### tokenizer: {tokenizer}")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

    for filename in os.listdir(pathString):
        logger.info(filename)
        path = Path(pathString + "/" + filename)
        # BUGFIX: str.rstrip(".html") strips any trailing run of the characters
        # {., h, t, m, l} (e.g. "math.html" -> "ma"); remove the literal suffix.
        if filename.endswith(".html"):
            filename = filename[:-len(".html")]
        webpageDocNames.append(filename)

        htmlLoader = BSHTMLLoader(path, "utf-8")
        htmlData = htmlLoader.load()

        title = htmlData[0].metadata['title']
        page_content = htmlData[0].page_content

        # Clean data: collapse runs of newlines into one.
        page_content = re.sub(r'\n+', '\n', page_content)

        page_contentArray.append(page_content)
        webpageTitles.append(title)

        # chunk_capacity=50 tokens per chunk (small, retrieval-sized pieces).
        chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)

        chunks = []
        for chnk in chunksOnePage:
            logger.debug(f"#### chnk in file: {chnk}")
            chunks.append(chnk)
        logger.debug(f"chunks: {chunks}")
        webpageChunks.append(chunks)
        webpageChunksDocNames.append(filename + "Chunks")

        logger.debug(f"### filename, title: {filename}, {title}")
    logger.debug(f"### webpageDocNames: {webpageDocNames}")
155
+
156
+
157
+
158
######################################################
# Create database webpage and chunks collections.
# Schemas are supplied as plain dicts via create_from_dict; both use the
# text2vec-transformers vectorizer with an HNSW index and cosine distance.
#wpCollection = createWebpageCollection()
#wpChunkCollection = createChunksCollection()
logger.info("#### createWebpageCollection() entered.")
if not client.collections.exists("Documents"):
    #client.collections.delete("Documents")
    class_obj = {
        "class": "Documents",
        # NOTE(review): "Weviate" typo below is stored in the schema as-is.
        "description": "For first attempt at loading a Weviate database.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                # Don't fold the class name into the document vector.
                "vectorizeClassName": False
            }
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [
            {
                "name": "title",
                "dataType": ["text"],
                "description": "HTML doc title.",
                "vectorizer": "text2vec-transformers",
                "moduleConfig": {
                    "text2vec-transformers": {
                        "vectorizePropertyName": True,
                        "skip": False,
                        "tokenization": "lowercase"
                    }
                },
                # Standard BM25 constants for keyword scoring on titles.
                "invertedIndexConfig": {
                    "bm25": {
                        "b": 0.75,
                        "k1": 1.2
                    },
                }
            },
            {
                "name": "content",
                "dataType": ["text"],
                "description": "HTML page content.",
                "moduleConfig": {
                    "text2vec-transformers": {
                        "vectorizePropertyName": True,
                        "tokenization": "whitespace"
                    }
                }
            }
        ]
    }
    # Handle used below to insert document objects.
    wpCollection = client.collections.create_from_dict(class_obj)

logger.info("#### createChunksCollection() entered.")
if not client.collections.exists("Chunks"):
    #client.collections.delete("Chunks")
    class_obj = {
        "class": "Chunks",
        "description": "Collection for document chunks.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                # NOTE(review): True here but False for Documents above —
                # confirm the asymmetry is intentional.
                "vectorizeClassName": True
            }
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [
            {
                "name": "chunk",
                "dataType": ["text"],
                "description": "Single webpage chunk.",
                "vectorizer": "text2vec-transformers",
                "moduleConfig": {
                    "text2vec-transformers": {
                        "vectorizePropertyName": False,
                        "skip": False,
                        "tokenization": "lowercase"
                    }
                }
            },
            {
                # Position of the chunk within its source page.
                "name": "chunk_index",
                "dataType": ["int"]
            },
            {
                # Cross-reference back to the owning Documents object.
                "name": "webpage",
                "dataType": ["Documents"],
                "description": "Webpage content chunks.",

                "invertedIndexConfig": {
                    "bm25": {
                        "b": 0.75,
                        "k1": 1.2
                    }
                }
            }
        ]
    }
    # Handle used below to insert chunk objects.
    wpChunkCollection = client.collections.create_from_dict(class_obj)
262
+
263
+
264
###########################################################
# Create document and chunks objects in the database.
# NOTE(review): this guard re-checks exists("Documents") AFTER the collection
# was (possibly) created just above, at which point it is True — so on a fresh
# run this insert block appears to be skipped, and wpCollection is only bound
# when the collection was newly created. Verify the intended load sequence.
if not client.collections.exists("Documents") :
    logger.info("#### Create page/doc db objects.")
    # webpageDocNames / webpageTitles / page_contentArray / webpageChunks are
    # parallel arrays filled by the ingest loop; index i selects one document.
    for i, className in enumerate(webpageDocNames):
        title = webpageTitles[i]
        logger.debug(f"## className, title: {className}, {title}")
        # Create Webpage Object
        page_content = page_contentArray[i]
        # Insert the document.
        wpCollectionObj_uuid = wpCollection.data.insert(
            {
                "name": className,
                "title": title,
                "content": page_content
            }
        )

        # NOTE(review): same post-creation exists() concern as above, for Chunks.
        if not client.collections.exists("Chunks") :
            logger.info("#### Create chunk db objects.")
            # Insert the chunks for the document.
            for i2, chunk in enumerate(webpageChunks[i]):
                # Each chunk carries its index and a reference (by UUID) back
                # to the parent document inserted above.
                chunk_uuid = wpChunkCollection.data.insert(
                    {
                        "title": title,
                        "chunk": chunk,
                        "chunk_index": i2,
                        "references":
                        {
                            "webpage": wpCollectionObj_uuid
                        }
                    }
                )
297
+
298
+
299
#################################################################
# Initialize the LLM.
# Loads a 4-bit quantized Llama-2-7B-chat GGUF model via llama-cpp-python.
# All keyword arguments below are passed explicitly (most at their library
# defaults) so the configuration is visible in one place.
model_path = "/app/llama-2-7b-chat.Q4_0.gguf"
llm = Llama(model_path,
            #*,
            n_gpu_layers=0,        # CPU-only inference: no layers offloaded.
            split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
            main_gpu=0,
            tensor_split=None,
            vocab_only=False,
            use_mmap=True,         # mmap the model file rather than copying.
            use_mlock=False,
            kv_overrides=None,
            seed=llama_cpp.LLAMA_DEFAULT_SEED,
            # NOTE(review): 512-token context, but runLLM() requests
            # max_tokens=1000 — confirm the context window is large enough.
            n_ctx=512,
            n_batch=512,
            n_threads=8,
            n_threads_batch=16,
            rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
            pooling_type=llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
            rope_freq_base=0.0,
            rope_freq_scale=0.0,
            yarn_ext_factor=-1.0,
            yarn_attn_factor=1.0,
            yarn_beta_fast=32.0,
            yarn_beta_slow=1.0,
            yarn_orig_ctx=0,
            logits_all=False,
            embedding=False,       # Text generation only; no embedding output.
            offload_kqv=True,
            last_n_tokens_size=64,
            lora_base=None,
            lora_scale=1.0,
            lora_path=None,
            numa=False,
            chat_format=None,
            chat_handler=None,
            draft_model=None,
            tokenizer=None,
            type_k=None,
            type_v=None,
            verbose=True
            )
342
+
343
+
344
def getRagData(promptText):
    """Retrieve RAG context for *promptText* from the Chunks collection.

    Encodes the prompt with a local SentenceTransformer model, runs a
    near-vector search against the chunk collection, and returns the matching
    chunk texts concatenated into a single newline-separated string
    (empty string when nothing is within the distance threshold).
    """
    ###############################################################################
    # Initialize the sentence transformer and encode the query prompt.
    # BUGFIX: this log line previously referenced an undefined name `text`.
    logger.info(f"#### Encode text query prompt to create vectors. {promptText}")
    model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')

    vector = model.encode(promptText)
    vectorList = []

    logger.debug("#### Print vectors.")
    for vec in vector:
        vectorList.append(vec)
    logger.debug(f"vectorList: {vectorList[2]}")

    # Fetch the closest chunks (cosine distance <= 0.7, top 3).
    logger.info("#### Retrieve semchunks from db using vectors from prompt.")
    semChunks = wpChunkCollection.query.near_vector(
        near_vector=vectorList,
        distance=0.7,
        limit=3
    )
    logger.debug(f"### semChunks[0]: {semChunks}")

    # Collect chunk text and log the owning document title for each hit.
    ragData = ""
    logger.info("#### Print individual retrieved chunks.")
    # BUGFIX: previously iterated `for chunk in enumerate(...)`, making `chunk`
    # an (index, object) tuple, so `ragData + "\n" + chunk[0]` was str + int.
    for chunkObj in semChunks.objects:
        logger.info(f"#### chunk: {chunkObj}")
        ragData = ragData + "\n" + chunkObj.properties['chunk']
        webpage_uuid = chunkObj.properties['references']['webpage']
        logger.info(f"webpage_uuid: {webpage_uuid}")
        wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
        logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
    #collection = client.collections.get("Chunks")
    return ragData
379
+
380
# Display UI: render each widget in creation order.
_ui_elements = (systemTextArea, userTextArea, ragPromptTextArea,
                responseTextArea, selectRag, submitButton)
for _elem in _ui_elements:
    display(_elem)
387
+
388
def runLLM(prompt):
    """Run the loaded llama.cpp model on *prompt* and return the stripped text.

    Generation settings: up to 1000 tokens, low temperature (0.3) and
    top_p (0.1) for near-deterministic output, prompt echoed back, and
    generation stopped at "Q" or a newline.
    """
    modelOutput = llm(
        prompt,
        max_tokens=1000,
        temperature=0.3,
        top_p=0.1,
        echo=True,
        stop=["Q", "\n"],
    )
    return modelOutput["choices"][0]["text"].strip()
405
+
406
def setPrompt(pprompt, ragFlag):
    """Build the prompt to send to the LLM.

    With ragFlag set, retrieved RAG context is prepended to the user's text;
    otherwise the user's text is returned unchanged.
    """
    print("\n### setPrompt() entered. ragFlag: ", ragFlag)
    if ragFlag:
        ragPrompt = getRagData(pprompt)
        # BUGFIX: this branch referenced an undefined name `intialPrompt`
        # (NameError at runtime) and left two dead assignments behind; the
        # original statement/question is the user's prompt, pprompt.
        userPrompt = "Using this information: " + ragPrompt \
            + "process the following statement or question and produce a response" \
            + pprompt
    else:
        userPrompt = pprompt
    #prompt = f""" <s> [INST] <<SYS>> {systemTextArea.value} </SYS>> Q: {userPrompt} A: [/INST]"""
    return userPrompt
419
+
420
+
421
def on_submitButton_clicked(b):
    """Button callback: build the prompt, run the LLM, show the response."""
    with output_widget:
        clear_output(wait=True)
        ragPromptTextArea.value = ""
        responseTextArea.value = ""
        # BUGFIX: `log` was undefined (NameError on click); the module
        # logger is named `logger`.
        logger.debug(f"### selectRag: {selectRag.value}")
        prompt = setPrompt(userTextArea.value, selectRag.value)
        # Show the generated (possibly RAG-augmented) prompt in its widget,
        # which was cleared above but never repopulated.
        ragPromptTextArea.value = prompt
        logger.debug("### prompt: " + prompt)
        # BUGFIX: the model's answer was computed and discarded, leaving
        # responseTextArea permanently blank; display it.
        responseTextArea.value = runLLM(prompt)

submitButton.on_click(on_submitButton_clicked)
display(output_widget)
433
+
434
+
435
+ #logger.info("#### Closing client db connection.")
436
+ #client.close()
437
+ #logger.info("#### Program terminating.")
startupDbgUI.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#! /bin/bash
# Container entrypoint for the debug-UI variant (referenced by the Dockerfile CMD).

# BUGFIX: the message previously said "startup.sh", but this script is
# startupDbgUI.sh — the wrong name makes container logs misleading.
echo "#### startupDbgUI.sh entered."

python /app/semsearchDbgUI.py
6
+
7
+