MVPilgrim committed on
Commit
daca123
·
1 Parent(s): 6281b13
Files changed (2) hide show
  1. Dockerfile +5 -7
  2. app.py +436 -0
Dockerfile CHANGED
@@ -78,10 +78,8 @@ WORKDIR /app
78
  #CMD ["/app/startupDbgUI.sh"]
79
  EXPOSE 8501
80
  #CMD /app/startup.sh; /usr/local/bin/streamlit run semsearch.py --server.port=8501 --server.address=0.0.0.0
81
- #CMD streamlit run stream.py \
82
- # --server.headless true \
83
- # --server.enableCORS false \
84
- # --server.enableXsrfProtection false \
85
- # --server.fileWatcherType none
86
-
87
- CMD streamlit run stream.py
 
78
  #CMD ["/app/startupDbgUI.sh"]
79
  EXPOSE 8501
80
  #CMD /app/startup.sh; /usr/local/bin/streamlit run semsearch.py --server.port=8501 --server.address=0.0.0.0
81
+ CMD streamlit run app.py \
82
+ --server.headless true \
83
+ --server.enableCORS false \
84
+ --server.enableXsrfProtection false \
85
+ --server.fileWatcherType none
 
 
app.py CHANGED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import weaviate
2
+ from weaviate.connect import ConnectionParams
3
+ from weaviate.classes.init import AdditionalConfig, Timeout
4
+
5
+ from sentence_transformers import SentenceTransformer
6
+ from langchain_community.document_loaders import BSHTMLLoader
7
+ from pathlib import Path
8
+ from lxml import html
9
+ import logging
10
+ from semantic_text_splitter import HuggingFaceTextSplitter
11
+ from tokenizers import Tokenizer
12
+ import json
13
+ import os
14
+ import re
15
+ import logging
16
+
17
+ import llama_cpp
18
+ from llama_cpp import Llama
19
+
20
+ import streamlit as st
21
+ import subprocess
22
+
23
+
24
# Page heading (rendered before the custom CSS is loaded).
st.markdown(
    "<h1 style='text-align: center; color: #666666;'>Vector Database RAG Proof of Concept</h1>",
    unsafe_allow_html=True,
)
st.markdown(
    "<h6 style='text-align: center; color: #666666;'>V1</h6>",
    unsafe_allow_html=True,
)

# Quiet the chatty HTTP client used by the Weaviate SDK.
weaviate_logger = logging.getLogger("httpx")
weaviate_logger.setLevel(logging.WARNING)

# Module logger for this app.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
33
+
34
+
35
def runStartup():
    """Run startup.sh in a subshell and log its stdout/stderr/return code.

    BUG FIX: the original invoked ``logger(...)`` as if the Logger object
    were callable, which raises TypeError at import time; logging must go
    through ``logger.info(...)``.
    """
    result = subprocess.run(["bash", "startup.sh"], capture_output=True, text=True)
    logger.info(f"startup.sh stdout: {result.stdout}")
    logger.info(f"startup.sh stderr: {result.stderr}")
    logger.info(f"Return code: {result.returncode}")

logger.info("### Running startup.sh")
runStartup()
42
+
43
# Function to load the CSS file
def load_css(file_name):
    """Read file_name and inject its contents as an inline <style> block."""
    with open(file_name) as css_file:
        style_block = f"<style>{css_file.read()}</style>"
    st.markdown(style_block, unsafe_allow_html=True)

# Load the custom CSS
load_css(".streamlit/main.css")
50
+
51
# NOTE: the page heading is already rendered near the top of this script;
# the identical st.markdown() h1/h6 calls that were duplicated here drew the
# title twice on the page, so the duplicate render was removed.
53
+
54
######################################################################
# MAINLINE
#
logger.info("#### MAINLINE ENTERED.")

#pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
pathString = "/app/inputDocs"

# Accumulators shared by the document-loading steps below.
chunks = []
webpageDocNames = []
page_contentArray = []
webpageChunks = []
webpageTitles = []
webpageChunksDocNames = []

######################################################
# Connect to the Weaviate vector database.
logger.info("#### Create Weaviate db client connection.")
connection_params = ConnectionParams.from_params(
    http_host="localhost",
    http_port="8080",
    http_secure=False,
    grpc_host="localhost",
    grpc_port="50051",
    grpc_secure=False,
)
client = weaviate.WeaviateClient(
    connection_params=connection_params,
    additional_config=AdditionalConfig(
        # Generous query/insert timeouts (seconds): bulk inserts are slow.
        timeout=Timeout(init=60, query=1800, insert=1800),
    ),
)
client.connect()
85
+
86
+
87
#######################################################
# Read each text input file, parse it into a document,
# chunk it, collect chunks and document name.
logger.info("#### Read and chunk input text files.")
# Only re-read and re-chunk the corpus when a collection is missing.
# BUG FIX: the original tested exists("Documents") twice; the second
# test is meant to cover the "Chunks" collection.
if not client.collections.exists("Documents") or not client.collections.exists("Chunks"):
    max_tokens = 1000
    # Hoisted out of the loop: the tokenizer/splitter are file-independent.
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
    logger.debug(f"### tokenizer: {tokenizer}")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

    for filename in os.listdir(pathString):
        logger.info(filename)
        path = Path(pathString) / filename
        # BUG FIX: rstrip(".html") strips any trailing '.', 'h', 't', 'm', 'l'
        # characters (e.g. "chart.html" -> "char"); strip the suffix instead.
        if filename.endswith(".html"):
            filename = filename[: -len(".html")]
        webpageDocNames.append(filename)

        # Parse the HTML file into a LangChain document.
        htmlLoader = BSHTMLLoader(path, "utf-8")
        htmlData = htmlLoader.load()

        title = htmlData[0].metadata['title']
        page_content = htmlData[0].page_content

        # Clean data. Remove multiple newlines, etc.
        page_content = re.sub(r'\n+', '\n', page_content)

        page_contentArray.append(page_content)
        webpageTitles.append(title)

        # Split the page into small semantic chunks (~50-token capacity).
        chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
        chunks = []
        for chnk in chunksOnePage:
            logger.debug(f"#### chnk in file: {chnk}")
            chunks.append(chnk)
        logger.debug(f"chunks: {chunks}")
        webpageChunks.append(chunks)
        webpageChunksDocNames.append(filename + "Chunks")

        # BUG FIX: the original logged a literal "(unknown)" placeholder
        # instead of the filename.
        logger.debug(f"### filename, title: {filename}, {title}")
        logger.debug(f"### webpageDocNames: {webpageDocNames}")
124
+
125
+
126
+
127
######################################################
# Create database webpage and chunks collections.
#wpCollection = createWebpageCollection()
#wpChunkCollection = createChunksCollection()
logger.info("#### createWebpageCollection() entered.")
if not client.collections.exists("Documents"):
    #client.collections.delete("Documents")
    # Property schema: vectorized document title.
    titleProp = {
        "name": "title",
        "dataType": ["text"],
        "description": "HTML doc title.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": True,
                "skip": False,
                "tokenization": "lowercase"
            }
        },
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2
            },
        }
    }
    # Property schema: full page text.
    contentProp = {
        "name": "content",
        "dataType": ["text"],
        "description": "HTML page content.",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": True,
                "tokenization": "whitespace"
            }
        }
    }
    # Class definition for the "Documents" collection (HNSW/cosine index).
    class_obj = {
        "class": "Documents",
        "description": "For first attempt at loading a Weviate database.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizeClassName": False
            }
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [titleProp, contentProp]
    }
    wpCollection = client.collections.create_from_dict(class_obj)
181
+
182
logger.info("#### createChunksCollection() entered.")
if not client.collections.exists("Chunks"):
    #client.collections.delete("Chunks")
    # Property schema: the vectorized chunk text itself.
    chunkProp = {
        "name": "chunk",
        "dataType": ["text"],
        "description": "Single webpage chunk.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": False,
                "skip": False,
                "tokenization": "lowercase"
            }
        }
    }
    # Property schema: position of the chunk within its page.
    chunkIndexProp = {
        "name": "chunk_index",
        "dataType": ["int"]
    }
    # Property schema: cross-reference back to the parent Documents object.
    webpageRefProp = {
        "name": "webpage",
        "dataType": ["Documents"],
        "description": "Webpage content chunks.",

        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2
            }
        }
    }
    # Class definition for the "Chunks" collection (HNSW/cosine index).
    class_obj = {
        "class": "Chunks",
        "description": "Collection for document chunks.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizeClassName": True
            }
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [chunkProp, chunkIndexProp, webpageRefProp]
    }
    wpChunkCollection = client.collections.create_from_dict(class_obj)
231
+
232
+
233
###########################################################
# Create document and chunks objects in the database.
# NOTE(review): these exists() guards look inverted — the collections were
# just created above, so exists("Documents") is True here and this insert
# loop is skipped on the very run that should populate it. Presumably the
# intent was "insert only when the collection was newly created" — confirm.
# Also note wpCollection/wpChunkCollection are only bound inside the
# creation branches above, so running this when the collections pre-exist
# would raise NameError.
if not client.collections.exists("Documents") :
    logger.info("#### Create page/doc db objects.")
    for i, className in enumerate(webpageDocNames):
        title = webpageTitles[i]
        logger.debug(f"## className, title: {className}, {title}")
        # Create Webpage Object
        page_content = page_contentArray[i]
        # Insert the document.
        wpCollectionObj_uuid = wpCollection.data.insert(
            {
                "name": className,
                "title": title,
                "content": page_content
            }
        )

        if not client.collections.exists("Chunks") :
            logger.info("#### Create chunk db objects.")
            # Insert the chunks for the document, each carrying its index
            # and a cross-reference back to the parent document's UUID.
            for i2, chunk in enumerate(webpageChunks[i]):
                chunk_uuid = wpChunkCollection.data.insert(
                    {
                        "title": title,
                        "chunk": chunk,
                        "chunk_index": i2,
                        "references":
                        {
                            "webpage": wpCollectionObj_uuid
                        }
                    }
                )
266
+
267
+
268
#################################################################
# Initialize the LLM.
# Loads a local quantized Llama-2 7B chat model through llama-cpp-python.
# Every constructor keyword is passed explicitly; n_gpu_layers=0 keeps
# inference on the CPU and n_ctx=512 caps the context window.
model_path = "/app/llama-2-7b-chat.Q4_0.gguf"
llm = Llama(model_path,
    #*,
    n_gpu_layers=0,      # CPU only: no layers offloaded to a GPU
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
    main_gpu=0,
    tensor_split=None,
    vocab_only=False,
    use_mmap=True,       # memory-map the model file instead of reading it in
    use_mlock=False,
    kv_overrides=None,
    seed=llama_cpp.LLAMA_DEFAULT_SEED,
    n_ctx=512,           # context window size in tokens
    n_batch=512,
    n_threads=8,
    n_threads_batch=16,
    rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
    rope_freq_base=0.0,
    rope_freq_scale=0.0,
    yarn_ext_factor=-1.0,
    yarn_attn_factor=1.0,
    yarn_beta_fast=32.0,
    yarn_beta_slow=1.0,
    yarn_orig_ctx=0,
    logits_all=False,
    embedding=False,
    offload_kqv=True,
    last_n_tokens_size=64,
    lora_base=None,
    lora_scale=1.0,
    lora_path=None,
    numa=False,
    chat_format=None,    # NOTE(review): None lets the library pick a format — confirm
    chat_handler=None,
    draft_model=None,
    tokenizer=None,
    type_k=None,
    type_v=None,
    verbose=True
)
311
+
312
+
313
def getRagData(promptText):
    """Encode promptText, retrieve nearby chunks from Weaviate, and return
    their text concatenated into a single RAG context string.

    BUG FIXES vs. the original:
    - logged an undefined name ``text`` (NameError); now logs promptText.
    - iterated ``for chunk in enumerate(...)`` and concatenated chunk[0]
      (an int index) into a string, raising TypeError on the first chunk;
      now iterates the result objects directly and appends the chunk text.
    """
    ###############################################################################
    # Initialize the sentence transformer and encode the query prompt.
    logger.info(f"#### Encode text query prompt to create vectors. {promptText}")
    model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')

    vector = model.encode(promptText)
    vectorList = []

    logger.debug("#### Print vectors.")
    for vec in vector:
        vectorList.append(vec)
    logger.debug(f"vectorList: {vectorList[2]}")

    # Fetch chunks and print chunks.
    logger.info("#### Retrieve semchunks from db using vectors from prompt.")
    semChunks = wpChunkCollection.query.near_vector(
        near_vector=vectorList,
        distance=0.7,
        limit=3
    )
    logger.debug(f"### semChunks[0]: {semChunks}")

    # Concatenate retrieved chunk text; log the source document's title too.
    ragData = ""
    logger.info("#### Print individual retrieved chunks.")
    for chunkObj in semChunks.objects:
        logger.info(f"#### chunk: {chunkObj}")
        ragData = ragData + "\n" + chunkObj.properties['chunk']
        # NOTE(review): assumes the cross-reference is exposed under
        # properties['references']['webpage'] — confirm against the schema.
        webpage_uuid = chunkObj.properties['references']['webpage']
        logger.info(f"webpage_uuid: {webpage_uuid}")
        wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
        logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
    #collection = client.collections.get("Chunks")
    return ragData
348
+
349
+
350
# Display UI
def _render_text_area(taKey, default="fdsaf fsdafdsa"):
    """Render one text_area bound to st.session_state[taKey].

    Display-value precedence:
    1) first render -> placeholder default;
    2) a pending "<key>text" snapshot written by the submit handler;
    3) the widget's previous value.
    """
    snapKey = taKey + "text"
    if taKey not in st.session_state:
        st.session_state[taKey] = st.text_area(label=taKey, value=default)
    elif snapKey in st.session_state:
        st.session_state[taKey] = st.text_area(label=taKey, value=st.session_state[snapKey])
    else:
        st.session_state[taKey] = st.text_area(label=taKey, value=st.session_state[taKey])

col1, col2 = st.columns(2)

with col1:
    _render_text_area("sysTA")
    # BUG FIX: the original stored the first render in a local variable
    # (userTextArea), so st.session_state.userpTA was never initialized.
    _render_text_area("userpTA")

with col2:
    # BUG FIX: the original's else-branch used the key "ragTA" instead of
    # "ragpTA", raising AttributeError once the widget had a prior value;
    # rspTA had the same local-variable initialization bug as userpTA.
    _render_text_area("ragpTA")
    _render_text_area("rspTA")
382
+
383
def runLLM(prompt):
    """Run prompt through the loaded llama.cpp model and return the
    generated text with surrounding whitespace stripped."""
    modelOutput = llm(
        prompt,
        max_tokens=1000,      # generation cap
        temperature=0.3,      # low randomness
        top_p=0.1,            # tight nucleus sampling
        echo=True,            # include the prompt in the output
        stop=["Q", "\n"],     # stop sequences
    )
    return modelOutput["choices"][0]["text"].strip()
400
+
401
def setPrompt(pprompt, ragFlag):
    """Build the final prompt for the LLM.

    pprompt -- the user's raw prompt text.
    ragFlag -- truthy to prepend retrieved RAG context via getRagData().
    Returns the prompt string to send to the model.

    BUG FIX: the RAG branch referenced an undefined name ``intialPrompt``
    (NameError) and left a dead ``prompt = userPrompt`` assignment; the
    instruction text was also garbled ("a a response", missing separator).
    """
    print("\n### setPrompt() entered. ragFlag: ", ragFlag)
    if ragFlag:
        ragPrompt = getRagData(pprompt)
        userPrompt = ("Using this information: " + ragPrompt
                      + "\nprocess the following statement or question"
                      + " and produce a response: " + pprompt)
    else:
        userPrompt = pprompt
    #prompt = f""" <s> [INST] <<SYS>> {systemTextArea.value} </SYS>> Q: {userPrompt} A: [/INST]"""
    return userPrompt
414
+
415
+
416
def on_submitButton_clicked(b):
    """Submit handler: snapshot the system/user text areas, build the
    (optionally RAG-augmented) prompt, run the LLM, and store the response
    back into session state for the next rerender.

    BUG FIX: the original called setPrompt("", "") — discarding the user's
    typed prompt and ignoring the RAG checkbox entirely.
    """
    logger.debug("\n### on_submitButton_clicked")
    st.session_state.sysTAtext = st.session_state.sysTA
    logger.info(f"sysTAtext: {st.session_state.sysTAtext}")

    # Use .get() so a missing key degrades to an empty prompt / RAG off
    # instead of raising.
    st.session_state.userpTAtext = setPrompt(
        st.session_state.get("userpTA", ""),
        st.session_state.get("selectRag", False),
    )
    st.session_state.userpTA = st.session_state.userpTAtext
    logger.info(f"userpTAtext: {st.session_state.userpTAtext}")

    st.session_state.rspTAtext = runLLM(st.session_state.userpTAtext)
    st.session_state.rspTA = st.session_state.rspTAtext
    logger.info(f"rspTAtext: {st.session_state.rspTAtext}")
428
+
429
+
430
# Sidebar: RAG toggle plus the query-submit button.
with st.sidebar:
    st.selectRag = st.checkbox(
        "Enable Query With RAG",
        value=False,
        key="selectRag",
        help=None,
        on_change=None,
        args=None,
        kwargs=None,
        disabled=False,
        label_visibility="visible",
    )
    st.submitButton = st.button(
        "Run LLM Query",
        key=None,
        help=None,
        on_click=on_submitButton_clicked,
        args=None,
        kwargs=None,
        type="secondary",
        disabled=False,
        use_container_width=False,
    )

#logger.info("#### Closing client db connection.")
#client.close()
#logger.info("#### Program terminating.")
436
+ #logger.info("#### Program terminating.")