MVPilgrim committed on
Commit
ea3952b
·
1 Parent(s): 2c0a600
Files changed (3) hide show
  1. app.py +0 -426
  2. semsearch.py +10 -0
  3. startup.sh +2 -1
app.py CHANGED
@@ -1,426 +0,0 @@
1
- import weaviate
2
- from weaviate.connect import ConnectionParams
3
- from weaviate.classes.init import AdditionalConfig, Timeout
4
-
5
- from sentence_transformers import SentenceTransformer
6
- from langchain_community.document_loaders import BSHTMLLoader
7
- from pathlib import Path
8
- from lxml import html
9
- import logging
10
- from semantic_text_splitter import HuggingFaceTextSplitter
11
- from tokenizers import Tokenizer
12
- import json
13
- import os
14
- import re
15
- import logging
16
-
17
- import llama_cpp
18
- from llama_cpp import Llama
19
-
20
- import streamlit as st
21
-
22
-
23
- st.markdown("<h1 style='text-align: center; color: #666666;'>Vector Database RAG Proof of Concept</h1>", unsafe_allow_html=True)
24
- st.markdown("<h6 style='text-align: center; color: #666666;'>V1</h6>", unsafe_allow_html=True)
25
-
26
-
27
- weaviate_logger = logging.getLogger("httpx")
28
- weaviate_logger.setLevel(logging.WARNING)
29
-
30
- logger = logging.getLogger(__name__)
31
- logging.basicConfig(level=logging.INFO)
32
-
33
- # Function to load the CSS file
34
- def load_css(file_name):
35
- with open(file_name) as f:
36
- st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
37
-
38
- # Load the custom CSS
39
- load_css(".streamlit/main.css")
40
-
41
- st.markdown("<h1 style='text-align: center; color: #666666;'>Vector Database RAG Proof of Concept</h1>", unsafe_allow_html=True)
42
- st.markdown("<h6 style='text-align: center; color: #666666;'>V1</h6>", unsafe_allow_html=True)
43
-
44
- ######################################################################
45
- # MAINLINE
46
- #
47
- logger.info("#### MAINLINE ENTERED.")
48
-
49
- #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
50
- pathString = "/app/inputDocs"
51
- chunks = []
52
- webpageDocNames = []
53
- page_contentArray = []
54
- webpageChunks = []
55
- webpageTitles = []
56
- webpageChunksDocNames = []
57
-
58
- ######################################################
59
- # Connect to the Weaviate vector database.
60
- logger.info("#### Create Weaviate db client connection.")
61
- client = weaviate.WeaviateClient(
62
- connection_params=ConnectionParams.from_params(
63
- http_host="localhost",
64
- http_port="8080",
65
- http_secure=False,
66
- grpc_host="localhost",
67
- grpc_port="50051",
68
- grpc_secure=False
69
- ),
70
- additional_config=AdditionalConfig(
71
- timeout=Timeout(init=60, query=1800, insert=1800), # Values in seconds
72
- )
73
- )
74
- client.connect()
75
-
76
-
77
- #######################################################
78
- # Read each text input file, parse it into a document,
79
- # chunk it, collect chunks and document name.
80
- logger.info("#### Read and chunk input text files.")
81
- if not client.collections.exists("Documents") or not client.collections.exists("Documents") :
82
- for filename in os.listdir(pathString):
83
- logger.info(filename)
84
- path = Path(pathString + "/" + filename)
85
- filename = filename.rstrip(".html")
86
- webpageDocNames.append(filename)
87
- htmlLoader = BSHTMLLoader(path,"utf-8")
88
- htmlData = htmlLoader.load()
89
-
90
- title = htmlData[0].metadata['title']
91
- page_content = htmlData[0].page_content
92
-
93
- # Clean data. Remove multiple newlines, etc.
94
- page_content = re.sub(r'\n+', '\n',page_content)
95
-
96
- page_contentArray.append(page_content);
97
- webpageTitles.append(title)
98
- max_tokens = 1000
99
- tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
100
- logger.debug(f"### tokenizer: {tokenizer}")
101
- splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
102
- chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
103
-
104
- chunks = []
105
- for chnk in chunksOnePage:
106
- logger.debug(f"#### chnk in file: {chnk}")
107
- chunks.append(chnk)
108
- logger.debug(f"chunks: {chunks}")
109
- webpageChunks.append(chunks)
110
- webpageChunksDocNames.append(filename + "Chunks")
111
-
112
- logger.debug(f"### filename, title: {filename}, {title}")
113
- logger.debug(f"### webpageDocNames: {webpageDocNames}")
114
-
115
-
116
-
117
- ######################################################
118
- # Create database webpage and chunks collections.
119
- #wpCollection = createWebpageCollection()
120
- #wpChunkCollection = createChunksCollection()
121
- logger.info("#### createWebpageCollection() entered.")
122
- if not client.collections.exists("Documents"):
123
- #client.collections.delete("Documents")
124
- class_obj = {
125
- "class": "Documents",
126
- "description": "For first attempt at loading a Weviate database.",
127
- "vectorizer": "text2vec-transformers",
128
- "moduleConfig": {
129
- "text2vec-transformers": {
130
- "vectorizeClassName": False
131
- }
132
- },
133
- "vectorIndexType": "hnsw",
134
- "vectorIndexConfig": {
135
- "distance": "cosine",
136
- },
137
- "properties": [
138
- {
139
- "name": "title",
140
- "dataType": ["text"],
141
- "description": "HTML doc title.",
142
- "vectorizer": "text2vec-transformers",
143
- "moduleConfig": {
144
- "text2vec-transformers": {
145
- "vectorizePropertyName": True,
146
- "skip": False,
147
- "tokenization": "lowercase"
148
- }
149
- },
150
- "invertedIndexConfig": {
151
- "bm25": {
152
- "b": 0.75,
153
- "k1": 1.2
154
- },
155
- }
156
- },
157
- {
158
- "name": "content",
159
- "dataType": ["text"],
160
- "description": "HTML page content.",
161
- "moduleConfig": {
162
- "text2vec-transformers": {
163
- "vectorizePropertyName": True,
164
- "tokenization": "whitespace"
165
- }
166
- }
167
- }
168
- ]
169
- }
170
- wpCollection = client.collections.create_from_dict(class_obj)
171
-
172
- logger.info("#### createChunksCollection() entered.")
173
- if not client.collections.exists("Chunks"):
174
- #client.collections.delete("Chunks")
175
- class_obj = {
176
- "class": "Chunks",
177
- "description": "Collection for document chunks.",
178
- "vectorizer": "text2vec-transformers",
179
- "moduleConfig": {
180
- "text2vec-transformers": {
181
- "vectorizeClassName": True
182
- }
183
- },
184
- "vectorIndexType": "hnsw",
185
- "vectorIndexConfig": {
186
- "distance": "cosine",
187
- },
188
- "properties": [
189
- {
190
- "name": "chunk",
191
- "dataType": ["text"],
192
- "description": "Single webpage chunk.",
193
- "vectorizer": "text2vec-transformers",
194
- "moduleConfig": {
195
- "text2vec-transformers": {
196
- "vectorizePropertyName": False,
197
- "skip": False,
198
- "tokenization": "lowercase"
199
- }
200
- }
201
- },
202
- {
203
- "name": "chunk_index",
204
- "dataType": ["int"]
205
- },
206
- {
207
- "name": "webpage",
208
- "dataType": ["Documents"],
209
- "description": "Webpage content chunks.",
210
-
211
- "invertedIndexConfig": {
212
- "bm25": {
213
- "b": 0.75,
214
- "k1": 1.2
215
- }
216
- }
217
- }
218
- ]
219
- }
220
- wpChunkCollection = client.collections.create_from_dict(class_obj)
221
-
222
-
223
- ###########################################################
224
- # Create document and chunks objects in the database.
225
- if not client.collections.exists("Documents") :
226
- logger.info("#### Create page/doc db objects.")
227
- for i, className in enumerate(webpageDocNames):
228
- title = webpageTitles[i]
229
- logger.debug(f"## className, title: {className}, {title}")
230
- # Create Webpage Object
231
- page_content = page_contentArray[i]
232
- # Insert the document.
233
- wpCollectionObj_uuid = wpCollection.data.insert(
234
- {
235
- "name": className,
236
- "title": title,
237
- "content": page_content
238
- }
239
- )
240
-
241
- if not client.collections.exists("Chunks") :
242
- logger.info("#### Create chunk db objects.")
243
- # Insert the chunks for the document.
244
- for i2, chunk in enumerate(webpageChunks[i]):
245
- chunk_uuid = wpChunkCollection.data.insert(
246
- {
247
- "title": title,
248
- "chunk": chunk,
249
- "chunk_index": i2,
250
- "references":
251
- {
252
- "webpage": wpCollectionObj_uuid
253
- }
254
- }
255
- )
256
-
257
-
258
- #################################################################
259
- # Initialize the LLM.
260
- model_path = "/app/llama-2-7b-chat.Q4_0.gguf"
261
- llm = Llama(model_path,
262
- #*,
263
- n_gpu_layers=0,
264
- split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
265
- main_gpu=0,
266
- tensor_split=None,
267
- vocab_only=False,
268
- use_mmap=True,
269
- use_mlock=False,
270
- kv_overrides=None,
271
- seed=llama_cpp.LLAMA_DEFAULT_SEED,
272
- n_ctx=512,
273
- n_batch=512,
274
- n_threads=8,
275
- n_threads_batch=16,
276
- rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
277
- pooling_type=llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
278
- rope_freq_base=0.0,
279
- rope_freq_scale=0.0,
280
- yarn_ext_factor=-1.0,
281
- yarn_attn_factor=1.0,
282
- yarn_beta_fast=32.0,
283
- yarn_beta_slow=1.0,
284
- yarn_orig_ctx=0,
285
- logits_all=False,
286
- embedding=False,
287
- offload_kqv=True,
288
- last_n_tokens_size=64,
289
- lora_base=None,
290
- lora_scale=1.0,
291
- lora_path=None,
292
- numa=False,
293
- chat_format=None,
294
- chat_handler=None,
295
- draft_model=None,
296
- tokenizer=None,
297
- type_k=None,
298
- type_v=None,
299
- verbose=True
300
- )
301
-
302
-
303
- def getRagData(promptText):
304
- ###############################################################################
305
- # Initial the the sentence transformer and encode the query prompt.
306
- logger.info(f"#### Encode text query prompt to create vectors. {text}")
307
- model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')
308
-
309
- vector = model.encode(promptText)
310
- vectorList = []
311
-
312
- logger.debug("#### Print vectors.")
313
- for vec in vector:
314
- vectorList.append(vec)
315
- logger.debug(f"vectorList: {vectorList[2]}")
316
-
317
- # Fetch chunks and print chunks.
318
- logger.info("#### Retrieve semchunks from db using vectors from prompt.")
319
- semChunks = wpChunkCollection.query.near_vector(
320
- near_vector=vectorList,
321
- distance=0.7,
322
- limit=3
323
- )
324
- logger.debug(f"### semChunks[0]: {semChunks}")
325
-
326
- # Print chunks, corresponding document and document title.
327
- ragData = ""
328
- logger.info("#### Print individual retrieved chunks.")
329
- for chunk in enumerate(semChunks.objects):
330
- logger.info(f"#### chunk: {chunk}")
331
- ragData = ragData + "\n" + chunk[0]
332
- webpage_uuid = chunk[1].properties['references']['webpage']
333
- logger.info(f"webpage_uuid: {webpage_uuid}")
334
- wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
335
- logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
336
- #collection = client.collections.get("Chunks")
337
- return ragData
338
-
339
-
340
- # Display UI
341
- col1, col2 = st.columns(2)
342
-
343
- with col1:
344
- if "sysTA" not in st.session_state:
345
- st.session_state.sysTA = st.text_area(label="sysTA",value="fdsaf fsdafdsa")
346
- elif "sysTAtext" in st.session_state:
347
- st.session_state.sysTA = st.text_area(label="sysTA",value=st.session_state.sysTAtext)
348
- else:
349
- st.session_state.sysTA = st.text_area(label="sysTA",value=st.session_state.sysTA)
350
-
351
- if "userpTA" not in st.session_state:
352
- userTextArea = st.text_area(label="userpTA",value="fdsaf fsdafdsa")
353
- elif "userpTAtext" in st.session_state:
354
- st.session_state.userpTA = st.text_area(label="userpTA",value=st.session_state.userpTAtext)
355
- else:
356
- st.session_state.userpTA = st.text_area(label="userpTA",value=st.session_state.userpTA)
357
-
358
- with col2:
359
- if "ragpTA" not in st.session_state:
360
- ragPromptTextArea = st.text_area(label="ragpTA",value="fdsaf fsdafdsa")
361
- elif "ragpTAtext" in st.session_state:
362
- st.session_state.ragpTA = st.text_area(label="ragpTA",value=st.session_state.ragpTAtext)
363
- else:
364
- st.session_state.ragTA = st.text_area(label="ragTA",value=st.session_state.ragTA)
365
-
366
- if "rspTA" not in st.session_state:
367
- responseTextArea = st.text_area(label="rspTA",value="fdsaf fsdafdsa")
368
- elif "rspTAtext" in st.session_state:
369
- st.session_state.rspTA = st.text_area(label="rspTA",value=st.session_state.rspTAtext)
370
- else:
371
- st.session_state.rspTA = st.text_area(label="rspTA",value=st.session_state.rspTA)
372
-
373
- def runLLM(prompt):
374
- max_tokens = 1000
375
- temperature = 0.3
376
- top_p = 0.1
377
- echo = True
378
- stop = ["Q", "\n"]
379
-
380
- modelOutput = llm(
381
- prompt,
382
- max_tokens=max_tokens,
383
- temperature=temperature,
384
- top_p=top_p,
385
- echo=echo,
386
- stop=stop,
387
- )
388
- result = modelOutput["choices"][0]["text"].strip()
389
- return(result)
390
-
391
- def setPrompt(pprompt,ragFlag):
392
- print("\n### setPrompt() entered. ragFlag: ",ragFlag)
393
- if ragFlag:
394
- ragPrompt = getRagData(pprompt)
395
- userPrompt = pprompt + "\n" + ragPrompt
396
- prompt = userPrompt
397
- userPrompt = "Using this information: " + ragPrompt \
398
- + "process the following statement or question and produce a a response" \
399
- + intialPrompt
400
- else:
401
- userPrompt = pprompt
402
- #prompt = f""" <s> [INST] <<SYS>> {systemTextArea.value} </SYS>> Q: {userPrompt} A: [/INST]"""
403
- return userPrompt
404
-
405
-
406
- def on_submitButton_clicked(b):
407
- logger.debug("\n### on_submitButton_clicked")
408
- st.session_state.sysTAtext = st.session_state.sysTA
409
- logger.info(f"sysTAtext: {st.session_state.sysTAtext}")
410
-
411
- st.session_state.userpTAtext = setPrompt("","")
412
- st.session_state.userpTA = st.session_state.userpTAtext
413
- logger.info(f"userpTAtext: {st.session_state.userpTAtext}")
414
-
415
- st.session_state.rspTAtext = runLLM(st.session_state.userpTAtext)
416
- st.session_state.rspTA = st.session_state.rspTAtext
417
- logger.info(f"rspTAtext: {st.session_state.rspTAtext}")
418
-
419
-
420
- with st.sidebar:
421
- st.selectRag = st.checkbox("Enable Query With RAG",value=False,key="selectRag",help=None,on_change=None,args=None,kwargs=None,disabled=False,label_visibility="visible")
422
- st.submitButton = st.button("Run LLM Query",key=None,help=None,on_click=on_submitButton_clicked,args=None,kwargs=None,type="secondary",disabled=False,use_container_width=False)
423
-
424
- #logger.info("#### Closing client db connection.")
425
- #client.close()
426
- #logger.info("#### Program terminating.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
semsearch.py CHANGED
@@ -18,6 +18,7 @@ import llama_cpp
18
  from llama_cpp import Llama
19
 
20
  import streamlit as st
 
21
 
22
 
23
  st.markdown("<h1 style='text-align: center; color: #666666;'>Vector Database RAG Proof of Concept</h1>", unsafe_allow_html=True)
@@ -30,6 +31,15 @@ weaviate_logger.setLevel(logging.WARNING)
30
  logger = logging.getLogger(__name__)
31
  logging.basicConfig(level=logging.INFO)
32
 
 
 
 
 
 
 
 
 
 
33
  # Function to load the CSS file
34
  def load_css(file_name):
35
  with open(file_name) as f:
 
18
  from llama_cpp import Llama
19
 
20
  import streamlit as st
21
+ import subprocess
22
 
23
 
24
  st.markdown("<h1 style='text-align: center; color: #666666;'>Vector Database RAG Proof of Concept</h1>", unsafe_allow_html=True)
 
31
  logger = logging.getLogger(__name__)
32
  logging.basicConfig(level=logging.INFO)
33
 
34
+
35
+ def runStartup():
36
+ result = subprocess.run(["bash", "startup.sh"], capture_output=True, text=True)
37
+ #print("Output:", result.stdout)
38
+ #print("Error:", result.stderr)
39
+ logger(f"Return code: {result.returncode}")
40
+ logger("### Running startup.sh")
41
+ runStartup()
42
+
43
  # Function to load the CSS file
44
  def load_css(file_name):
45
  with open(file_name) as f:
startup.sh CHANGED
@@ -62,8 +62,9 @@ env
62
 
63
  echo "#### Before sleep."
64
  sleep 30
 
65
 
66
- echo "#### Before /app/semsearch.py"
67
  #python /app/semsearch.py & #2>& 1 | tee /data/var/lib/weaviate/ss.log &
68
  #streamlit run /app/semsearch.py &
69
 
 
62
 
63
  echo "#### Before sleep."
64
  sleep 30
65
+ echo "#### startup.sh exiting."
66
 
67
+ #echo "#### Before /app/semsearch.py"
68
  #python /app/semsearch.py & #2>& 1 | tee /data/var/lib/weaviate/ss.log &
69
  #streamlit run /app/semsearch.py &
70