MVPilgrim committed · Commit 45bc919
1 Parent(s): 0d4153f

Got it running.

Files changed:
- Dockerfile +11 -6
- DockerfilePythonWeaviate +59 -0
- DockerfileTestWvT2v +15 -0
- multi-qa-MiniLM-L6-cos-v1 +1 -0
- requirements.txt +13 -12
- requirements_Orig.txt +19 -0
- semsearch.py +70 -88
- startup.sh +20 -8
Dockerfile
CHANGED
@@ -30,6 +30,8 @@ RUN go mod download
 ###############################################################################
 # This image builds the weaviate server
 FROM build_base AS server_builder
+RUN apk add python3.11.5
+
 ARG TARGETARCH
 ARG GITHASH="unknown"
 ARG EXTRA_BUILD_ARGS=""
@@ -40,9 +42,11 @@ RUN CGO_ENABLED=0 GOARCH=$TARGETARCH go build $EXTRA_BUILD_ARGS \
 
 ###############################################################################
 #python environment and app.
-FROM python:3.11.5
+#FROM python:3.11.5
 #ENTRYPOINT ["/app/startup.sh"]
-RUN apt update
+#RUN apt-get update && \
+#    apt-get install -y libc6 && \
+#    rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 
 #RUN ls -l / || ls -l /lib || ls -l /usr || ls -l /usr/lib6 || echo "### An ls failed."
@@ -55,10 +59,11 @@ RUN chmod 755 /app/startup.sh
 COPY --from=weaviate /bin/weaviate /app/weaviate
 COPY --from=weaviate ./modules ./
 
-COPY --from=server_builder /lib/libc.musl-x86_64.so.1 /lib
-
-RUN
-RUN
+#COPY --from=server_builder /lib/libc.musl-x86_64.so.1 /lib
+#COPY /lib/libc.musl-x86_64.so.1 /lib
+#RUN mkdir -p /usr/lib64 y
+#RUN ls -l /usr/lib64
+#RUN ln -s /usr/lib64/libc.so.6 /usr/lib64/libc.musl-x86_64.so.1
 
 RUN mkdir -p /var/lib/weaviate/data y
 RUN chmod -R 777 /var
DockerfilePythonWeaviate
ADDED
@@ -0,0 +1,59 @@
+###############################################################################
+#python environment, main app and startup script.
+FROM python:3.11.5
+#FROM python:3.11.9-slim
+#FROM python:3.11.9-alpine
+#FROM python:3.11-bookworm
+
+ENTRYPOINT ["/app/startup.sh"]
+#RUN apt-get update && \
+#    apt-get install -y libc6 && \
+#    rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+
+#RUN ls -l / || ls -l /lib || ls -l /usr || ls -l /usr/lib6 || echo "### An ls failed."
+
+COPY ./requirements.txt /app/requirements.txt
+COPY ./semsearch.py /app/semsearch.py
+COPY ./startup.sh /app/startup.sh
+RUN chmod 755 /app/startup.sh
+
+COPY ./multi-qa-MiniLM-L6-cos-v1 /app/multi-qa-MiniLM-L6-cos-v1
+
+RUN mkdir -p /app/inputDocs
+COPY ./inputDocs/* /app/inputDocs
+RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
+RUN pip install https://files.pythonhosted.org/packages/13/87/e0cb08c2d4bd7d38ab63816b306c8b1e7cfdc0e59bd54462e8b0df069078/semantic_text_splitter-0.6.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+RUN pip show semantic-text-splitter
+
+##############################################################################
+# Install Weaviate
+WORKDIR /app/weaviate
+RUN wget -qO- https://github.com/weaviate/weaviate/releases/download/v1.24.10/weaviate-v1.24.10-linux-amd64.tar.gz | tar -xzf -
+RUN ls -al /app/weaviate
+
+# Set environment variables for Weaviate
+ENV PATH="/app:/app/weaviate-v1.24.10-linux-x86_64:${PATH}"
+# Expose the Weaviate port
+EXPOSE 8080
+
+##############################################################################
+# Install text2vec-transformers
+WORKDIR /app/text2vec-transformers
+COPY --from=semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 /app /app/text2vec-transformers
+COPY --from=semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 /usr/local/bin /app/text2vec-transformers/bin
+
+COPY ./multi-qa-MiniLM-L6-cos-v1 /app/app/text2vec-transformers
+
+ENV PATH="/app/text2vec-transformers:/app/text2vec-transformers/bin:${PATH}"
+#RUN pip install -r requirements.txt
+#RUN pip install nltk==3.8.1 optimum==1.13.2 onnxruntime==1.16.1 onnx==1.14.1
+RUN ./custom_prerequisites.py
+
+##############################
+RUN useradd -m -u 1000 user
+
+##############################################################################
+# Start the weaviate vector database, text2vec-transformers and the semantic search app.
+#RUN /app/startup.sh
+CMD ["/app/startup.sh"]
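Note: a minimal local build/run sketch for this image, assuming the file is kept at the repo root under the name DockerfilePythonWeaviate; the image tag (semsearch-space) and the port publishing are illustrative choices, not part of this commit.

# build the combined Weaviate + text2vec-transformers + semsearch image
docker build -f DockerfilePythonWeaviate -t semsearch-space .
# run it, publishing the Weaviate port declared by EXPOSE 8080
docker run --rm -p 8080:8080 semsearch-space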
DockerfileTestWvT2v
ADDED
@@ -0,0 +1,15 @@
+# Start with the official Weaviate image
+FROM semitechnologies/weaviate:latest
+
+# Set environment variables
+ENV WEAVIATE_SERVE_MODULES text2vec-transformers
+
+# Install Python and pip via apk, the package manager for Alpine
+RUN apk update && apk add --no-cache python3 py3-pip transformers
+#RUN pip3 install --no-cache-dir transformers
+
+# Expose the default port for Weaviate
+EXPOSE 8080
+
+# Start Weaviate
+CMD ["weaviate", "start"]
multi-qa-MiniLM-L6-cos-v1
ADDED
@@ -0,0 +1 @@
+Subproject commit 38845167a107b59398111f0cfb430897cf1a4639
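Note: multi-qa-MiniLM-L6-cos-v1 is added as a git submodule (the Subproject commit pointer above), so a fresh clone needs the model files pulled before the COPY ./multi-qa-MiniLM-L6-cos-v1 steps in DockerfilePythonWeaviate can succeed. A sketch, assuming the submodule URL is recorded in .gitmodules:

# fetch the pinned sentence-transformer model checkout into the working tree
git submodule update --init multi-qa-MiniLM-L6-cos-v1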
requirements.txt
CHANGED
@@ -1,15 +1,16 @@
-
-torch
-gradio
-sentencepiece
-protobuf
-weaviate-client==4.5.1
+weaviate-client==4.*
 sentence-transformers
 langchain
 lxml
-
-
-
-
-
-
+beautifulsoup4
+
+transformers==4.34.1
+fastapi==0.103.2
+uvicorn==0.23.2
+nltk==3.8.1
+torch==2.0.1
+sentencepiece==0.1.99
+sentence-transformers==2.2.2
+optimum==1.13.2
+onnxruntime==1.16.1
+onnx==1.14.1
requirements_Orig.txt
ADDED
@@ -0,0 +1,19 @@
+transformers==4.40.1
+torch==2.3.0
+#gradio
+#sentencepiece
+#protobuf
+weaviate-client==4.*
+sentence-transformers
+langchain
+lxml
+#huggingface-hub
+#semantic-text-splitter
+#tokenizers
+#json5
+#regex
+beautifulsoup4
+uvicorn
+fastapi
+optimum==1.16.2
+onnx
semsearch.py
CHANGED
@@ -1,6 +1,5 @@
 import weaviate
-
-#from weaviate.embedded import EmbeddedOptions
+
 from sentence_transformers import SentenceTransformer
 from langchain_community.document_loaders import BSHTMLLoader
 from pathlib import Path
@@ -11,9 +10,19 @@ from tokenizers import Tokenizer
 import json
 import os
 import re
+import logging
+
+weaviate_logger = logging.getLogger("httpx")
+weaviate_logger.setLevel(logging.WARNING)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
 
+#################################################################
+# Create the chunks collection for the Weaviate database.
 def createChunksCollection():
-
+    logger.info("#### createChunksCollection() entered.")
     if client.collections.exists("Chunks"):
         client.collections.delete("Chunks")
 
@@ -62,11 +71,13 @@
            }
        ]
    }
-
    return(client.collections.create_from_dict(class_obj))
 
+
+#####################################################################
+# Create the document collection for the Weaviate database.
 def createWebpageCollection():
-
+    logger.info("#### createWebpageCollection() entered.")
     if client.collections.exists("Documents"):
         client.collections.delete("Documents")
 
@@ -84,11 +95,6 @@
            "distance": "cosine",
        },
        "properties": [
-           #{
-           #    "docname": "fdsa",
-           #    "dataType": ["text"],
-           #    "description": "Name of document"
-           #},
            {
                "name": "title",
                "dataType": ["text"],
@@ -121,61 +127,43 @@
            }
        ]
    }
-
    return(client.collections.create_from_dict(class_obj))
 
 
-
+######################################################################
 # MAINLINE
 #
+logger.info("#### MAINLINE ENTERED.")
+
 #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
-pathString = "inputDocs"
+pathString = "/app/inputDocs"
 chunks = []
 webpageDocNames = []
-#webpageChunksClassesNames = []
 page_contentArray = []
 webpageChunks = []
 webpageTitles = []
 webpageChunksDocNames = []
 
-#client = weaviate.WeaviateClient(
-#    embedded_options=EmbeddedOptions(
-#        additional_env_vars={
-#            "ENABLE_MODULES": "backup-filesystem,text2vec-transformers",
-#            "BACKUP_FILESYSTEM_PATH": "/tmp/backups",
-#            "PERSISTENCE_DATA_PATH": "/var/lib/weaviate",
-#            "DEFAULT_VECTORIZER_MODULE": "text2vec-transformers"
-#            #"TRANSFORMERS_INFERENCE_API": "http://huggingface.co/spaces/MVPilgrim/WeaviateDB:8080"
-#
-#        }
-#    )
-#)
-
-#client = weaviate.connect_to_custom(
-#    #http_host="http://huggingface.co/spaces/MVPilgrim/WeaviateDB",
-#    http_host="http://weaviate",
-#    http_port=8080,
-#    http_secure=False,
-#    #grpc_host="huggingface.co",
-#    grpc_host="127.0.0.1",
-#    grpc_port=50051,
-#    grpc_secure=False
-#    #auth_credentials=AuthApiKey(weaviate_key), # `weaviate_key`: your Weaviate API key
-#)
 
-
-
+######################################################
+# Connect to the Weaviate vector database.
+logger.info("#### Create Weaviate db client connection.")
+client = weaviate.connect_to_custom(
+    http_host="127.0.0.1",
+    http_port=8080,
+    http_secure=False,
+    grpc_host="127.0.0.1",
+    grpc_port=50051,
+    grpc_secure=False
 )
-
-#client = weaviate.connect_to_local(
-#    #cluster_url="http://localhost:8080"
-#)
-print("#### client: ",client)
-
 client.connect()
 
+#######################################################
+# Read each text input file, parse it into a document,
+# chunk it, collect chunks and document name.
+logger.info("#### Read and chunk input text files.")
 for filename in os.listdir(pathString):
-
+    logger.info(filename)
     path = Path(pathString + "/" + filename)
     filename = filename.rstrip(".html")
     webpageDocNames.append(filename)
@@ -185,38 +173,43 @@ for filename in os.listdir(pathString):
     title = htmlData[0].metadata['title']
     page_content = htmlData[0].page_content
 
-    #
+    # Clean data. Remove multiple newlines, etc.
     page_content = re.sub(r'\n+', '\n',page_content)
 
     page_contentArray.append(page_content);
     webpageTitles.append(title)
-    #htmlDocument = htmlData[0]
     max_tokens = 1000
     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
+    logger.debug(f"### tokenizer: {tokenizer}")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
    chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
 
    chunks = []
    for chnk in chunksOnePage:
-
+        logger.debug(f"#### chnk in file: {chnk}")
        chunks.append(chnk)
-
+    logger.debug(f"chunks: {chunks}")
    webpageChunks.append(chunks)
    webpageChunksDocNames.append(filename + "Chunks")
 
-
-
+    logger.debug(f"### filename, title: {filename}, {title}")
+
+logger.debug(f"### webpageDocNames: {webpageDocNames}")
 
+######################################################
+# Create database webpage and chunks collections.
 wpCollection = createWebpageCollection()
 wpChunkCollection = createChunksCollection()
 
+###########################################################
+# Create document and chunks objects in the database.
+logger.info("#### Create page/doc and chunk db objects.")
 for i, className in enumerate(webpageDocNames):
     title = webpageTitles[i]
-
+    logger.debug(f"## className, title: {className}, {title}")
     # Create Webpage Object
     page_content = page_contentArray[i]
-    #
-
+    # Insert the document.
     wpCollectionObj_uuid = wpCollection.data.insert(
         {
             "name": className,
@@ -225,8 +218,8 @@ for i, className in enumerate(webpageDocNames):
        }
    )
 
+    # Insert the chunks for the document.
    for i2, chunk in enumerate(webpageChunks[i]):
-        #print("#### chunk: ",chunk)
        chunk_uuid = wpChunkCollection.data.insert(
            {
                "title": title,
@@ -238,55 +231,44 @@ for i, className in enumerate(webpageDocNames):
                }
            }
        )
-        #print("### chunk_index,chunk: ",i2,",",chunk[0:20])
 
-
-#text
-#text = "turkey burgers golden fried with lots of mayonaise"
+###############################################################################
+# text contains prompt for vector DB.
 text = "human-made computer cognitive ability"
-#text = "literature authors"
-#text = "artifical intelligence"
 
 
-
+###############################################################################
+# Initial the the sentence transformer and encode the query prompt.
+logger.info(f"#### Encode text query prompt to create vectors. {text}")
+model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')
+
 vector = model.encode(text)
-#print("#### vector: ",vector[0])
 vectorList = []
 
+logger.debug("#### Print vectors.")
 for vec in vector:
     vectorList.append(vec)
-
+logger.debug(f"vectorList: {vectorList[2]}")
 
+# Fetch chunks and print chunks.
+logger.info("#### Retrieve semchunks from db using vectors from prompt.")
 semChunks = wpChunkCollection.query.near_vector(
     near_vector=vectorList,
     distance=0.7,
     limit=3
 )
-
-#print("### semChunks.objects[0]: ",semChunks.objects[0])
+logger.debug(f"### semChunks[0]: {semChunks}")
 
+# Print chunks, corresponding document and document title.
+logger.info("#### Print individual retrieved chunks.")
 for chunk in enumerate(semChunks.objects):
-
-    #webpage_uuid = chunk.properties['references']['webpage']
-    #webpage_uuid = chunk.references.webpage
+    logger.info(f"#### chunk: {chunk}")
     webpage_uuid = chunk[1].properties['references']['webpage']
-
+    logger.info(f"webpage_uuid: {webpage_uuid}")
     wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
-
+    logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
 
+logger.info("#### Closing client db connection.")
+client.close()
 
-
-
-if False:
-    client = weaviate.connect_to_local(
-        #cluster_url="http://localhost:8080"
-    )
-
-    for item in wpCollection.iterator():
-        print(print("\n## webpage collection: ",item.uuid, item.properties))
-
-    for item in wpChunkCollection.iterator():
-        print(print("\n## chunk collection: ",item.uuid, item.properties))
-
-    client.close()
-
+logger.info("#### Program terminating.")
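Note: once startup.sh has brought Weaviate up and semsearch.py has loaded the input documents, the collections it creates ("Documents" and "Chunks") can be spot-checked over Weaviate's REST API. A sketch, assuming the server is reachable on 127.0.0.1:8080 with anonymous access enabled as configured in startup.sh:

# show the schema created by createWebpageCollection()/createChunksCollection()
curl -s http://127.0.0.1:8080/v1/schema
# list a few stored chunk objects
curl -s "http://127.0.0.1:8080/v1/objects?class=Chunks&limit=3"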
startup.sh
CHANGED
@@ -1,20 +1,32 @@
 #! /bin/bash
 
 echo "#### startup.sh entered."
-ls -l /
-ls -l /
-ls -l /
+echo "### ls -l /app"; ls -l /app
+echo "### ls -l /app/weaviate"; ls -l /app/weaviate
+echo "### ls -l /app/text2vec-transformers"; ls -l /app/text2vec-transformers
 
-
-
+################################################
+# Start tex2vec-transformers
+echo "#### Before /app/text2vec-transformers"
+/app/text2vec-transformers/bin/uvicorn app:app --host 0.0.0.0 --port 8081 --log-level warning &
 
+###############################################
+# Start the weaviate vector database server.
 echo "#### Before /app/weaviate"
-
+export AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true \
+       PERSISTENCE_DATA_PATH=/var/lib/weaviate \
+       DEFAULT_VECTORIZER_MODULE=text2vec-transformers \
+       ENABLE_MODULES=text2vec-transformers \
+       TRANSFORMERS_INFERENCE_API=http://127.0.0.1:8081 \
+       LOG_LEVEL=warning
+/app/weaviate/weaviate --host 127.0.0.1 --port 8080 --scheme http &
 
 echo "#### Before sleep."
-sleep
+sleep 60
 
 echo "#### Before /app/semsearch.py"
 python /app/semsearch.py &
 
-wait
+wait
+
+
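Note: the fixed sleep 60 gives Weaviate and the text2vec-transformers service time to start before semsearch.py connects. One possible refinement (a sketch, not part of this commit) is to poll Weaviate's readiness endpoint instead of sleeping for a fixed interval:

# wait until Weaviate reports ready, then continue
until curl -sf http://127.0.0.1:8080/v1/.well-known/ready > /dev/null; do
  echo "#### Waiting for Weaviate ..."
  sleep 2
done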