MVPilgrim committed · Commit 45bc919 · 1 Parent(s): 0d4153f
Got it running.
Files changed:
- Dockerfile +11 -6
- DockerfilePythonWeaviate +59 -0
- DockerfileTestWvT2v +15 -0
- multi-qa-MiniLM-L6-cos-v1 +1 -0
- requirements.txt +13 -12
- requirements_Orig.txt +19 -0
- semsearch.py +70 -88
- startup.sh +20 -8
Dockerfile
CHANGED
@@ -30,6 +30,8 @@ RUN go mod download
 ###############################################################################
 # This image builds the weaviate server
 FROM build_base AS server_builder
+RUN apk add python3.11.5
+
 ARG TARGETARCH
 ARG GITHASH="unknown"
 ARG EXTRA_BUILD_ARGS=""
@@ -40,9 +42,11 @@ RUN CGO_ENABLED=0 GOARCH=$TARGETARCH go build $EXTRA_BUILD_ARGS \
 
 ###############################################################################
 #python environment and app.
-FROM python:3.11.5
+#FROM python:3.11.5
 #ENTRYPOINT ["/app/startup.sh"]
-RUN apt update
+#RUN apt-get update && \
+#    apt-get install -y libc6 && \
+#    rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 
 #RUN ls -l / || ls -l /lib || ls -l /usr || ls -l /usr/lib6 || echo "### An ls failed."
@@ -55,10 +59,11 @@ RUN chmod 755 /app/startup.sh
 COPY --from=weaviate /bin/weaviate /app/weaviate
 COPY --from=weaviate ./modules ./
 
-COPY --from=server_builder /lib/libc.musl-x86_64.so.1 /lib
-
-RUN
-RUN
+#COPY --from=server_builder /lib/libc.musl-x86_64.so.1 /lib
+#COPY /lib/libc.musl-x86_64.so.1 /lib
+#RUN mkdir -p /usr/lib64 y
+#RUN ls -l /usr/lib64
+#RUN ln -s /usr/lib64/libc.so.6 /usr/lib64/libc.musl-x86_64.so.1
 
 RUN mkdir -p /var/lib/weaviate/data y
 RUN chmod -R 777 /var
DockerfilePythonWeaviate
ADDED
@@ -0,0 +1,59 @@
+###############################################################################
+#python environment, main app and startup script.
+FROM python:3.11.5
+#FROM python:3.11.9-slim
+#FROM python:3.11.9-alpine
+#FROM python:3.11-bookworm
+
+ENTRYPOINT ["/app/startup.sh"]
+#RUN apt-get update && \
+#    apt-get install -y libc6 && \
+#    rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+
+#RUN ls -l / || ls -l /lib || ls -l /usr || ls -l /usr/lib6 || echo "### An ls failed."
+
+COPY ./requirements.txt /app/requirements.txt
+COPY ./semsearch.py /app/semsearch.py
+COPY ./startup.sh /app/startup.sh
+RUN chmod 755 /app/startup.sh
+
+COPY ./multi-qa-MiniLM-L6-cos-v1 /app/multi-qa-MiniLM-L6-cos-v1
+
+RUN mkdir -p /app/inputDocs
+COPY ./inputDocs/* /app/inputDocs
+RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
+RUN pip install https://files.pythonhosted.org/packages/13/87/e0cb08c2d4bd7d38ab63816b306c8b1e7cfdc0e59bd54462e8b0df069078/semantic_text_splitter-0.6.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+RUN pip show semantic-text-splitter
+
+##############################################################################
+# Install Weaviate
+WORKDIR /app/weaviate
+RUN wget -qO- https://github.com/weaviate/weaviate/releases/download/v1.24.10/weaviate-v1.24.10-linux-amd64.tar.gz | tar -xzf -
+RUN ls -al /app/weaviate
+
+# Set environment variables for Weaviate
+ENV PATH="/app:/app/weaviate-v1.24.10-linux-x86_64:${PATH}"
+# Expose the Weaviate port
+EXPOSE 8080
+
+##############################################################################
+# Install text2vec-transformers
+WORKDIR /app/text2vec-transformers
+COPY --from=semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 /app /app/text2vec-transformers
+COPY --from=semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 /usr/local/bin /app/text2vec-transformers/bin
+
+COPY ./multi-qa-MiniLM-L6-cos-v1 /app/app/text2vec-transformers
+
+ENV PATH="/app/text2vec-transformers:/app/text2vec-transformers/bin:${PATH}"
+#RUN pip install -r requirements.txt
+#RUN pip install nltk==3.8.1 optimum==1.13.2 onnxruntime==1.16.1 onnx==1.14.1
+RUN ./custom_prerequisites.py
+
+##############################
+RUN useradd -m -u 1000 user
+
+##############################################################################
+# Start the weaviate vector database, text2vec-transformers and the semantic search app.
+#RUN /app/startup.sh
+CMD ["/app/startup.sh"]
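Note on the model copy above: baking the multi-qa-MiniLM-L6-cos-v1 checkout into the image lets sentence-transformers load the model from a local path at runtime, with no Hugging Face download inside the Space. A minimal sketch of that load (the /app path matches the COPY above; the query string and the printed dimension check are only illustrative):

# Sketch: load the locally copied model and encode one query.
# Assumes the /app/multi-qa-MiniLM-L6-cos-v1 directory copied in DockerfilePythonWeaviate.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("/app/multi-qa-MiniLM-L6-cos-v1")   # local path, no network fetch
vector = model.encode("human-made computer cognitive ability")  # numpy array
print(len(vector))  # MiniLM-L6 models produce 384-dimensional embeddings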
DockerfileTestWvT2v
ADDED
@@ -0,0 +1,15 @@
+# Start with the official Weaviate image
+FROM semitechnologies/weaviate:latest
+
+# Set environment variables
+ENV WEAVIATE_SERVE_MODULES text2vec-transformers
+
+# Install Python and pip via apk, the package manager for Alpine
+RUN apk update && apk add --no-cache python3 py3-pip transformers
+#RUN pip3 install --no-cache-dir transformers
+
+# Expose the default port for Weaviate
+EXPOSE 8080
+
+# Start Weaviate
+CMD ["weaviate", "start"]
multi-qa-MiniLM-L6-cos-v1
ADDED
@@ -0,0 +1 @@
+Subproject commit 38845167a107b59398111f0cfb430897cf1a4639
requirements.txt
CHANGED
@@ -1,15 +1,16 @@
-
-torch
-gradio
-sentencepiece
-protobuf
-weaviate-client==4.5.1
+weaviate-client==4.*
 sentence-transformers
 langchain
 lxml
-
-
-
-
-
-
+beautifulsoup4
+
+transformers==4.34.1
+fastapi==0.103.2
+uvicorn==0.23.2
+nltk==3.8.1
+torch==2.0.1
+sentencepiece==0.1.99
+sentence-transformers==2.2.2
+optimum==1.13.2
+onnxruntime==1.16.1
+onnx==1.14.1
requirements_Orig.txt
ADDED
@@ -0,0 +1,19 @@
+transformers==4.40.1
+torch==2.3.0
+#gradio
+#sentencepiece
+#protobuf
+weaviate-client==4.*
+sentence-transformers
+langchain
+lxml
+#huggingface-hub
+#semantic-text-splitter
+#tokenizers
+#json5
+#regex
+beautifulsoup4
+uvicorn
+fastapi
+optimum==1.16.2
+onnx
semsearch.py
CHANGED
@@ -1,6 +1,5 @@
 import weaviate
-
-#from weaviate.embedded import EmbeddedOptions
+
 from sentence_transformers import SentenceTransformer
 from langchain_community.document_loaders import BSHTMLLoader
 from pathlib import Path
@@ -11,9 +10,19 @@ from tokenizers import Tokenizer
 import json
 import os
 import re
+import logging
+
+weaviate_logger = logging.getLogger("httpx")
+weaviate_logger.setLevel(logging.WARNING)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
 
+#################################################################
+# Create the chunks collection for the Weaviate database.
 def createChunksCollection():
-
+    logger.info("#### createChunksCollection() entered.")
     if client.collections.exists("Chunks"):
         client.collections.delete("Chunks")
 
@@ -62,11 +71,13 @@ def createChunksCollection():
             }
         ]
     }
-
    return(client.collections.create_from_dict(class_obj))
 
+
+#####################################################################
+# Create the document collection for the Weaviate database.
 def createWebpageCollection():
-
+    logger.info("#### createWebpageCollection() entered.")
     if client.collections.exists("Documents"):
         client.collections.delete("Documents")
 
@@ -84,11 +95,6 @@ def createWebpageCollection():
         "distance": "cosine",
       },
       "properties": [
-        #{
-        #  "docname": "fdsa",
-        #  "dataType": ["text"],
-        #  "description": "Name of document"
-        #},
         {
           "name": "title",
           "dataType": ["text"],
@@ -121,61 +127,43 @@ def createWebpageCollection():
             }
         ]
     }
-
    return(client.collections.create_from_dict(class_obj))
 
 
-
+######################################################################
 # MAINLINE
 #
+logger.info("#### MAINLINE ENTERED.")
+
 #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
-pathString = "inputDocs"
+pathString = "/app/inputDocs"
 chunks = []
 webpageDocNames = []
-#webpageChunksClassesNames = []
 page_contentArray = []
 webpageChunks = []
 webpageTitles = []
 webpageChunksDocNames = []
 
-#client = weaviate.WeaviateClient(
-#  embedded_options=EmbeddedOptions(
-#    additional_env_vars={
-#      "ENABLE_MODULES": "backup-filesystem,text2vec-transformers",
-#      "BACKUP_FILESYSTEM_PATH": "/tmp/backups",
-#      "PERSISTENCE_DATA_PATH": "/var/lib/weaviate",
-#      "DEFAULT_VECTORIZER_MODULE": "text2vec-transformers"
-#      #"TRANSFORMERS_INFERENCE_API": "http://huggingface.co/spaces/MVPilgrim/WeaviateDB:8080"
-#
-#    }
-#  )
-#)
-
-#client = weaviate.connect_to_custom(
-#  #http_host="http://huggingface.co/spaces/MVPilgrim/WeaviateDB",
-#  http_host="http://weaviate",
-#  http_port=8080,
-#  http_secure=False,
-#  #grpc_host="huggingface.co",
-#  grpc_host="127.0.0.1",
-#  grpc_port=50051,
-#  grpc_secure=False
-#  #auth_credentials=AuthApiKey(weaviate_key), # `weaviate_key`: your Weaviate API key
-#)
 
-
-
+######################################################
+# Connect to the Weaviate vector database.
+logger.info("#### Create Weaviate db client connection.")
+client = weaviate.connect_to_custom(
+    http_host="127.0.0.1",
+    http_port=8080,
+    http_secure=False,
+    grpc_host="127.0.0.1",
+    grpc_port=50051,
+    grpc_secure=False
 )
-
-#client = weaviate.connect_to_local(
-#  #cluster_url="http://localhost:8080"
-#)
-print("#### client: ",client)
-
 client.connect()
 
+#######################################################
+# Read each text input file, parse it into a document,
+# chunk it, collect chunks and document name.
+logger.info("#### Read and chunk input text files.")
 for filename in os.listdir(pathString):
-
+    logger.info(filename)
    path = Path(pathString + "/" + filename)
    filename = filename.rstrip(".html")
    webpageDocNames.append(filename)
@@ -185,38 +173,43 @@ for filename in os.listdir(pathString):
    title = htmlData[0].metadata['title']
    page_content = htmlData[0].page_content
 
-    #
+    # Clean data. Remove multiple newlines, etc.
    page_content = re.sub(r'\n+', '\n',page_content)
 
    page_contentArray.append(page_content);
    webpageTitles.append(title)
-    #htmlDocument = htmlData[0]
    max_tokens = 1000
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
+    logger.debug(f"### tokenizer: {tokenizer}")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
    chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
 
    chunks = []
    for chnk in chunksOnePage:
-
+        logger.debug(f"#### chnk in file: {chnk}")
        chunks.append(chnk)
-
+    logger.debug(f"chunks: {chunks}")
    webpageChunks.append(chunks)
    webpageChunksDocNames.append(filename + "Chunks")
 
-
-
+    logger.debug(f"### filename, title: {filename}, {title}")
+
+    logger.debug(f"### webpageDocNames: {webpageDocNames}")
 
+######################################################
+# Create database webpage and chunks collections.
 wpCollection = createWebpageCollection()
 wpChunkCollection = createChunksCollection()
 
+###########################################################
+# Create document and chunks objects in the database.
+logger.info("#### Create page/doc and chunk db objects.")
 for i, className in enumerate(webpageDocNames):
    title = webpageTitles[i]
-
+    logger.debug(f"## className, title: {className}, {title}")
    # Create Webpage Object
    page_content = page_contentArray[i]
-    #
-
+    # Insert the document.
    wpCollectionObj_uuid = wpCollection.data.insert(
      {
        "name": className,
@@ -225,8 +218,8 @@ for i, className in enumerate(webpageDocNames):
      }
    )
 
+    # Insert the chunks for the document.
    for i2, chunk in enumerate(webpageChunks[i]):
-        #print("#### chunk: ",chunk)
        chunk_uuid = wpChunkCollection.data.insert(
          {
            "title": title,
@@ -238,55 +231,44 @@ for i, className in enumerate(webpageDocNames):
            }
          }
        )
-        #print("### chunk_index,chunk: ",i2,",",chunk[0:20])
 
-
-#text
-#text = "turkey burgers golden fried with lots of mayonaise"
+###############################################################################
+# text contains prompt for vector DB.
 text = "human-made computer cognitive ability"
-#text = "literature authors"
-#text = "artifical intelligence"
 
 
-
+###############################################################################
+# Initial the the sentence transformer and encode the query prompt.
+logger.info(f"#### Encode text query prompt to create vectors. {text}")
+model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')
+
 vector = model.encode(text)
-#print("#### vector: ",vector[0])
 vectorList = []
 
+logger.debug("#### Print vectors.")
 for vec in vector:
    vectorList.append(vec)
-
+logger.debug(f"vectorList: {vectorList[2]}")
 
+# Fetch chunks and print chunks.
+logger.info("#### Retrieve semchunks from db using vectors from prompt.")
 semChunks = wpChunkCollection.query.near_vector(
    near_vector=vectorList,
    distance=0.7,
    limit=3
 )
-
-#print("### semChunks.objects[0]: ",semChunks.objects[0])
+logger.debug(f"### semChunks[0]: {semChunks}")
 
+# Print chunks, corresponding document and document title.
+logger.info("#### Print individual retrieved chunks.")
 for chunk in enumerate(semChunks.objects):
-
-    #webpage_uuid = chunk.properties['references']['webpage']
-    #webpage_uuid = chunk.references.webpage
+    logger.info(f"#### chunk: {chunk}")
    webpage_uuid = chunk[1].properties['references']['webpage']
-
+    logger.info(f"webpage_uuid: {webpage_uuid}")
    wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
-
+    logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
 
+logger.info("#### Closing client db connection.")
+client.close()
 
-
-
-if False:
-    client = weaviate.connect_to_local(
-        #cluster_url="http://localhost:8080"
-    )
-
-    for item in wpCollection.iterator():
-        print(print("\n## webpage collection: ",item.uuid, item.properties))
-
-    for item in wpChunkCollection.iterator():
-        print(print("\n## chunk collection: ",item.uuid, item.properties))
-
-client.close()
-
+logger.info("#### Program terminating.")
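The retrieval path added above (connect_to_custom, encode the prompt, near_vector query, then follow the chunk's webpage reference) can also be exercised on its own once semsearch.py has populated the database. A minimal sketch, assuming weaviate-client 4.x, a Weaviate instance on 127.0.0.1:8080/50051 as in this commit, and the model path used above; property names beyond those shown in the diff are not assumed:

# Sketch: query the populated Chunks collection the way semsearch.py does.
import weaviate
from sentence_transformers import SentenceTransformer

client = weaviate.connect_to_custom(
    http_host="127.0.0.1", http_port=8080, http_secure=False,
    grpc_host="127.0.0.1", grpc_port=50051, grpc_secure=False,
)
try:
    model = SentenceTransformer("/app/multi-qa-MiniLM-L6-cos-v1")
    query_vector = model.encode("human-made computer cognitive ability").tolist()

    chunks = client.collections.get("Chunks")
    result = chunks.query.near_vector(near_vector=query_vector, distance=0.7, limit=3)
    for obj in result.objects:
        # Print whatever properties the chunk objects carry (title, chunk text, references).
        print(obj.uuid, obj.properties)
finally:
    client.close()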
startup.sh
CHANGED
@@ -1,20 +1,32 @@
 #! /bin/bash
 
 echo "#### startup.sh entered."
-ls -l /
-ls -l /
-ls -l /
+echo "### ls -l /app"; ls -l /app
+echo "### ls -l /app/weaviate"; ls -l /app/weaviate
+echo "### ls -l /app/text2vec-transformers"; ls -l /app/text2vec-transformers
 
-
-
+################################################
+# Start tex2vec-transformers
+echo "#### Before /app/text2vec-transformers"
+/app/text2vec-transformers/bin/uvicorn app:app --host 0.0.0.0 --port 8081 --log-level warning &
 
+###############################################
+# Start the weaviate vector database server.
 echo "#### Before /app/weaviate"
-
+export AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true \
+       PERSISTENCE_DATA_PATH=/var/lib/weaviate \
+       DEFAULT_VECTORIZER_MODULE=text2vec-transformers \
+       ENABLE_MODULES=text2vec-transformers \
+       TRANSFORMERS_INFERENCE_API=http://127.0.0.1:8081 \
+       LOG_LEVEL=warning
+/app/weaviate/weaviate --host 127.0.0.1 --port 8080 --scheme http &
 
 echo "#### Before sleep."
-sleep
+sleep 60
 
 echo "#### Before /app/semsearch.py"
 python /app/semsearch.py &
 
-wait
+wait
+
+
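The fixed "sleep 60" above gives Weaviate and the text2vec-transformers service time to come up before semsearch.py connects. A readiness poll is a common alternative; this is only a hedged sketch (Weaviate exposes a /v1/.well-known/ready endpoint; the host, port and 120-second cap mirror this commit's startup.sh but are otherwise arbitrary):

# Sketch: poll Weaviate's readiness endpoint instead of sleeping a fixed time.
import time
import urllib.request
import urllib.error

def wait_for_weaviate(url="http://127.0.0.1:8080/v1/.well-known/ready", timeout_s=120):
    """Return True once Weaviate answers 200 on its readiness endpoint, False on timeout."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            pass  # server not up yet; retry
        time.sleep(2)
    return False

if __name__ == "__main__":
    print("weaviate ready:", wait_for_weaviate())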