MVPilgrim committed · Commit 45bc919 · 1 Parent(s): 0d4153f
Got it running.
Files changed:
- Dockerfile +11 -6
- DockerfilePythonWeaviate +59 -0
- DockerfileTestWvT2v +15 -0
- multi-qa-MiniLM-L6-cos-v1 +1 -0
- requirements.txt +13 -12
- requirements_Orig.txt +19 -0
- semsearch.py +70 -88
- startup.sh +20 -8
Dockerfile
CHANGED
@@ -30,6 +30,8 @@ RUN go mod download
 ###############################################################################
 # This image builds the weaviate server
 FROM build_base AS server_builder
+RUN apk add python3.11.5
+
 ARG TARGETARCH
 ARG GITHASH="unknown"
 ARG EXTRA_BUILD_ARGS=""
@@ -40,9 +42,11 @@ RUN CGO_ENABLED=0 GOARCH=$TARGETARCH go build $EXTRA_BUILD_ARGS \
 
 ###############################################################################
 #python environment and app.
-FROM python:3.11.5
+#FROM python:3.11.5
 #ENTRYPOINT ["/app/startup.sh"]
-RUN apt update
+#RUN apt-get update && \
+#    apt-get install -y libc6 && \
+#    rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 
 #RUN ls -l / || ls -l /lib || ls -l /usr || ls -l /usr/lib6 || echo "### An ls failed."
@@ -55,10 +59,11 @@ RUN chmod 755 /app/startup.sh
 COPY --from=weaviate /bin/weaviate /app/weaviate
 COPY --from=weaviate ./modules ./
 
-COPY --from=server_builder /lib/libc.musl-x86_64.so.1 /lib
-
-RUN
-RUN
+#COPY --from=server_builder /lib/libc.musl-x86_64.so.1 /lib
+#COPY /lib/libc.musl-x86_64.so.1 /lib
+#RUN mkdir -p /usr/lib64 y
+#RUN ls -l /usr/lib64
+#RUN ln -s /usr/lib64/libc.so.6 /usr/lib64/libc.musl-x86_64.so.1
 
 RUN mkdir -p /var/lib/weaviate/data y
 RUN chmod -R 777 /var
DockerfilePythonWeaviate
ADDED
@@ -0,0 +1,59 @@
+###############################################################################
+#python environment, main app and startup script.
+FROM python:3.11.5
+#FROM python:3.11.9-slim
+#FROM python:3.11.9-alpine
+#FROM python:3.11-bookworm
+
+ENTRYPOINT ["/app/startup.sh"]
+#RUN apt-get update && \
+#    apt-get install -y libc6 && \
+#    rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+
+#RUN ls -l / || ls -l /lib || ls -l /usr || ls -l /usr/lib6 || echo "### An ls failed."
+
+COPY ./requirements.txt /app/requirements.txt
+COPY ./semsearch.py /app/semsearch.py
+COPY ./startup.sh /app/startup.sh
+RUN chmod 755 /app/startup.sh
+
+COPY ./multi-qa-MiniLM-L6-cos-v1 /app/multi-qa-MiniLM-L6-cos-v1
+
+RUN mkdir -p /app/inputDocs
+COPY ./inputDocs/* /app/inputDocs
+RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
+RUN pip install https://files.pythonhosted.org/packages/13/87/e0cb08c2d4bd7d38ab63816b306c8b1e7cfdc0e59bd54462e8b0df069078/semantic_text_splitter-0.6.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+RUN pip show semantic-text-splitter
+
+##############################################################################
+# Install Weaviate
+WORKDIR /app/weaviate
+RUN wget -qO- https://github.com/weaviate/weaviate/releases/download/v1.24.10/weaviate-v1.24.10-linux-amd64.tar.gz | tar -xzf -
+RUN ls -al /app/weaviate
+
+# Set environment variables for Weaviate
+ENV PATH="/app:/app/weaviate-v1.24.10-linux-x86_64:${PATH}"
+# Expose the Weaviate port
+EXPOSE 8080
+
+##############################################################################
+# Install text2vec-transformers
+WORKDIR /app/text2vec-transformers
+COPY --from=semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 /app /app/text2vec-transformers
+COPY --from=semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1 /usr/local/bin /app/text2vec-transformers/bin
+
+COPY ./multi-qa-MiniLM-L6-cos-v1 /app/app/text2vec-transformers
+
+ENV PATH="/app/text2vec-transformers:/app/text2vec-transformers/bin:${PATH}"
+#RUN pip install -r requirements.txt
+#RUN pip install nltk==3.8.1 optimum==1.13.2 onnxruntime==1.16.1 onnx==1.14.1
+RUN ./custom_prerequisites.py
+
+##############################
+RUN useradd -m -u 1000 user
+
+##############################################################################
+# Start the weaviate vector database, text2vec-transformers and the semantic search app.
+#RUN /app/startup.sh
+CMD ["/app/startup.sh"]
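Note on the model copy above: baking the multi-qa-MiniLM-L6-cos-v1 checkout into the image lets sentence-transformers load the model from a local path at runtime, with no Hugging Face download inside the Space. A minimal sketch of that load (the /app path matches the COPY above; the query string and the printed dimension check are only illustrative):

# Sketch: load the locally copied model and encode one query.
# Assumes the /app/multi-qa-MiniLM-L6-cos-v1 directory copied in DockerfilePythonWeaviate.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("/app/multi-qa-MiniLM-L6-cos-v1")   # local path, no network fetch
vector = model.encode("human-made computer cognitive ability")  # numpy array
print(len(vector))  # MiniLM-L6 models produce 384-dimensional embeddings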
DockerfileTestWvT2v
ADDED
@@ -0,0 +1,15 @@
+# Start with the official Weaviate image
+FROM semitechnologies/weaviate:latest
+
+# Set environment variables
+ENV WEAVIATE_SERVE_MODULES text2vec-transformers
+
+# Install Python and pip via apk, the package manager for Alpine
+RUN apk update && apk add --no-cache python3 py3-pip transformers
+#RUN pip3 install --no-cache-dir transformers
+
+# Expose the default port for Weaviate
+EXPOSE 8080
+
+# Start Weaviate
+CMD ["weaviate", "start"]
multi-qa-MiniLM-L6-cos-v1
ADDED
@@ -0,0 +1 @@
+Subproject commit 38845167a107b59398111f0cfb430897cf1a4639
requirements.txt
CHANGED
@@ -1,15 +1,16 @@
-
-torch
-gradio
-sentencepiece
-protobuf
-weaviate-client==4.5.1
+weaviate-client==4.*
 sentence-transformers
 langchain
 lxml
-
-
-
-
-
-
+beautifulsoup4
+
+transformers==4.34.1
+fastapi==0.103.2
+uvicorn==0.23.2
+nltk==3.8.1
+torch==2.0.1
+sentencepiece==0.1.99
+sentence-transformers==2.2.2
+optimum==1.13.2
+onnxruntime==1.16.1
+onnx==1.14.1
requirements_Orig.txt
ADDED
@@ -0,0 +1,19 @@
+transformers==4.40.1
+torch==2.3.0
+#gradio
+#sentencepiece
+#protobuf
+weaviate-client==4.*
+sentence-transformers
+langchain
+lxml
+#huggingface-hub
+#semantic-text-splitter
+#tokenizers
+#json5
+#regex
+beautifulsoup4
+uvicorn
+fastapi
+optimum==1.16.2
+onnx
semsearch.py
CHANGED
@@ -1,6 +1,5 @@
 import weaviate
-
-#from weaviate.embedded import EmbeddedOptions
+
 from sentence_transformers import SentenceTransformer
 from langchain_community.document_loaders import BSHTMLLoader
 from pathlib import Path
@@ -11,9 +10,19 @@ from tokenizers import Tokenizer
 import json
 import os
 import re
+import logging
+
+weaviate_logger = logging.getLogger("httpx")
+weaviate_logger.setLevel(logging.WARNING)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
 
+#################################################################
+# Create the chunks collection for the Weaviate database.
 def createChunksCollection():
-
+    logger.info("#### createChunksCollection() entered.")
     if client.collections.exists("Chunks"):
         client.collections.delete("Chunks")
 
@@ -62,11 +71,13 @@ def createChunksCollection():
             }
         ]
     }
-
    return(client.collections.create_from_dict(class_obj))
 
+
+#####################################################################
+# Create the document collection for the Weaviate database.
 def createWebpageCollection():
-
+    logger.info("#### createWebpageCollection() entered.")
     if client.collections.exists("Documents"):
         client.collections.delete("Documents")
 
@@ -84,11 +95,6 @@ def createWebpageCollection():
         "distance": "cosine",
       },
       "properties": [
-        #{
-        #  "docname": "fdsa",
-        #  "dataType": ["text"],
-        #  "description": "Name of document"
-        #},
         {
           "name": "title",
           "dataType": ["text"],
@@ -121,61 +127,43 @@ def createWebpageCollection():
             }
         ]
     }
-
    return(client.collections.create_from_dict(class_obj))
 
 
-
+######################################################################
 # MAINLINE
 #
+logger.info("#### MAINLINE ENTERED.")
+
 #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
-pathString = "inputDocs"
+pathString = "/app/inputDocs"
 chunks = []
 webpageDocNames = []
-#webpageChunksClassesNames = []
 page_contentArray = []
 webpageChunks = []
 webpageTitles = []
 webpageChunksDocNames = []
 
-#client = weaviate.WeaviateClient(
-#  embedded_options=EmbeddedOptions(
-#    additional_env_vars={
-#      "ENABLE_MODULES": "backup-filesystem,text2vec-transformers",
-#      "BACKUP_FILESYSTEM_PATH": "/tmp/backups",
-#      "PERSISTENCE_DATA_PATH": "/var/lib/weaviate",
-#      "DEFAULT_VECTORIZER_MODULE": "text2vec-transformers"
-#      #"TRANSFORMERS_INFERENCE_API": "http://huggingface.co/spaces/MVPilgrim/WeaviateDB:8080"
-#
-#    }
-#  )
-#)
-
-#client = weaviate.connect_to_custom(
-#  #http_host="http://huggingface.co/spaces/MVPilgrim/WeaviateDB",
-#  http_host="http://weaviate",
-#  http_port=8080,
-#  http_secure=False,
-#  #grpc_host="huggingface.co",
-#  grpc_host="127.0.0.1",
-#  grpc_port=50051,
-#  grpc_secure=False
-#  #auth_credentials=AuthApiKey(weaviate_key), # `weaviate_key`: your Weaviate API key
-#)
 
-
-
+######################################################
+# Connect to the Weaviate vector database.
+logger.info("#### Create Weaviate db client connection.")
+client = weaviate.connect_to_custom(
+    http_host="127.0.0.1",
+    http_port=8080,
+    http_secure=False,
+    grpc_host="127.0.0.1",
+    grpc_port=50051,
+    grpc_secure=False
 )
-
-#client = weaviate.connect_to_local(
-#  #cluster_url="http://localhost:8080"
-#)
-print("#### client: ",client)
-
 client.connect()
 
+#######################################################
+# Read each text input file, parse it into a document,
+# chunk it, collect chunks and document name.
+logger.info("#### Read and chunk input text files.")
 for filename in os.listdir(pathString):
-
+    logger.info(filename)
    path = Path(pathString + "/" + filename)
    filename = filename.rstrip(".html")
    webpageDocNames.append(filename)
@@ -185,38 +173,43 @@ for filename in os.listdir(pathString):
    title = htmlData[0].metadata['title']
    page_content = htmlData[0].page_content
 
-    #
+    # Clean data. Remove multiple newlines, etc.
    page_content = re.sub(r'\n+', '\n',page_content)
 
    page_contentArray.append(page_content);
    webpageTitles.append(title)
-    #htmlDocument = htmlData[0]
    max_tokens = 1000
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
+    logger.debug(f"### tokenizer: {tokenizer}")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
    chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
 
    chunks = []
    for chnk in chunksOnePage:
-
+        logger.debug(f"#### chnk in file: {chnk}")
        chunks.append(chnk)
-
+    logger.debug(f"chunks: {chunks}")
    webpageChunks.append(chunks)
    webpageChunksDocNames.append(filename + "Chunks")
 
-
-
+    logger.debug(f"### filename, title: {filename}, {title}")
+
+    logger.debug(f"### webpageDocNames: {webpageDocNames}")
 
+######################################################
+# Create database webpage and chunks collections.
 wpCollection = createWebpageCollection()
 wpChunkCollection = createChunksCollection()
 
+###########################################################
+# Create document and chunks objects in the database.
+logger.info("#### Create page/doc and chunk db objects.")
 for i, className in enumerate(webpageDocNames):
    title = webpageTitles[i]
-
+    logger.debug(f"## className, title: {className}, {title}")
    # Create Webpage Object
    page_content = page_contentArray[i]
-    #
-
+    # Insert the document.
    wpCollectionObj_uuid = wpCollection.data.insert(
      {
        "name": className,
@@ -225,8 +218,8 @@ for i, className in enumerate(webpageDocNames):
      }
    )
 
+    # Insert the chunks for the document.
    for i2, chunk in enumerate(webpageChunks[i]):
-        #print("#### chunk: ",chunk)
        chunk_uuid = wpChunkCollection.data.insert(
          {
            "title": title,
@@ -238,55 +231,44 @@ for i, className in enumerate(webpageDocNames):
            }
          }
        )
-        #print("### chunk_index,chunk: ",i2,",",chunk[0:20])
 
-
-#text
-#text = "turkey burgers golden fried with lots of mayonaise"
+###############################################################################
+# text contains prompt for vector DB.
 text = "human-made computer cognitive ability"
-#text = "literature authors"
-#text = "artifical intelligence"
 
 
-
+###############################################################################
+# Initial the the sentence transformer and encode the query prompt.
+logger.info(f"#### Encode text query prompt to create vectors. {text}")
+model = SentenceTransformer('/app/multi-qa-MiniLM-L6-cos-v1')
+
 vector = model.encode(text)
-#print("#### vector: ",vector[0])
 vectorList = []
 
+logger.debug("#### Print vectors.")
 for vec in vector:
    vectorList.append(vec)
-
+logger.debug(f"vectorList: {vectorList[2]}")
 
+# Fetch chunks and print chunks.
+logger.info("#### Retrieve semchunks from db using vectors from prompt.")
 semChunks = wpChunkCollection.query.near_vector(
    near_vector=vectorList,
    distance=0.7,
    limit=3
 )
-
-#print("### semChunks.objects[0]: ",semChunks.objects[0])
+logger.debug(f"### semChunks[0]: {semChunks}")
 
+# Print chunks, corresponding document and document title.
+logger.info("#### Print individual retrieved chunks.")
 for chunk in enumerate(semChunks.objects):
-
-    #webpage_uuid = chunk.properties['references']['webpage']
-    #webpage_uuid = chunk.references.webpage
+    logger.info(f"#### chunk: {chunk}")
    webpage_uuid = chunk[1].properties['references']['webpage']
-
+    logger.info(f"webpage_uuid: {webpage_uuid}")
    wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
-
+    logger.info(f"### wpFromChunk title: {wpFromChunk.properties['title']}")
 
+logger.info("#### Closing client db connection.")
+client.close()
 
-
-
-if False:
-    client = weaviate.connect_to_local(
-        #cluster_url="http://localhost:8080"
-    )
-
-    for item in wpCollection.iterator():
-        print(print("\n## webpage collection: ",item.uuid, item.properties))
-
-    for item in wpChunkCollection.iterator():
-        print(print("\n## chunk collection: ",item.uuid, item.properties))
-
-client.close()
-
+logger.info("#### Program terminating.")
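The retrieval path added above (connect_to_custom, encode the prompt, near_vector query, then follow the chunk's webpage reference) can also be exercised on its own once semsearch.py has populated the database. A minimal sketch, assuming weaviate-client 4.x, a Weaviate instance on 127.0.0.1:8080/50051 as in this commit, and the model path used above; property names beyond those shown in the diff are not assumed:

# Sketch: query the populated Chunks collection the way semsearch.py does.
import weaviate
from sentence_transformers import SentenceTransformer

client = weaviate.connect_to_custom(
    http_host="127.0.0.1", http_port=8080, http_secure=False,
    grpc_host="127.0.0.1", grpc_port=50051, grpc_secure=False,
)
try:
    model = SentenceTransformer("/app/multi-qa-MiniLM-L6-cos-v1")
    query_vector = model.encode("human-made computer cognitive ability").tolist()

    chunks = client.collections.get("Chunks")
    result = chunks.query.near_vector(near_vector=query_vector, distance=0.7, limit=3)
    for obj in result.objects:
        # Print whatever properties the chunk objects carry (title, chunk text, references).
        print(obj.uuid, obj.properties)
finally:
    client.close()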
startup.sh
CHANGED
@@ -1,20 +1,32 @@
 #! /bin/bash
 
 echo "#### startup.sh entered."
-ls -l /
-ls -l /
-ls -l /
+echo "### ls -l /app"; ls -l /app
+echo "### ls -l /app/weaviate"; ls -l /app/weaviate
+echo "### ls -l /app/text2vec-transformers"; ls -l /app/text2vec-transformers
 
-
-
+################################################
+# Start tex2vec-transformers
+echo "#### Before /app/text2vec-transformers"
+/app/text2vec-transformers/bin/uvicorn app:app --host 0.0.0.0 --port 8081 --log-level warning &
 
+###############################################
+# Start the weaviate vector database server.
 echo "#### Before /app/weaviate"
-
+export AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true \
+       PERSISTENCE_DATA_PATH=/var/lib/weaviate \
+       DEFAULT_VECTORIZER_MODULE=text2vec-transformers \
+       ENABLE_MODULES=text2vec-transformers \
+       TRANSFORMERS_INFERENCE_API=http://127.0.0.1:8081 \
+       LOG_LEVEL=warning
+/app/weaviate/weaviate --host 127.0.0.1 --port 8080 --scheme http &
 
 echo "#### Before sleep."
-sleep
+sleep 60
 
 echo "#### Before /app/semsearch.py"
 python /app/semsearch.py &
 
-wait
+wait
+
+
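The fixed "sleep 60" above gives Weaviate and the text2vec-transformers service time to come up before semsearch.py connects. A readiness poll is a common alternative; this is only a hedged sketch (Weaviate exposes a /v1/.well-known/ready endpoint; the host, port and 120-second cap mirror this commit's startup.sh but are otherwise arbitrary):

# Sketch: poll Weaviate's readiness endpoint instead of sleeping a fixed time.
import time
import urllib.request
import urllib.error

def wait_for_weaviate(url="http://127.0.0.1:8080/v1/.well-known/ready", timeout_s=120):
    """Return True once Weaviate answers 200 on its readiness endpoint, False on timeout."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            pass  # server not up yet; retry
        time.sleep(2)
    return False

if __name__ == "__main__":
    print("weaviate ready:", wait_for_weaviate())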