import weaviate==1.23.7 #import weaviate.classes as wvc #from weaviate.embedded import EmbeddedOptions from sentence_transformers import SentenceTransformer from langchain_community.document_loaders import BSHTMLLoader from pathlib import Path from lxml import html import logging from semantic_text_splitter import HuggingFaceTextSplitter from tokenizers import Tokenizer import json import os import re def createChunksCollection(): print("#### createChunksCollection() entered.") if client.collections.exists("Chunks"): client.collections.delete("Chunks") class_obj = { "class": "Chunks", "description": "Collection for document chunks.", "vectorizer": "text2vec-transformers", "moduleConfig": { "text2vec-transformers": { "vectorizeClassName": True } }, "vectorIndexType": "hnsw", "vectorIndexConfig": { "distance": "cosine", }, "properties": [ { "name": "chunk", "dataType": ["text"], "description": "Single webpage chunk.", "vectorizer": "text2vec-transformers", "moduleConfig": { "text2vec-transformers": { "vectorizePropertyName": False, "skip": False, "tokenization": "lowercase" } } }, { "name": "chunk_index", "dataType": ["int"] }, { "name": "webpage", "dataType": ["Documents"], "description": "Webpage content chunks.", "invertedIndexConfig": { "bm25": { "b": 0.75, "k1": 1.2 } } } ] } return(client.collections.create_from_dict(class_obj)) def createWebpageCollection(): print("#### createWebpageCollection() entered.") if client.collections.exists("Documents"): client.collections.delete("Documents") class_obj = { "class": "Documents", "description": "For first attempt at loading a Weviate database.", "vectorizer": "text2vec-transformers", "moduleConfig": { "text2vec-transformers": { "vectorizeClassName": False } }, "vectorIndexType": "hnsw", "vectorIndexConfig": { "distance": "cosine", }, "properties": [ #{ # "docname": "fdsa", # "dataType": ["text"], # "description": "Name of document" #}, { "name": "title", "dataType": ["text"], "description": "HTML doc title.", "vectorizer": "text2vec-transformers", "moduleConfig": { "text2vec-transformers": { "vectorizePropertyName": True, "skip": False, "tokenization": "lowercase" } }, "invertedIndexConfig": { "bm25": { "b": 0.75, "k1": 1.2 }, } }, { "name": "content", "dataType": ["text"], "description": "HTML page content.", "moduleConfig": { "text2vec-transformers": { "vectorizePropertyName": True, "tokenization": "whitespace" } } } ] } return(client.collections.create_from_dict(class_obj)) # # MAINLINE # #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML" pathString = "inputDocs" chunks = [] webpageDocNames = [] #webpageChunksClassesNames = [] page_contentArray = [] webpageChunks = [] webpageTitles = [] webpageChunksDocNames = [] #client = weaviate.WeaviateClient( # embedded_options=EmbeddedOptions( # additional_env_vars={ # "ENABLE_MODULES": "backup-filesystem,text2vec-transformers", # "BACKUP_FILESYSTEM_PATH": "/tmp/backups", # "PERSISTENCE_DATA_PATH": "/var/lib/weaviate", # "DEFAULT_VECTORIZER_MODULE": "text2vec-transformers" # #"TRANSFORMERS_INFERENCE_API": "http://huggingface.co/spaces/MVPilgrim/WeaviateDB:8080" # # } # ) #) #client = weaviate.connect_to_custom( # #http_host="http://huggingface.co/spaces/MVPilgrim/WeaviateDB", # http_host="http://weaviate", # http_port=8080, # http_secure=False, # #grpc_host="huggingface.co", # grpc_host="127.0.0.1", # grpc_port=50051, # grpc_secure=False # #auth_credentials=AuthApiKey(weaviate_key), # `weaviate_key`: your Weaviate API key #) #client = weaviate.Client( # url="http://localhost:8080" #) client = weaviate.connect_to_local( #cluster_url="http://localhost:8080" ) print("#### client: ",client) client.connect() for filename in os.listdir(pathString): print(filename) path = Path(pathString + "/" + filename) filename = filename.rstrip(".html") webpageDocNames.append(filename) htmlLoader = BSHTMLLoader(path,"utf-8") htmlData = htmlLoader.load() title = htmlData[0].metadata['title'] page_content = htmlData[0].page_content # Clean data. Remove multiple newlines, etc. page_content = re.sub(r'\n+', '\n',page_content) page_contentArray.append(page_content); webpageTitles.append(title) #htmlDocument = htmlData[0] max_tokens = 1000 tokenizer = Tokenizer.from_pretrained("bert-base-uncased") splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True) chunksOnePage = splitter.chunks(page_content, chunk_capacity=50) chunks = [] for chnk in chunksOnePage: #print("\n\n#### chnk: ",chnk) chunks.append(chnk) #print("chunks: ",chunks) webpageChunks.append(chunks) webpageChunksDocNames.append(filename + "Chunks") print("### filename, title: ",filename,",",title) print("### webpageDocNames: ",webpageDocNames) wpCollection = createWebpageCollection() wpChunkCollection = createChunksCollection() for i, className in enumerate(webpageDocNames): title = webpageTitles[i] print("## className, title: ",className,",",title) # Create Webpage Object page_content = page_contentArray[i] #print("\n#### page_content: ",page_content) wpCollectionObj_uuid = wpCollection.data.insert( { "name": className, "title": title, "content": page_content } ) for i2, chunk in enumerate(webpageChunks[i]): #print("#### chunk: ",chunk) chunk_uuid = wpChunkCollection.data.insert( { "title": title, "chunk": chunk, "chunk_index": i2, "references": { "webpage": wpCollectionObj_uuid } } ) #print("### chunk_index,chunk: ",i2,",",chunk[0:20]) #text = "List the main capabilities of artificial intelligence." #text = "List three of the greatest Norwegian authors." #text = "turkey burgers golden fried with lots of mayonaise" text = "human-made computer cognitive ability" #text = "literature authors" #text = "artifical intelligence" model = SentenceTransformer('../multi-qa-MiniLM-L6-cos-v1') vector = model.encode(text) #print("#### vector: ",vector[0]) vectorList = [] for vec in vector: vectorList.append(vec) print("vectorList: ",vectorList[2]) semChunks = wpChunkCollection.query.near_vector( near_vector=vectorList, distance=0.7, limit=3 ) print("### semChunks[0]: ",semChunks) #print("### semChunks.objects[0]: ",semChunks.objects[0]) for chunk in enumerate(semChunks.objects): print("\n\n#### chunk: ",chunk) #webpage_uuid = chunk.properties['references']['webpage'] #webpage_uuid = chunk.references.webpage webpage_uuid = chunk[1].properties['references']['webpage'] print("\nwebpage_uuid: ",webpage_uuid) wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid) print("\n\n### wpFromChunk title: ",wpFromChunk.properties['title']) #print("response: ",response) if False: client = weaviate.connect_to_local( #cluster_url="http://localhost:8080" ) for item in wpCollection.iterator(): print(print("\n## webpage collection: ",item.uuid, item.properties)) for item in wpChunkCollection.iterator(): print(print("\n## chunk collection: ",item.uuid, item.properties)) client.close()