Spaces:
Running
Running
import weaviate | |
#import weaviate.classes as wvc | |
#from weaviate.embedded import EmbeddedOptions | |
from sentence_transformers import SentenceTransformer | |
from langchain_community.document_loaders import BSHTMLLoader | |
from pathlib import Path | |
from lxml import html | |
import logging | |
from semantic_text_splitter import HuggingFaceTextSplitter | |
from tokenizers import Tokenizer | |
import json | |
import os | |
import re | |
def createChunksCollection():
    """(Re)create the "Chunks" collection and return its handle.

    An existing "Chunks" collection is deleted first, so every call starts
    from an empty collection.  Relies on the module-level ``client``.

    NOTE(review): the property-level "vectorizer" and "invertedIndexConfig"
    keys, and "tokenization" nested under "moduleConfig", may not be where
    Weaviate expects them and could be ignored -- confirm against the schema
    reference before relying on them.
    """
    print("#### createChunksCollection() entered.")

    collections = client.collections
    if collections.exists("Chunks"):
        collections.delete("Chunks")

    # Text of a single chunk.
    chunk_property = {
        "name": "chunk",
        "dataType": ["text"],
        "description": "Single webpage chunk.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": False,
                "skip": False,
                "tokenization": "lowercase",
            },
        },
    }

    # Position of the chunk within its page.
    chunk_index_property = {
        "name": "chunk_index",
        "dataType": ["int"],
    }

    # The data type names the "Documents" collection.
    webpage_property = {
        "name": "webpage",
        "dataType": ["Documents"],
        "description": "Webpage content chunks.",
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2,
            },
        },
    }

    schema = {
        "class": "Chunks",
        "description": "Collection for document chunks.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizeClassName": True,
            },
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [
            chunk_property,
            chunk_index_property,
            webpage_property,
        ],
    }
    return collections.create_from_dict(schema)
def createWebpageCollection():
    """(Re)create the "Documents" collection and return its handle.

    An existing "Documents" collection is deleted first, so every call starts
    from an empty collection.  Relies on the module-level ``client``.

    NOTE(review): "tokenization" nested under "moduleConfig" and the
    property-level "vectorizer" / "invertedIndexConfig" keys may not be where
    Weaviate expects them and could be ignored -- confirm against the schema
    reference before relying on them.
    """
    print("#### createWebpageCollection() entered.")

    collections = client.collections
    if collections.exists("Documents"):
        collections.delete("Documents")

    # Title of the HTML document.
    title_property = {
        "name": "title",
        "dataType": ["text"],
        "description": "HTML doc title.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": True,
                "skip": False,
                "tokenization": "lowercase",
            },
        },
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2,
            },
        },
    }

    # Full text content of the page.
    content_property = {
        "name": "content",
        "dataType": ["text"],
        "description": "HTML page content.",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizePropertyName": True,
                "tokenization": "whitespace",
            },
        },
    }

    schema = {
        "class": "Documents",
        "description": "For first attempt at loading a Weviate database.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizeClassName": False,
            },
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        "properties": [
            title_property,
            content_property,
        ],
    }
    return collections.create_from_dict(schema)
#
# MAINLINE
#
# Directory whose entries are loaded as HTML pages.
pathString = "inputDocs"

# Per-page accumulators filled by the loading loop; the lists are appended to
# in lockstep, so index i refers to the same page in each of them.
chunks = []
webpageDocNames = []
page_contentArray = []
webpageChunks = []
webpageTitles = []
webpageChunksDocNames = []

# The rest of this script uses the v4 client API (client.collections,
# collection.data.insert, collection.query.near_vector), which the v3-style
# weaviate.Client(url=...) constructor does not provide.  connect_to_local()
# returns an already-connected v4 client for a Weaviate instance on
# localhost (HTTP port 8080 by default), so no separate connect() is needed.
# NOTE(review): the v4 client also talks gRPC (default port 50051) -- confirm
# that port is exposed by the local instance.
# Alternatives previously tried here: an embedded instance via
# weaviate.WeaviateClient(embedded_options=EmbeddedOptions(...)) and
# weaviate.connect_to_custom(...) against an explicit host.
client = weaviate.connect_to_local()
print("#### client: ",client)
# Load every entry of pathString as an HTML page, clean its text and split it
# into chunks.  Results are appended to the module-level accumulator lists.

# NOTE(review): max_tokens is not used by the visible code; the splitter is
# driven by chunk_capacity below.
max_tokens = 1000

# The tokenizer and splitter do not depend on the file, so build them once
# instead of once per page.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

htmlSuffix = ".html"

for filename in os.listdir(pathString):
    print(filename)
    path = Path(pathString) / filename

    # Drop the ".html" extension.  str.rstrip(".html") must not be used for
    # this: it strips any trailing run of the characters '.', 'h', 't', 'm',
    # 'l', so e.g. "math.html" would become "ma".
    if filename.endswith(htmlSuffix):
        filename = filename[:-len(htmlSuffix)]
    webpageDocNames.append(filename)

    htmlLoader = BSHTMLLoader(path,"utf-8")
    htmlData = htmlLoader.load()
    title = htmlData[0].metadata['title']
    page_content = htmlData[0].page_content

    # Clean data. Collapse runs of newlines into a single newline.
    page_content = re.sub(r'\n+', '\n',page_content)
    page_contentArray.append(page_content)
    webpageTitles.append(title)

    # Split the cleaned page text into chunks.
    chunks = list(splitter.chunks(page_content, chunk_capacity=50))
    webpageChunks.append(chunks)
    webpageChunksDocNames.append(filename + "Chunks")

    print("### filename, title: ",filename,",",title)

print("### webpageDocNames: ",webpageDocNames)
# Recreate both collections ("Documents" first, then "Chunks"), then store
# one "Documents" object per page and one "Chunks" object per chunk.
wpCollection = createWebpageCollection()
wpChunkCollection = createChunksCollection()

# The four lists are filled in lockstep by the loading loop, so they can be
# walked together.
pages = zip(webpageDocNames, webpageTitles, page_contentArray, webpageChunks)
for className, title, page_content, pageChunks in pages:
    print("## className, title: ",className,",",title)

    # Create Webpage Object
    webpageProperties = {
        "name": className,
        "title": title,
        "content": page_content,
    }
    wpCollectionObj_uuid = wpCollection.data.insert(webpageProperties)

    for i2, chunk in enumerate(pageChunks):
        # The owning page's UUID is stored under "references" -> "webpage"
        # inside the properties dict; the query code reads it back from
        # properties['references']['webpage'].
        # NOTE(review): this is passed as a plain property, not via the
        # client's dedicated cross-reference argument -- confirm intended.
        chunkProperties = {
            "title": title,
            "chunk": chunk,
            "chunk_index": i2,
            "references": {
                "webpage": wpCollectionObj_uuid,
            },
        }
        chunk_uuid = wpChunkCollection.data.insert(chunkProperties)
# Semantic search: embed a query string locally, fetch the nearest chunks and
# look up the page each chunk belongs to.
#
# Other queries that were tried:
#   "List the main capabilities of artificial intelligence."
#   "List three of the greatest Norwegian authors."
#   "turkey burgers golden fried with lots of mayonaise"
#   "literature authors"
#   "artifical intelligence"
text = "human-made computer cognitive ability"

# NOTE(review): the query vector comes from this local model while stored
# objects are vectorized by the server-side text2vec-transformers module --
# presumably both use the same model; confirm, otherwise distances are
# meaningless.
model = SentenceTransformer('../multi-qa-MiniLM-L6-cos-v1')
vector = model.encode(text)

# Convert the embedding to a plain list of Python floats for the client.
vectorList = vector.tolist()
print("vectorList: ",vectorList[2])

semChunks = wpChunkCollection.query.near_vector(
    near_vector=vectorList,
    distance=0.7,
    limit=3,
)
print("### semChunks[0]: ",semChunks)

for chunk in semChunks.objects:
    print("\n\n#### chunk: ",chunk)
    # The owning page's UUID was stored under references -> webpage when the
    # chunk was inserted.
    webpage_uuid = chunk.properties['references']['webpage']
    print("\nwebpage_uuid: ",webpage_uuid)
    wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
    print("\n\n### wpFromChunk title: ",wpFromChunk.properties['title'])
# Optional debugging dump of every stored object; disabled by default.
dumpCollections = False
if dumpCollections:
    # Reuse the existing connection: opening a second client here would leak
    # the first one, and wpCollection / wpChunkCollection were obtained from
    # the original client anyway.
    for item in wpCollection.iterator():
        print("\n## webpage collection: ",item.uuid, item.properties)

    for item in wpChunkCollection.iterator():
        print("\n## chunk collection: ",item.uuid, item.properties)

# Release the connection to Weaviate.
client.close()