Spaces:
Running
Running
Create semsearch.py
Browse files- semsearch.py +284 -0
semsearch.py
ADDED
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import weaviate
|
2 |
+
import weaviate.classes as wvc
|
3 |
+
from weaviate.embedded import EmbeddedOptions
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
from langchain_community.document_loaders import BSHTMLLoader
|
6 |
+
from pathlib import Path
|
7 |
+
from lxml import html
|
8 |
+
import logging
|
9 |
+
from semantic_text_splitter import HuggingFaceTextSplitter
|
10 |
+
from tokenizers import Tokenizer
|
11 |
+
import json
|
12 |
+
import os
|
13 |
+
import re
|
14 |
+
|
15 |
+
def createChunksCollection():
    """(Re)create the "Chunks" collection and return the created collection.

    An existing "Chunks" collection is deleted first, so each call starts
    from an empty collection.  Uses the module-level ``client``.

    Returns:
        The result of ``client.collections.create_from_dict`` for the new
        "Chunks" definition.
    """
    print("#### createChunksCollection() entered.")
    if client.collections.exists("Chunks"):
        client.collections.delete("Chunks")

    class_obj = {
        "class": "Chunks",
        "description": "Collection for document chunks.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizeClassName": True,
            },
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        # BM25 parameters are a collection-level setting in the Weaviate
        # schema; nested inside a property they are not applied.
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2,
            },
        },
        "properties": [
            {
                "name": "chunk",
                "dataType": ["text"],
                "description": "Single webpage chunk.",
                # Tokenization is a property-level setting, not an option
                # of the vectorizer module.
                "tokenization": "lowercase",
                "moduleConfig": {
                    "text2vec-transformers": {
                        "vectorizePropertyName": False,
                        "skip": False,
                    },
                },
            },
            {
                "name": "chunk_index",
                "dataType": ["int"],
            },
            {
                # Cross-reference to the "Documents" collection.
                # NOTE(review): presumably "Documents" must already exist
                # when this definition is submitted - confirm call order.
                "name": "webpage",
                "dataType": ["Documents"],
                "description": "Webpage content chunks.",
            },
        ],
    }

    return client.collections.create_from_dict(class_obj)
|
67 |
+
|
68 |
+
def createWebpageCollection():
    """(Re)create the "Documents" collection and return the created collection.

    An existing "Documents" collection is deleted first, so each call starts
    from an empty collection.  Uses the module-level ``client``.

    Returns:
        The result of ``client.collections.create_from_dict`` for the new
        "Documents" definition.
    """
    print("#### createWebpageCollection() entered.")
    if client.collections.exists("Documents"):
        client.collections.delete("Documents")

    class_obj = {
        "class": "Documents",
        "description": "For first attempt at loading a Weviate database.",
        "vectorizer": "text2vec-transformers",
        "moduleConfig": {
            "text2vec-transformers": {
                "vectorizeClassName": False,
            },
        },
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {
            "distance": "cosine",
        },
        # BM25 parameters are a collection-level setting in the Weaviate
        # schema; nested inside a property they are not applied.
        "invertedIndexConfig": {
            "bm25": {
                "b": 0.75,
                "k1": 1.2,
            },
        },
        "properties": [
            {
                "name": "title",
                "dataType": ["text"],
                "description": "HTML doc title.",
                # Tokenization is a property-level setting, not an option
                # of the vectorizer module.
                "tokenization": "lowercase",
                "moduleConfig": {
                    "text2vec-transformers": {
                        "vectorizePropertyName": True,
                        "skip": False,
                    },
                },
            },
            {
                "name": "content",
                "dataType": ["text"],
                "description": "HTML page content.",
                "tokenization": "whitespace",
                "moduleConfig": {
                    "text2vec-transformers": {
                        "vectorizePropertyName": True,
                    },
                },
            },
        ],
    }

    return client.collections.create_from_dict(class_obj)
|
126 |
+
|
127 |
+
|
128 |
+
#
|
129 |
+
# MAINLINE
#
# Loads every file found in ``pathString`` as HTML, stores each page in the
# "Documents" collection and its text chunks in the "Chunks" collection, then
# runs one sample near-vector query against the chunks and prints the title
# of the page each hit came from.
#
#pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
pathString = "inputDocs"
chunks = []
webpageDocNames = []
#webpageChunksClassesNames = []
page_contentArray = []
webpageChunks = []
webpageTitles = []
webpageChunksDocNames = []

# Set to True to print every object of both collections after the query.
DUMP_COLLECTIONS = False

#client = weaviate.WeaviateClient(
#    embedded_options=EmbeddedOptions(
#        additional_env_vars={
#            "ENABLE_MODULES": "backup-filesystem,text2vec-transformers",
#            "BACKUP_FILESYSTEM_PATH": "/tmp/backups",
#            "PERSISTENCE_DATA_PATH": "/var/lib/weaviate",
#            "DEFAULT_VECTORIZER_MODULE": "text2vec-transformers"
#            #"TRANSFORMERS_INFERENCE_API": "http://huggingface.co/spaces/MVPilgrim/WeaviateDB:8080"
#        }
#    )
#)

client = weaviate.connect_to_custom(
    # Host names are given without a scheme: the scheme is selected by
    # http_secure / grpc_secure, so "http://127.0.0.1" would be malformed.
    #http_host="huggingface.co/spaces/MVPilgrim/WeaviateDB",
    http_host="127.0.0.1",
    http_port=8080,
    http_secure=False,
    #grpc_host="huggingface.co",
    grpc_host="127.0.0.1",
    grpc_port=50051,
    grpc_secure=False,
    #auth_credentials=AuthApiKey(weaviate_key), # `weaviate_key`: your Weaviate API key
)
print("#### client: ",client)

# Everything after the connection is established runs under try/finally so
# the client is closed even when loading, inserting or querying fails.
try:
    # NOTE(review): connect_to_custom() presumably returns an already
    # connected client, making this call redundant - confirm for the
    # installed weaviate-client version before removing it.
    client.connect()

    # The tokenizer and splitter do not depend on the file being processed,
    # so they are built once instead of once per page.
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

    for filename in os.listdir(pathString):
        print(filename)
        path = Path(pathString) / filename
        if not path.is_file():
            # Sub-directories cannot be loaded as HTML documents.
            continue

        # Remove the ".html" suffix only.  str.rstrip(".html") strips any
        # trailing run of the characters '.', 'h', 't', 'm', 'l', which
        # turns e.g. "math.html" into "ma".
        if filename.endswith(".html"):
            filename = filename[:-len(".html")]
        webpageDocNames.append(filename)

        htmlData = BSHTMLLoader(path,"utf-8").load()
        title = htmlData[0].metadata['title']

        # Clean data. Collapse runs of newlines into a single newline.
        page_content = re.sub(r'\n+', '\n', htmlData[0].page_content)

        page_contentArray.append(page_content)
        webpageTitles.append(title)

        chunks = list(splitter.chunks(page_content, chunk_capacity=50))
        webpageChunks.append(chunks)
        webpageChunksDocNames.append(filename + "Chunks")

        print("### filename, title: ",filename,",",title)
        print("### webpageDocNames: ",webpageDocNames)

    # "Documents" is created first because "Chunks" declares a
    # cross-reference property whose target is "Documents".
    wpCollection = createWebpageCollection()
    wpChunkCollection = createChunksCollection()

    # The four lists are filled in lockstep above, one entry per loaded file.
    for className, title, page_content, pageChunks in zip(
            webpageDocNames, webpageTitles, page_contentArray, webpageChunks):
        print("## className, title: ",className,",",title)

        # Create Webpage Object
        wpCollectionObj_uuid = wpCollection.data.insert(
            {
                "name": className,
                "title": title,
                "content": page_content
            }
        )

        for chunk_index, chunk in enumerate(pageChunks):
            # NOTE(review): the page UUID is stored as a plain nested
            # property ("references" -> "webpage"), not through the
            # "webpage" cross-reference declared in the Chunks schema.
            # The query loop below reads it back from the same place.
            chunk_uuid = wpChunkCollection.data.insert(
                {
                    "title": title,
                    "chunk": chunk,
                    "chunk_index": chunk_index,
                    "references":
                    {
                        "webpage": wpCollectionObj_uuid
                    }
                }
            )

    #text = "List the main capabilities of artificial intelligence."
    #text = "List three of the greatest Norwegian authors."
    #text = "turkey burgers golden fried with lots of mayonaise"
    text = "human-made computer cognitive ability"
    #text = "literature authors"
    #text = "artifical intelligence"

    # NOTE(review): the query is embedded with this local model while the
    # stored chunks are embedded by the server's text2vec-transformers
    # module; presumably both must use the same model - verify.
    model = SentenceTransformer('../multi-qa-MiniLM-L6-cos-v1')
    vector = model.encode(text)
    vectorList = list(vector)
    print("vectorList: ",vectorList[2])

    semChunks = wpChunkCollection.query.near_vector(
        near_vector=vectorList,
        distance=0.7,
        limit=3
    )
    print("### semChunks[0]: ",semChunks)

    for hit_index, hit in enumerate(semChunks.objects):
        print("\n\n#### chunk: ",(hit_index, hit))
        webpage_uuid = hit.properties['references']['webpage']
        print("\nwebpage_uuid: ",webpage_uuid)
        wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
        print("\n\n### wpFromChunk title: ",wpFromChunk.properties['title'])

    # NOTE(review): indentation was lost in the source this was recovered
    # from; this assumes both dump loops belonged to the disabled branch.
    if DUMP_COLLECTIONS:
        for item in wpCollection.iterator():
            print("\n## webpage collection: ",item.uuid, item.properties)

        for item in wpChunkCollection.iterator():
            print("\n## chunk collection: ",item.uuid, item.properties)
finally:
    client.close()
|
284 |
+
|