MVPilgrim committed
Commit 76035d1 · verified · 1 Parent(s): c487c44

Create semsearch.py

Files changed (1):
  1. semsearch.py +284 -0
semsearch.py ADDED
@@ -0,0 +1,284 @@
+ import weaviate
+ import weaviate.classes as wvc
+ from weaviate.embedded import EmbeddedOptions
+ from sentence_transformers import SentenceTransformer
+ from langchain_community.document_loaders import BSHTMLLoader
+ from pathlib import Path
+ from lxml import html
+ import logging
+ from semantic_text_splitter import HuggingFaceTextSplitter
+ from tokenizers import Tokenizer
+ import json
+ import os
+ import re
+
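+ # Overview: load HTML pages from inputDocs, split each page into small
+ # token-bounded chunks, store pages and chunks in Weaviate collections,
+ # then embed a query with SentenceTransformer and retrieve the nearest chunks.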
+ def createChunksCollection():
+     print("#### createChunksCollection() entered.")
+     if client.collections.exists("Chunks"):
+         client.collections.delete("Chunks")
+
+     class_obj = {
+         "class": "Chunks",
+         "description": "Collection for document chunks.",
+         "vectorizer": "text2vec-transformers",
+         "moduleConfig": {
+             "text2vec-transformers": {
+                 "vectorizeClassName": True
+             }
+         },
+         "vectorIndexType": "hnsw",
+         "vectorIndexConfig": {
+             "distance": "cosine",
+         },
+         "properties": [
+             {
+                 "name": "chunk",
+                 "dataType": ["text"],
+                 "description": "Single webpage chunk.",
+                 "vectorizer": "text2vec-transformers",
+                 "moduleConfig": {
+                     "text2vec-transformers": {
+                         "vectorizePropertyName": False,
+                         "skip": False,
+                         "tokenization": "lowercase"
+                     }
+                 }
+             },
+             {
+                 "name": "chunk_index",
+                 "dataType": ["int"]
+             },
+             {
+                 "name": "webpage",
+                 "dataType": ["Documents"],
+                 "description": "Cross-reference to the source webpage.",
+                 "invertedIndexConfig": {
+                     "bm25": {
+                         "b": 0.75,
+                         "k1": 1.2
+                     }
+                 }
+             }
+         ]
+     }
+
+     return client.collections.create_from_dict(class_obj)
+
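+ # Note: a property whose dataType names another collection ("Documents")
+ # defines a cross-reference, letting each chunk point back to its source page.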
+ def createWebpageCollection():
+     print("#### createWebpageCollection() entered.")
+     if client.collections.exists("Documents"):
+         client.collections.delete("Documents")
+
+     class_obj = {
+         "class": "Documents",
+         "description": "First attempt at loading a Weaviate database.",
+         "vectorizer": "text2vec-transformers",
+         "moduleConfig": {
+             "text2vec-transformers": {
+                 "vectorizeClassName": False
+             }
+         },
+         "vectorIndexType": "hnsw",
+         "vectorIndexConfig": {
+             "distance": "cosine",
+         },
+         "properties": [
+             #{
+             #    "docname": "fdsa",
+             #    "dataType": ["text"],
+             #    "description": "Name of document"
+             #},
+             {
+                 "name": "title",
+                 "dataType": ["text"],
+                 "description": "HTML doc title.",
+                 "vectorizer": "text2vec-transformers",
+                 "moduleConfig": {
+                     "text2vec-transformers": {
+                         "vectorizePropertyName": True,
+                         "skip": False,
+                         "tokenization": "lowercase"
+                     }
+                 },
+                 "invertedIndexConfig": {
+                     "bm25": {
+                         "b": 0.75,
+                         "k1": 1.2
+                     }
+                 }
+             },
+             {
+                 "name": "content",
+                 "dataType": ["text"],
+                 "description": "HTML page content.",
+                 "moduleConfig": {
+                     "text2vec-transformers": {
+                         "vectorizePropertyName": True,
+                         "tokenization": "whitespace"
+                     }
+                 }
+             }
+         ]
+     }
+
+     return client.collections.create_from_dict(class_obj)
+
+
+ #
+ # MAINLINE
+ #
+ #pathString = "/Users/660565/KPSAllInOne/ProgramFilesX86/WebCopy/DownloadedWebSites/LLMPOC_HTML"
+ pathString = "inputDocs"
+ chunks = []
+ webpageDocNames = []
+ #webpageChunksClassesNames = []
+ page_contentArray = []
+ webpageChunks = []
+ webpageTitles = []
+ webpageChunksDocNames = []
+
+ #client = weaviate.WeaviateClient(
+ #    embedded_options=EmbeddedOptions(
+ #        additional_env_vars={
+ #            "ENABLE_MODULES": "backup-filesystem,text2vec-transformers",
+ #            "BACKUP_FILESYSTEM_PATH": "/tmp/backups",
+ #            "PERSISTENCE_DATA_PATH": "/var/lib/weaviate",
+ #            "DEFAULT_VECTORIZER_MODULE": "text2vec-transformers"
+ #            #"TRANSFORMERS_INFERENCE_API": "http://huggingface.co/spaces/MVPilgrim/WeaviateDB:8080"
+ #        }
+ #    )
+ #)
+
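+ # Connect to a local Weaviate instance; connect_to_custom() expects a bare
+ # hostname with separate port and TLS flags (REST on 8080, gRPC on 50051).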
+ client = weaviate.connect_to_custom(
+     #http_host="huggingface.co/spaces/MVPilgrim/WeaviateDB",
+     http_host="127.0.0.1",
+     http_port=8080,
+     http_secure=False,
+     #grpc_host="huggingface.co",
+     grpc_host="127.0.0.1",
+     grpc_port=50051,
+     grpc_secure=False
+     #auth_credentials=AuthApiKey(weaviate_key),  # `weaviate_key`: your Weaviate API key
+ )
+ print("#### client: ",client)
+
+ client.connect()
+
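+ # Load each HTML file, pull out its title and visible text, collapse repeated
+ # newlines, and split the text into chunks of up to 50 tokens.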
+ for filename in os.listdir(pathString):
+     print(filename)
+     path = Path(pathString + "/" + filename)
+     filename = filename.removesuffix(".html")  # rstrip(".html") would strip any trailing h/t/m/l chars
+     webpageDocNames.append(filename)
+     htmlLoader = BSHTMLLoader(path,"utf-8")
+     htmlData = htmlLoader.load()
+
+     title = htmlData[0].metadata['title']
+     page_content = htmlData[0].page_content
+
+     # Clean data. Remove multiple newlines, etc.
+     page_content = re.sub(r'\n+', '\n', page_content)
+
+     page_contentArray.append(page_content)
+     webpageTitles.append(title)
+     #htmlDocument = htmlData[0]
+     max_tokens = 1000
+     tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
+     splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
+     chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
+
+     chunks = []
+     for chnk in chunksOnePage:
+         #print("\n\n#### chnk: ",chnk)
+         chunks.append(chnk)
+     #print("chunks: ",chunks)
+     webpageChunks.append(chunks)
+     webpageChunksDocNames.append(filename + "Chunks")
+
+     print("### filename, title: ",filename,",",title)
+ print("### webpageDocNames: ",webpageDocNames)
+
+ wpCollection = createWebpageCollection()
+ wpChunkCollection = createChunksCollection()
+
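+ # Insert each webpage as a "Documents" object, then insert its chunks into
+ # "Chunks" with the webpage's UUID stored alongside so a hit can be traced
+ # back to its source page.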
+ for i, className in enumerate(webpageDocNames):
+     title = webpageTitles[i]
+     print("## className, title: ",className,",",title)
+     # Create Webpage Object
+     page_content = page_contentArray[i]
+     #print("\n#### page_content: ",page_content)
+
+     wpCollectionObj_uuid = wpCollection.data.insert(
+         {
+             "name": className,
+             "title": title,
+             "content": page_content
+         }
+     )
+
+     for i2, chunk in enumerate(webpageChunks[i]):
+         #print("#### chunk: ",chunk)
+         chunk_uuid = wpChunkCollection.data.insert(
+             {
+                 "title": title,
+                 "chunk": chunk,
+                 "chunk_index": i2,
+                 "references": {
+                     "webpage": wpCollectionObj_uuid
+                 }
+             }
+         )
+         #print("### chunk_index,chunk: ",i2,",",chunk[0:20])
+
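+ # Query stage: embed a free-text query with a local SentenceTransformer model
+ # and search the Chunks collection for the nearest chunk vectors.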
+ #text = "List the main capabilities of artificial intelligence."
+ #text = "List three of the greatest Norwegian authors."
+ #text = "turkey burgers golden fried with lots of mayonaise"
+ text = "human-made computer cognitive ability"
+ #text = "literature authors"
+ #text = "artifical intelligence"
+
+
+ model = SentenceTransformer('../multi-qa-MiniLM-L6-cos-v1')
+ vector = model.encode(text)
+ #print("#### vector: ",vector[0])
+ vectorList = []
+
+ for vec in vector:
+     vectorList.append(vec)
+ print("vectorList: ",vectorList[2])
+
+ semChunks = wpChunkCollection.query.near_vector(
+     near_vector=vectorList,
+     distance=0.7,  # maximum cosine distance; farther matches are dropped
+     limit=3
+ )
+ print("### semChunks: ",semChunks)
+ #print("### semChunks.objects[0]: ",semChunks.objects[0])
+
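+ # For each matching chunk, read back the stored webpage UUID and fetch the
+ # source "Documents" object so the page title can be reported with the hit.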
+ for chunk in semChunks.objects:
+     print("\n\n#### chunk: ",chunk)
+     #webpage_uuid = chunk.references.webpage
+     webpage_uuid = chunk.properties['references']['webpage']
+     print("\nwebpage_uuid: ",webpage_uuid)
+     wpFromChunk = wpCollection.query.fetch_object_by_id(webpage_uuid)
+     print("\n\n### wpFromChunk title: ",wpFromChunk.properties['title'])
+
+
+ #print("response: ",response)
+
+ if False:
+     client = weaviate.connect_to_local(
+         #cluster_url="http://localhost:8080"
+     )
+
+ for item in wpCollection.iterator():
+     print("\n## webpage collection: ",item.uuid, item.properties)
+
+ for item in wpChunkCollection.iterator():
+     print("\n## chunk collection: ",item.uuid, item.properties)
+
+ client.close()