Spaces:
Running
Running
MVPilgrim
committed on
Commit
·
4d1c68b
1
Parent(s):
cee962e
logging
Browse files- semsearch.py +66 -79
semsearch.py
CHANGED
@@ -21,7 +21,7 @@ from IPython.display import display, clear_output
|
|
21 |
|
22 |
|
23 |
weaviate_logger = logging.getLogger("httpx")
|
24 |
-
weaviate_logger.setLevel(logging.
|
25 |
|
26 |
logger = logging.getLogger(__name__)
|
27 |
logging.basicConfig(level=logging.INFO)
|
@@ -95,60 +95,9 @@ submitButton = widgets.Button(
|
|
95 |
)
|
96 |
|
97 |
|
98 |
-
#######################################################
|
99 |
-
# Read each text input file, parse it into a document,
|
100 |
-
# chunk it, collect chunks and document name.
|
101 |
-
logger.info("#### Read and chunk input text files.")
|
102 |
-
for filename in os.listdir(pathString):
|
103 |
-
logger.info(filename)
|
104 |
-
path = Path(pathString + "/" + filename)
|
105 |
-
filename = filename.rstrip(".html")
|
106 |
-
webpageDocNames.append(filename)
|
107 |
-
htmlLoader = BSHTMLLoader(path,"utf-8")
|
108 |
-
htmlData = htmlLoader.load()
|
109 |
-
|
110 |
-
title = htmlData[0].metadata['title']
|
111 |
-
page_content = htmlData[0].page_content
|
112 |
-
|
113 |
-
# Clean data. Remove multiple newlines, etc.
|
114 |
-
page_content = re.sub(r'\n+', '\n',page_content)
|
115 |
-
|
116 |
-
page_contentArray.append(page_content);
|
117 |
-
webpageTitles.append(title)
|
118 |
-
max_tokens = 1000
|
119 |
-
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
|
120 |
-
logger.debug(f"### tokenizer: {tokenizer}")
|
121 |
-
splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
|
122 |
-
chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
|
123 |
-
|
124 |
-
chunks = []
|
125 |
-
for chnk in chunksOnePage:
|
126 |
-
logger.debug(f"#### chnk in file: {chnk}")
|
127 |
-
chunks.append(chnk)
|
128 |
-
logger.debug(f"chunks: {chunks}")
|
129 |
-
webpageChunks.append(chunks)
|
130 |
-
webpageChunksDocNames.append(filename + "Chunks")
|
131 |
-
|
132 |
-
logger.debug(f"### filename, title: {filename}, {title}")
|
133 |
-
|
134 |
-
logger.debug(f"### webpageDocNames: {webpageDocNames}")
|
135 |
-
|
136 |
-
|
137 |
######################################################
|
138 |
# Connect to the Weaviate vector database.
|
139 |
logger.info("#### Create Weaviate db client connection.")
|
140 |
-
#client = weaviate.connect_to_custom(
|
141 |
-
# http_host="127.0.0.1",
|
142 |
-
# http_port=8080,
|
143 |
-
# http_secure=False,
|
144 |
-
# grpc_host="127.0.0.1",
|
145 |
-
# grpc_port=50051,
|
146 |
-
# grpc_secure=False,
|
147 |
-
# timeout=[600,600]
|
148 |
-
# #read_timeout=600,
|
149 |
-
# #write_timeout=90
|
150 |
-
#)
|
151 |
-
|
152 |
client = weaviate.WeaviateClient(
|
153 |
connection_params=ConnectionParams.from_params(
|
154 |
http_host="localhost",
|
@@ -156,20 +105,56 @@ client = weaviate.WeaviateClient(
|
|
156 |
http_secure=False,
|
157 |
grpc_host="localhost",
|
158 |
grpc_port="50051",
|
159 |
-
grpc_secure=False
|
160 |
-
log_level="WARNING"
|
161 |
),
|
162 |
-
|
163 |
-
|
164 |
-
#
|
165 |
-
|
166 |
-
additional_config=AdditionalConfig(
|
167 |
-
timeout=Timeout(init=60, query=1800, insert=1800), # Values in seconds
|
168 |
-
)
|
169 |
)
|
170 |
client.connect()
|
171 |
|
172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
######################################################
|
174 |
# Create database webpage and chunks collections.
|
175 |
#wpCollection = createWebpageCollection()
|
@@ -278,20 +263,23 @@ if not client.collections.exists("Chunks"):
|
|
278 |
|
279 |
###########################################################
|
280 |
# Create document and chunks objects in the database.
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
|
|
|
|
|
|
295 |
logger.info("#### Create chunk db objects.")
|
296 |
# Insert the chunks for the document.
|
297 |
for i2, chunk in enumerate(webpageChunks[i]):
|
@@ -436,7 +424,6 @@ submitButton.on_click(on_submitButton_clicked)
|
|
436 |
display(output_widget)
|
437 |
|
438 |
|
439 |
-
logger.info("#### Closing client db connection.")
|
440 |
-
client.close()
|
441 |
-
|
442 |
-
logger.info("#### Program terminating.")
|
|
|
21 |
|
22 |
|
23 |
weaviate_logger = logging.getLogger("httpx")
|
24 |
+
weaviate_logger.setLevel(logging.WARNING)
|
25 |
|
26 |
logger = logging.getLogger(__name__)
|
27 |
logging.basicConfig(level=logging.INFO)
|
|
|
95 |
)
|
96 |
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
######################################################
|
99 |
# Connect to the Weaviate vector database.
|
100 |
logger.info("#### Create Weaviate db client connection.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
client = weaviate.WeaviateClient(
|
102 |
connection_params=ConnectionParams.from_params(
|
103 |
http_host="localhost",
|
|
|
105 |
http_secure=False,
|
106 |
grpc_host="localhost",
|
107 |
grpc_port="50051",
|
108 |
+
grpc_secure=False
|
|
|
109 |
),
|
110 |
+
},
|
111 |
+
additional_config=AdditionalConfig(
|
112 |
+
timeout=Timeout(init=60, query=1800, insert=1800), # Values in seconds
|
113 |
+
)
|
|
|
|
|
|
|
114 |
)
|
115 |
client.connect()
|
116 |
|
117 |
|
118 |
+
#######################################################
|
119 |
+
# Read each text input file, parse it into a document,
|
120 |
+
# chunk it, collect chunks and document name.
|
121 |
+
logger.info("#### Read and chunk input text files.")
|
122 |
+
# Parse and chunk the input pages only when at least one of the two
# collections is missing.  Python spells logical-or as `or` (`||` is a syntax
# error), and the second test must look at "Chunks" rather than repeating
# "Documents".
if not client.collections.exists("Documents") or not client.collections.exists("Chunks"):
    # The tokenizer and splitter are built from constant arguments, so create
    # them once up front instead of once per input file.
    # NOTE(review): max_tokens is not referenced anywhere in this block; the
    # chunk size actually used is the chunk_capacity passed to splitter.chunks.
    max_tokens = 1000
    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
    logger.debug(f"### tokenizer: {tokenizer}")
    splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)

    for filename in os.listdir(pathString):
        logger.info(filename)
        path = Path(pathString) / filename

        # str.rstrip(".html") strips any trailing '.', 'h', 't', 'm' or 'l'
        # characters (e.g. "math.html" -> "ma"), so cut the exact suffix.
        if filename.endswith(".html"):
            filename = filename[:-len(".html")]
        webpageDocNames.append(filename)

        htmlLoader = BSHTMLLoader(path, "utf-8")
        htmlData = htmlLoader.load()

        # Only the first loaded document is used for title and content.
        title = htmlData[0].metadata['title']
        page_content = htmlData[0].page_content

        # Clean data. Collapse runs of newlines into a single newline.
        page_content = re.sub(r'\n+', '\n', page_content)

        page_contentArray.append(page_content)
        webpageTitles.append(title)

        chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)

        # Materialise the chunks for this page, logging each one at debug level.
        chunks = []
        for chnk in chunksOnePage:
            logger.debug(f"#### chnk in file: {chnk}")
            chunks.append(chnk)
        logger.debug(f"chunks: {chunks}")

        # webpageChunks[i] holds the chunks of the page named webpageDocNames[i].
        webpageChunks.append(chunks)
        webpageChunksDocNames.append(filename + "Chunks")

        logger.debug(f"### filename, title: {filename}, {title}")

    logger.debug(f"### webpageDocNames: {webpageDocNames}")
|
155 |
+
|
156 |
+
|
157 |
+
|
158 |
######################################################
|
159 |
# Create database webpage and chunks collections.
|
160 |
#wpCollection = createWebpageCollection()
|
|
|
263 |
|
264 |
###########################################################
|
265 |
# Create document and chunks objects in the database.
|
266 |
+
if not client.collections.exists("Documents") :
|
267 |
+
logger.info("#### Create page/doc db objects.")
|
268 |
+
for i, className in enumerate(webpageDocNames):
|
269 |
+
title = webpageTitles[i]
|
270 |
+
logger.debug(f"## className, title: {className}, {title}")
|
271 |
+
# Create Webpage Object
|
272 |
+
page_content = page_contentArray[i]
|
273 |
+
# Insert the document.
|
274 |
+
wpCollectionObj_uuid = wpCollection.data.insert(
|
275 |
+
{
|
276 |
+
"name": className,
|
277 |
+
"title": title,
|
278 |
+
"content": page_content
|
279 |
+
}
|
280 |
+
)
|
281 |
+
|
282 |
+
if not client.collections.exists("Chunks") :
|
283 |
logger.info("#### Create chunk db objects.")
|
284 |
# Insert the chunks for the document.
|
285 |
for i2, chunk in enumerate(webpageChunks[i]):
|
|
|
424 |
display(output_widget)
|
425 |
|
426 |
|
427 |
+
#logger.info("#### Closing client db connection.")
|
428 |
+
#client.close()
|
429 |
+
#logger.info("#### Program terminating.")
|
|