Spaces:
Running
Running
MVPilgrim
committed on
Commit
·
2b6582c
1
Parent(s):
0f87433
debug
Browse files
app.py
CHANGED
@@ -141,20 +141,20 @@ try:
|
|
141 |
webpageTitles.append(title)
|
142 |
max_tokens = 1000
|
143 |
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
|
144 |
-
logger.
|
145 |
splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
|
146 |
chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
|
147 |
|
148 |
chunks = []
|
149 |
for chnk in chunksOnePage:
|
150 |
-
logger.
|
151 |
chunks.append(chnk)
|
152 |
-
logger.
|
153 |
webpageChunks.append(chunks)
|
154 |
webpageChunksDocNames.append(filename + "Chunks")
|
155 |
|
156 |
-
logger.
|
157 |
-
logger.
|
158 |
logger.info("#### Read and chunk input text files exited.")
|
159 |
|
160 |
|
@@ -281,6 +281,7 @@ try:
|
|
281 |
|
282 |
###########################################################
|
283 |
# Create document and chunks objects in the database.
|
|
|
284 |
if 'wpCollectionLoaded' not in st.session_state:
|
285 |
logger.info("#### Create page/doc db objects.")
|
286 |
for i, className in enumerate(webpageDocNames):
|
@@ -289,7 +290,7 @@ try:
|
|
289 |
# Create Webpage Object
|
290 |
page_content = page_contentArray[i]
|
291 |
# Insert the document.
|
292 |
-
wpCollectionObj_uuid = wpCollection.data.insert(
|
293 |
{
|
294 |
"name": className,
|
295 |
"title": title,
|
@@ -311,7 +312,7 @@ try:
|
|
311 |
"chunk_index": i2,
|
312 |
"references":
|
313 |
{
|
314 |
-
"webpage": wpCollectionObj_uuid
|
315 |
}
|
316 |
}
|
317 |
)
|
|
|
141 |
webpageTitles.append(title)
|
142 |
max_tokens = 1000
|
143 |
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
|
144 |
+
logger.info(f"### tokenizer: {tokenizer}")
|
145 |
splitter = HuggingFaceTextSplitter(tokenizer, trim_chunks=True)
|
146 |
chunksOnePage = splitter.chunks(page_content, chunk_capacity=50)
|
147 |
|
148 |
chunks = []
|
149 |
for chnk in chunksOnePage:
|
150 |
+
logger.info(f"#### chnk in file: {chnk}")
|
151 |
chunks.append(chnk)
|
152 |
+
logger.info(f"chunks: {chunks}")
|
153 |
webpageChunks.append(chunks)
|
154 |
webpageChunksDocNames.append(filename + "Chunks")
|
155 |
|
156 |
+
logger.info(f"### filename, title: {filename}, {title}")
|
157 |
+
logger.info(f"### webpageDocNames: {webpageDocNames}")
|
158 |
logger.info("#### Read and chunk input text files exited.")
|
159 |
|
160 |
|
|
|
281 |
|
282 |
###########################################################
|
283 |
# Create document and chunks objects in the database.
|
284 |
+
# Values returned by wpCollection.data.insert(), stored by page position:
# one slot per entry in webpageDocNames.  The list is pre-sized because the
# insert loop assigns by index (wpCollectionObj_uuid[i] = ...), and index
# assignment on an empty list raises IndexError.
wpCollectionObj_uuid = [None] * len(webpageDocNames)
|
285 |
if 'wpCollectionLoaded' not in st.session_state:
|
286 |
logger.info("#### Create page/doc db objects.")
|
287 |
for i, className in enumerate(webpageDocNames):
|
|
|
290 |
# Create Webpage Object
|
291 |
page_content = page_contentArray[i]
|
292 |
# Insert the document.
|
293 |
+
wpCollectionObj_uuid[i] = wpCollection.data.insert(
|
294 |
{
|
295 |
"name": className,
|
296 |
"title": title,
|
|
|
312 |
"chunk_index": i2,
|
313 |
"references":
|
314 |
{
|
315 |
+
"webpage": wpCollectionObj_uuid[i2]
|
316 |
}
|
317 |
}
|
318 |
)
|