Spaces:
Runtime error
Runtime error
Update helpers.py
Browse files - helpers.py +6 -7
helpers.py
CHANGED
@@ -269,13 +269,12 @@ def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
|
|
269 |
def load_text_data(file_path):
|
270 |
"""Load text content from a DOCX file (tables removed)."""
|
271 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=2500)
|
272 |
-
loader = DoclingLoader(
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
)
|
278 |
-
docs = loader.load()
|
279 |
chunks = text_splitter.split_documents(docs)
|
280 |
# You can wrap each chunk back into a Document if needed.
|
281 |
return chunks
|
|
|
269 |
def load_text_data(file_path):
|
270 |
"""Load text content from a DOCX file (tables removed)."""
|
271 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=2500)
|
272 |
+
# loader = DoclingLoader(
|
273 |
+
# file_path=file_path,
|
274 |
+
# export_type=ExportType.MARKDOWN, # Enable internal chunking,
|
275 |
+
# # chunker = text_splitter
|
276 |
+
# )
|
277 |
+
loader = UnstructuredWordDocumentLoader(file_path)
|
|
|
278 |
chunks = text_splitter.split_documents(docs)
|
279 |
# You can wrap each chunk back into a Document if needed.
|
280 |
return chunks
|