quoc-khanh committed on
Commit
e65c9bf
·
verified ·
1 Parent(s): 6be7362

Update helpers.py

Browse files
Files changed (1) hide show
  1. helpers.py +6 -7
helpers.py CHANGED
@@ -269,13 +269,12 @@ def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
269
  def load_text_data(file_path):
270
  """Load text content from a DOCX file (tables removed)."""
271
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=2500)
272
- loader = DoclingLoader(
273
- file_path=file_path,
274
- export_type=ExportType.MARKDOWN, # Enable internal chunking,
275
- chunker = text_splitter,
276
- # convert_kwargs={"input_format": "docx"} # Specify the input format
277
- )
278
- docs = loader.load()
279
  chunks = text_splitter.split_documents(docs)
280
  # You can wrap each chunk back into a Document if needed.
281
  return chunks
 
269
  def load_text_data(file_path):
270
  """Load text content from a DOCX file (tables removed)."""
271
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=2500)
272
+ # loader = DoclingLoader(
273
+ # file_path=file_path,
274
+ # export_type=ExportType.MARKDOWN, # Enable internal chunking,
275
+ # # chunker = text_splitter
276
+ # )
277
+ loader = UnstructuredWordDocumentLoader(file_path)
 
278
  chunks = text_splitter.split_documents(docs)
279
  # You can wrap each chunk back into a Document if needed.
280
  return chunks