Spaces:
Running
Running
File size: 2,350 Bytes
10b392a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# src/data_loader/loader.py
import os
from glob import glob
from langchain_community.document_loaders import TextLoader # cite: embed_pipeline.py
from langchain.schema import Document # cite: embed_pipeline.py
from config.settings import DOCS_FOLDER
import logging
logger = logging.getLogger(__name__)
def _caused_by_decode_error(exc: BaseException) -> bool:
    """Return True if *exc*, or any exception chained to it, is a UnicodeDecodeError."""
    seen: set[int] = set()
    current = exc
    # Walk explicit (`raise ... from`) and implicit chaining; `seen` guards
    # against a cyclic chain.
    while current is not None and id(current) not in seen:
        if isinstance(current, UnicodeDecodeError):
            return True
        seen.add(id(current))
        current = current.__cause__ or current.__context__
    return False


def load_documents(docs_folder: str = DOCS_FOLDER) -> list[Document]:
    """
    Load every file matching ``*.*`` directly inside ``docs_folder``.

    Each file is first read through ``TextLoader`` (UTF-8 with encoding
    autodetection). If that fails because of a decoding problem, the file is
    re-read as UTF-8 with undecodable bytes dropped. Files that cannot be read
    at all are logged and skipped; this function does not raise for
    individual file failures.

    Args:
        docs_folder: The path to the folder containing documents.

    Returns:
        A list of loaded Langchain Document objects, in sorted path order.
    """
    all_docs: list[Document] = []
    # glob() yields paths in arbitrary order; sort so the result is reproducible.
    files = sorted(glob(os.path.join(docs_folder, "*.*")))
    for path in files:
        if not os.path.isfile(path):
            # "*.*" also matches directories whose name contains a dot.
            logger.debug("Skipping non-file path %s", path)
            continue
        try:
            # --- Financial Ministry Adaptation ---
            # TODO: Implement more sophisticated loading for specific government
            # ruling formats (PDFs, DOCX, XML, etc.). This might involve using
            # libraries like pdfminer.six, python-docx, or custom parsers.
            # Handle scanned documents (OCR).
            # ------------------------------------
            # Attempt UTF-8 loading with autodetect fallback
            loader = TextLoader(
                path,
                encoding="utf-8",
                autodetect_encoding=True,
            )
            docs = loader.load()
        except Exception as e:
            # NOTE(review): the loader may surface a decode failure wrapped in
            # another exception type (e.g. RuntimeError) rather than as a bare
            # UnicodeDecodeError, so inspect the whole exception chain instead
            # of relying on the exception's own type.
            if not _caused_by_decode_error(e):
                logger.error("Failed to load file %s: %s", path, e)
                continue  # Skip this file if loading fails
            # Fallback to a lenient read if decoding fails
            logger.warning(
                "Decoding error on %s, falling back to ignore-errors mode", path
            )
            try:
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read()
                docs = [Document(page_content=text, metadata={"source": path})]
            except Exception as read_err:
                logger.error("Failed to read file %s: %s", path, read_err)
                continue  # Skip this file if even lenient read fails
        else:
            logger.info("Successfully loaded %s", os.path.basename(path))
        all_docs.extend(docs)
    logger.info(
        "Finished loading documents. Total documents loaded: %d", len(all_docs)
    )
    return all_docs