# src/data_loader/loader.py
"""Load text documents from a folder into LangChain ``Document`` objects."""
import logging
import os
from glob import glob

from langchain.schema import Document  # cite: embed_pipeline.py
from langchain_community.document_loaders import TextLoader  # cite: embed_pipeline.py

from config.settings import DOCS_FOLDER

logger = logging.getLogger(__name__)


def _stems_from_decode_error(exc: BaseException) -> bool:
    """Return True if *exc* or any exception chained to it is a UnicodeDecodeError.

    The loader may surface a decoding failure wrapped in another exception
    (for example a ``RuntimeError`` raised ``from`` the original error, or an
    error raised while handling it), so the whole ``__cause__`` /
    ``__context__`` chain is inspected rather than just the outermost type.
    """
    seen: set[int] = set()  # guards against cyclic exception chains
    current: BaseException | None = exc
    while current is not None and id(current) not in seen:
        if isinstance(current, UnicodeDecodeError):
            return True
        seen.add(id(current))
        if current.__cause__ is not None:
            current = current.__cause__
        else:
            current = current.__context__
    return False


def load_documents(docs_folder: str = DOCS_FOLDER) -> list[Document]:
    """Load every file matching ``*.*`` directly inside *docs_folder*.

    Each file is first read through ``TextLoader`` (UTF-8 with encoding
    autodetection). If that fails because of a decoding problem, the file is
    re-read as UTF-8 with undecodable bytes dropped. Files that cannot be
    read at all are logged and skipped; they never abort the whole run.

    Args:
        docs_folder: The path to the folder containing documents.

    Returns:
        A list of loaded Langchain Document objects, ordered by file path.
        The list is empty when the folder has no matching files.
    """
    all_docs: list[Document] = []

    # glob() yields paths in arbitrary order; sort so the result is reproducible.
    files = sorted(glob(os.path.join(docs_folder, "*.*")))  # cite: embed_pipeline.py

    for path in files:
        # "*.*" also matches directories whose name contains a dot.
        if not os.path.isfile(path):
            continue

        # --- Financial Ministry Adaptation ---
        # TODO: Implement more sophisticated loading for specific government
        # ruling formats (PDFs, DOCX, XML, etc.). This might involve using
        # libraries like pdfminer.six, python-docx, or custom parsers.
        # Handle scanned documents (OCR).
        # ------------------------------------
        try:
            # Attempt UTF-8 loading with autodetect fallback.
            docs = TextLoader(
                path,
                encoding="utf-8",
                autodetect_encoding=True,
            ).load()
        except Exception as load_err:
            if not _stems_from_decode_error(load_err):
                logger.error("Failed to load file %s: %s", path, load_err)
                continue  # Skip this file if loading fails

            # Fallback to a lenient read if decoding fails.
            logger.warning(
                "Decoding error on %s, falling back to ignore-errors mode", path
            )
            try:
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read()
            except Exception as read_err:
                logger.error("Failed to read file %s: %s", path, read_err)
                continue  # Skip this file if even lenient read fails
            docs = [Document(page_content=text, metadata={"source": path})]
        else:
            logger.info("Successfully loaded %s", os.path.basename(path))

        all_docs.extend(docs)

    logger.info(
        "Finished loading documents. Total documents loaded: %s", len(all_docs)
    )
    return all_docs