# pdf_processor.py
"""Helpers for extracting text from a PDF and splitting it into chunks.

``process_pdf`` loads a PDF through ``PyMuPDFLoader`` and returns the page
texts joined into one string; ``split_text`` cuts a string into chunks with
the module-level ``text_splitter``.
"""

import logging
from typing import List

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# NOTE: configures the root logger as an import-time side effect; kept
# because callers may rely on this module setting up log formatting.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Shared splitter instance used by split_text(). Sizes are presumably
# measured in characters (the splitter's default) -- confirm against the
# installed langchain version.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)


def process_pdf(pdf_url: str) -> str:
    """Return the text of the PDF at *pdf_url*, or ``""`` on failure.

    The value is handed unchanged to ``PyMuPDFLoader``; every loaded page's
    ``page_content`` is joined with a newline.

    Args:
        pdf_url: Location of the PDF, passed straight to ``PyMuPDFLoader``.

    Returns:
        The newline-joined page contents. An empty string is returned when
        the loader yields no pages or when any exception is raised while
        loading; both cases are logged.
    """
    logging.info(f"Processing PDF from URL: {pdf_url}")
    try:
        loader = PyMuPDFLoader(pdf_url)
        data = loader.load()
        if not data:
            logging.warning(f"No data found in PDF at {pdf_url}")
            return ""
        return "\n".join(page.page_content for page in data)
    except Exception as e:
        # Deliberate best-effort: any loader failure is logged and reported
        # to the caller as an empty string rather than propagated.
        logging.error(f"Failed to process PDF at {pdf_url}: {str(e)}")
        return ""


def split_text(text: str) -> List[str]:
    """Split *text* into chunks using the module-level ``text_splitter``.

    Args:
        text: The string to split.

    Returns:
        The list of chunks produced by ``text_splitter.split_text``.
    """
    return text_splitter.split_text(text)