import logging
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional, Tuple, Union

import pdfplumber

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _format_table(table) -> str:
    """Render one extracted table as tab-separated rows inside [TABLE] markers."""
    rows = (
        "\t".join("" if cell is None else str(cell) for cell in row)
        for row in table
    )
    table_str = "\n".join(rows)
    return f"[TABLE]\n{table_str}\n[/TABLE]"


def extract_page_content(args: Tuple[str, int]) -> Optional[str]:
    """
    Extract the tables and text of a single PDF page.

    The PDF is opened inside this function, so every call works on its own
    handle instead of sharing one across threads.

    Args:
        args: A ``(pdf_path, page_number)`` tuple; ``page_number`` is the
            zero-based index into ``pdf.pages``.

    Returns:
        The page's tables (each wrapped in ``[TABLE]``/``[/TABLE]``) followed
        by its stripped text, joined with newlines. ``None`` when the page
        yields no content or when any error occurs (the error is logged).
    """
    pdf_path, page_number = args
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number]

            # Tables first; empty tables are skipped.
            content = [
                _format_table(table)
                for table in page.extract_tables()
                if table
            ]

            # Then the page text, if there is any beyond whitespace.
            text = page.extract_text()
            if text and text.strip():
                content.append(text.strip())

            return "\n".join(content) if content else None
    except Exception as e:
        # Worker boundary: a failing page must not abort the whole document.
        logger.error(
            "Error processing page %s of %s: %s", page_number, pdf_path, e
        )
        return None


def extract_pdf_content(pdf_path: str, max_workers: int = 4) -> List[str]:
    """
    Extract all pages of a PDF in parallel using threads.

    Args:
        pdf_path (str): Path to the PDF file
        max_workers (int): Number of threads to use

    Returns:
        List[str]: Extracted content chunks in page order. Pages that yield
        no content or fail to process are omitted, so a chunk's list index
        is not necessarily its page number. An empty list is returned when
        the file does not exist or cannot be processed at all.
    """
    if not os.path.exists(pdf_path):
        logger.error("PDF file not found: %s", pdf_path)
        return []

    try:
        # Only the page count is needed here; workers open their own handles.
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)

        logger.info(
            "Processing %s pages from %s in parallel.", total_pages, pdf_path
        )

        # One slot per page so results land in page order even though
        # as_completed() yields futures in completion order.
        ordered: List[Optional[str]] = [None] * total_pages

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_page = {
                executor.submit(extract_page_content, (pdf_path, i)): i
                for i in range(total_pages)
            }

            for future in as_completed(future_to_page):
                page_num = future_to_page[future]
                try:
                    result = future.result()
                except Exception as exc:
                    logger.error(
                        "Page %s generated an exception: %s", page_num, exc
                    )
                else:
                    if result:
                        ordered[page_num] = result

        # Drop pages with no content while keeping page order.
        return [chunk for chunk in ordered if chunk]

    except Exception as e:
        logger.error("Error opening %s: %s", pdf_path, e)
        return []