import logging
import os
from typing import List, Tuple, Union

import pdfplumber
from docx import Document
from openpyxl import load_workbook

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _cell_to_text(value) -> str:
    """Render one table/spreadsheet cell value as text; ``None`` becomes ``""``."""
    # Test against None explicitly so that falsy-but-real values such as
    # 0, 0.0 and False are kept instead of being blanked out.
    return "" if value is None else str(value)


def extract_pdf_content(pdf_path: str) -> List[str]:
    """
    Extract tables and text from a PDF, page by page.

    For every page the tables are emitted first (each wrapped in
    ``[TABLE]``/``[/TABLE]`` markers, cells tab-separated, rows
    newline-separated), followed by the page's stripped text. No positional
    processing is done, so tables are not interleaved with the text.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        List[str]: List of extracted content chunks (text and tables).
            An empty list is returned when the file does not exist or when
            any error occurs while processing it; both cases are logged.
    """
    if not os.path.exists(pdf_path):
        logger.error(f"PDF file not found: {pdf_path}")
        return []

    # Deliberately best-effort: any failure is logged and yields [].
    try:
        with pdfplumber.open(pdf_path) as pdf:
            content = []

            for page in pdf.pages:
                # First extract tables
                for table in page.extract_tables():
                    if not table:
                        continue
                    # Empty cells come back as None; render them as "" rather
                    # than the literal text "None".
                    table_str = "\n".join(
                        "\t".join(_cell_to_text(cell) for cell in row)
                        for row in table
                    )
                    content.append(f"[TABLE]\n{table_str}\n[/TABLE]")

                # Then extract regular text
                text = page.extract_text()
                if text and text.strip():
                    content.append(text.strip())

            logger.info(f"Successfully extracted content from {pdf_path}")
            return content

    except Exception as e:
        logger.error(f"Error processing {pdf_path}: {str(e)}")
        return []


def extract_docx_content(docx_path: str) -> List[str]:
    """
    Extract text and tables from DOCX file with clear table markers.

    All non-empty paragraphs are emitted first, then all tables, so the
    original interleaving of paragraphs and tables is not preserved.

    Args:
        docx_path (str): Path to the DOCX file

    Returns:
        List[str]: List of extracted content chunks with tables marked as
            [TABLE]...[/TABLE]

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
    """
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"DOCX file not found: {docx_path}")

    doc = Document(docx_path)

    # Process all paragraphs first, skipping those that are empty once stripped
    stripped_paragraphs = (paragraph.text.strip() for paragraph in doc.paragraphs)
    content = [text for text in stripped_paragraphs if text]

    # Process all tables after paragraphs
    for table in doc.tables:
        table_str = "\n".join(
            "\t".join(cell.text.strip() for cell in row.cells)
            for row in table.rows
        )
        # A table whose cells are all empty contributes nothing
        if table_str.strip():
            content.append(f"[TABLE]\n{table_str}\n[/TABLE]")

    return content


def extract_xlsx_content(file_path: str) -> List[str]:
    """
    Extract the cell contents of every worksheet in an XLSX workbook.

    Each sheet becomes one string: a ``--- Sheet: <title> ---`` header line
    followed by one line per row, with cells tab-separated. Cells whose value
    is ``None`` are rendered as empty strings; every other value (including
    ``0`` and ``False``) is rendered with ``str()``. The resulting string is
    stripped of leading/trailing whitespace.

    Args:
        file_path (str): Path to the XLSX file

    Returns:
        List[str]: One text chunk per worksheet, in workbook order.
    """
    wb = load_workbook(file_path)
    sheets_text = []

    for sheet in wb:
        lines = [f"--- Sheet: {sheet.title} ---"]
        lines.extend(
            "\t".join(_cell_to_text(cell.value) for cell in row)
            for row in sheet.iter_rows()
        )
        sheets_text.append("\n".join(lines).strip())

    return sheets_text