|
import pdfplumber |
|
from docx import Document |
|
from openpyxl import load_workbook |
|
import pdfplumber |
|
import logging |
|
from typing import List, Union, Tuple |
|
import os |
|
|
|
|
|
|
|
# Configure the root logger at INFO level; this runs when the module is imported.
logging.basicConfig(level=logging.INFO)

# Module-level logger used by the extraction functions in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
def extract_pdf_content(pdf_path: str) -> List[str]:
    """
    Extract tables and text from a PDF, page by page.

    For each page, every non-empty table is emitted first as a
    ``[TABLE]...[/TABLE]`` chunk (rows separated by newlines, cells by tabs),
    followed by the page's stripped text when it is not blank. No positional
    processing is done, so tables are not interleaved with the surrounding
    text.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        List[str]: List of extracted content chunks (text and tables). An
        empty list is returned when the file does not exist or when any
        error occurs while processing it; both cases are logged, not raised.
    """
    if not os.path.exists(pdf_path):
        logger.error(f"PDF file not found: {pdf_path}")
        return []

    try:
        with pdfplumber.open(pdf_path) as pdf:
            content = []

            for page in pdf.pages:
                for table in page.extract_tables():
                    if not table:
                        continue
                    # Empty cells are reported as None; render them as blank
                    # fields instead of the literal text "None".
                    table_str = "\n".join(
                        "\t".join(
                            "" if cell is None else str(cell) for cell in row
                        )
                        for row in table
                    )
                    content.append(f"[TABLE]\n{table_str}\n[/TABLE]")

                # NOTE(review): the page text presumably repeats the text of
                # the tables emitted above -- confirm whether that is wanted.
                text = page.extract_text()
                if text and text.strip():
                    content.append(text.strip())

            logger.info(f"Successfully extracted content from {pdf_path}")
            return content

    except Exception as e:
        # Best-effort boundary: keep returning [] for callers, but log the
        # traceback so the underlying failure is not lost.
        logger.exception(f"Error processing {pdf_path}: {e}")
        return []
|
|
|
|
|
from docx import Document |
|
from typing import List |
|
import os |
|
|
|
def extract_docx_content(docx_path: str) -> List[str]:
    """
    Collect the paragraphs and tables of a DOCX file as text chunks.

    All non-blank paragraphs come first, each as its own stripped chunk.
    They are followed by every table that contains any non-whitespace text,
    rendered with tab-separated cells and newline-separated rows and wrapped
    in ``[TABLE]`` / ``[/TABLE]`` markers.

    Args:
        docx_path (str): Path to the DOCX file

    Returns:
        List[str]: Paragraph chunks followed by marked table chunks

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist
    """
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"DOCX file not found: {docx_path}")

    document = Document(docx_path)

    # Paragraph text, skipping paragraphs that are empty after stripping.
    stripped_paragraphs = (para.text.strip() for para in document.paragraphs)
    chunks = [para_text for para_text in stripped_paragraphs if para_text]

    # Tables are appended after all paragraphs.
    for tbl in document.tables:
        rendered_rows = []
        for row in tbl.rows:
            cell_texts = [cell.text.strip() for cell in row.cells]
            rendered_rows.append("\t".join(cell_texts))
        rendered = "\n".join(rendered_rows)

        # Skip tables whose cells are all blank.
        if rendered.strip():
            chunks.append(f"[TABLE]\n{rendered}\n[/TABLE]")

    return chunks
|
|
|
def extract_xlsx_content(file_path: str) -> List[str]:
    """
    Render every worksheet of an XLSX workbook as tab-separated text.

    Each sheet becomes one chunk: a ``--- Sheet: <title> ---`` header line
    followed by one line per row, with cell values joined by tabs. Cells
    whose value is ``None`` are rendered as empty fields; every other value,
    including ``0`` and ``False``, is rendered with ``str()``. Leading and
    trailing whitespace of the whole chunk is stripped.

    Args:
        file_path (str): Path to the XLSX file

    Returns:
        List[str]: One text chunk per worksheet, in workbook order
    """
    wb = load_workbook(file_path)
    sheets_text = []

    for sheet in wb:
        lines = [f"--- Sheet: {sheet.title} ---"]
        for row in sheet.iter_rows():
            # Test against None explicitly: a plain truthiness check would
            # silently drop legitimate falsy values such as 0 or False.
            lines.append(
                "\t".join(
                    "" if cell.value is None else str(cell.value)
                    for cell in row
                )
            )
        sheets_text.append("\n".join(lines).strip())

    return sheets_text