#
# SPDX-FileCopyrightText: Hadad
# SPDX-License-Identifier: Apache-2.0
#

import pdfplumber  # Library to extract text and tables from PDF files
import pytesseract  # OCR tool to extract text from images
import docx  # Library to read Microsoft Word (.docx) files
import zipfile  # To handle zipped archives, used here to access embedded images in Word files
import io  # Provides tools for handling byte streams, used to open images from bytes
import pandas as pd  # Data analysis library, used here to handle tables from Excel and other files
import warnings  # Used to suppress warnings during Excel file reading
import re  # Regular expressions for text cleaning
import unicodedata  # Unicode normalization and character categories for text cleaning
from openpyxl import load_workbook  # Excel file reading library, used for .xlsx files
from pptx import Presentation  # Library to read Microsoft PowerPoint files
from PIL import Image, ImageEnhance, ImageFilter  # Image processing libraries for OCR preprocessing
from pathlib import Path  # Object-oriented filesystem paths

# Candidates for removal: anything that is not a word character, whitespace,
# or common punctuation. "_" is listed explicitly because \w includes it.
_NOISE_CHAR_RE = re.compile(r'[^\w\s.,?!():;\'"-]|_')

# A lone ASCII letter with no word character or apostrophe on either side.
# "a", "A" and "I" are left out of the class because they are real English
# words; the apostrophe guard keeps contractions such as "don't" intact.
_STRAY_LETTER_RE = re.compile(r"(?<![\w'])[b-zB-HJ-Z](?![\w'])")

# OCR preprocessing parameters.
_OCR_CONTRAST_FACTOR = 2
_OCR_BINARY_THRESHOLD = 140

# Resolution (pixels per inch) used when rendering a PDF page for OCR.
_PDF_OCR_RESOLUTION = 300

# Integer value of python-pptx's MSO_SHAPE_TYPE.PICTURE.
_PPTX_PICTURE_SHAPE_TYPE = 13


def _keep_combining_mark(match):
    """Replacement callback: keep combining marks, drop every other match."""
    char = match.group()
    return char if unicodedata.category(char).startswith("M") else ""


def clean_text(text):
    """
    Clean and normalize extracted text to improve readability and remove noise.

    This function performs several cleaning steps:
    - Normalizes the text to Unicode NFC so accented letters are single characters.
    - Removes characters that are not letters, digits, whitespace, or common
      punctuation (. , ? ! ( ) : ; ' " -). Letters of any script are kept,
      together with their combining marks; underscores and symbols are removed.
    - Removes isolated single ASCII letters, which are often OCR noise, except
      the words "a", "A" and "I" and letters attached to an apostrophe.
    - Strips whitespace from each line and removes empty lines.
    - Joins cleaned lines back into a single string separated by newlines.

    Args:
        text (str): Raw extracted text from any source.

    Returns:
        str: Cleaned and normalized text ready for display or further processing.
    """
    # Compose decomposed sequences first so a base letter is not mistaken for
    # an isolated letter that merely happens to be followed by an accent.
    text = unicodedata.normalize("NFC", text)
    text = _NOISE_CHAR_RE.sub(_keep_combining_mark, text)
    text = _STRAY_LETTER_RE.sub('', text)
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)


def format_table(df, max_rows=10):
    """
    Convert a pandas DataFrame into a clean, readable string representation of a table.

    This function:
    - Removes rows and columns that are completely empty (all NaN).
    - Replaces any remaining NaN values with empty strings.
    - Limits the output to a maximum number of rows for brevity.
    - Adds a note if there are more rows than displayed.

    Args:
        df (pandas.DataFrame): The table data to format.
        max_rows (int): Maximum number of rows to display from the table.

    Returns:
        str: Formatted string representation of the table or empty string if no data.
    """
    if df.empty:
        return ""
    df_clean = df.dropna(axis=0, how='all').dropna(axis=1, how='all').fillna('')
    if df_clean.empty:
        return ""
    # Render without the row index; it carries no information for the reader.
    table_str = df_clean.head(max_rows).to_string(index=False)
    hidden_rows = len(df_clean) - max_rows
    if hidden_rows > 0:
        table_str += f"\n... ({hidden_rows} more rows)"
    return table_str


def _rows_to_dataframe(rows):
    """Build a DataFrame from a non-empty list of rows, using the first row as header."""
    return pd.DataFrame(rows[1:], columns=rows[0])


def preprocess_image(img):
    """
    Enhance an image to improve OCR accuracy by applying several preprocessing steps.

    The preprocessing includes:
    - Converting the image to grayscale.
    - Increasing contrast to make text stand out more.
    - Applying a median filter to reduce noise.
    - Binarizing the image by thresholding to black and white.

    Args:
        img (PIL.Image.Image): The original image to preprocess.

    Returns:
        PIL.Image.Image: The processed image ready for OCR. If any step fails,
        the original, untouched image is returned.
    """
    try:
        # Each step is bound to its own name so that `img` still refers to the
        # caller's image if a later step raises.
        gray = img.convert("L")
        contrasted = ImageEnhance.Contrast(gray).enhance(_OCR_CONTRAST_FACTOR)
        denoised = contrasted.filter(ImageFilter.MedianFilter())
        return denoised.point(
            lambda x: 0 if x < _OCR_BINARY_THRESHOLD else 255, '1'
        )
    except Exception:
        return img


def ocr_image(img):
    """
    Extract text from an image using OCR after preprocessing to improve results.

    This function:
    - Preprocesses the image with preprocess_image.
    - Runs pytesseract with the English language and page segmentation mode 6.
    - Cleans the extracted text using the clean_text function.

    Args:
        img (PIL.Image.Image): The image from which to extract text.

    Returns:
        str: The cleaned OCR-extracted text. Returns empty string if OCR fails.
    """
    try:
        prepared = preprocess_image(img)
        raw_text = pytesseract.image_to_string(prepared, lang='eng', config='--psm 6')
        return clean_text(raw_text)
    except Exception:
        # OCR is best-effort: a missing tesseract binary or an unreadable
        # image must not abort extraction of the rest of the document.
        return ""


def _iter_pdf_image_ocr(page, page_number):
    """Yield one formatted OCR section for each image on a pdfplumber page that contains text."""
    if not page.images:
        return
    rendered = page.to_image(resolution=_PDF_OCR_RESOLUTION).original
    page_x0, page_top, page_x1, _ = (float(value) for value in page.bbox)
    page_width = page_x1 - page_x0
    if page_width <= 0:
        return
    # Image bounding boxes are expressed in page units (PDF points) while the
    # rendered bitmap is in pixels, so they must be scaled before cropping.
    scale = rendered.width / page_width
    for img in page.images:
        left = max(0, int(round((float(img["x0"]) - page_x0) * scale)))
        upper = max(0, int(round((float(img["top"]) - page_top) * scale)))
        right = min(rendered.width, int(round((float(img["x1"]) - page_x0) * scale)))
        lower = min(rendered.height, int(round((float(img["bottom"]) - page_top) * scale)))
        if right <= left or lower <= upper:
            # Image lies outside the rendered page or has no area.
            continue
        ocr_text = ocr_image(rendered.crop((left, upper, right, lower)))
        if ocr_text:
            yield f"[OCR Text from image on page {page_number}]:\n{ocr_text}\n\n"


def extract_pdf_content(fp):
    """
    Extract text and tables from a PDF file, including OCR on embedded images.

    This function:
    - Opens the PDF file and iterates through each page.
    - Extracts and cleans text from each page.
    - Renders pages that contain images and performs OCR on each image region.
    - Extracts tables from pages and formats them as readable text, using the
      first row of each table as its header.
    - Handles exceptions by appending an error message to the content.

    Args:
        fp (str or Path): File path to the PDF document.

    Returns:
        str: Combined extracted text, OCR results, and formatted tables from the PDF.
    """
    parts = []
    try:
        with pdfplumber.open(fp) as pdf:
            for page_number, page in enumerate(pdf.pages, 1):
                page_text = page.extract_text() or ""
                parts.append(f"Page {page_number} Text:\n{clean_text(page_text)}\n\n")
                # extend() consumes the generator item by item, so sections
                # produced before a failure are still kept.
                parts.extend(_iter_pdf_image_ocr(page, page_number))
                for idx, table in enumerate(page.extract_tables(), 1):
                    if table:
                        df = _rows_to_dataframe(table)
                        parts.append(
                            f"Table {idx} on page {page_number}:\n{format_table(df)}\n\n"
                        )
    except Exception as e:
        parts.append(f"\n[Error reading PDF {fp}: {e}]")
    return "".join(parts).strip()


def extract_docx_content(fp):
    """
    Extract text, tables, and OCR text from images embedded in a Microsoft Word (.docx) file.

    This function:
    - Reads non-empty paragraphs and all tables from the document.
    - Formats each table using its first row as the header.
    - Opens the .docx file as a zip archive and runs OCR on every entry under
      word/media/; entries that cannot be read or opened as images are skipped.
    - Handles exceptions and appends an error message if reading fails.

    Args:
        fp (str or Path): File path to the Word document.

    Returns:
        str: Combined extracted paragraphs, tables, and OCR text from embedded images.
    """
    parts = []
    try:
        doc = docx.Document(fp)

        paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
        if paragraphs:
            parts.append("Paragraphs:\n" + "\n".join(paragraphs) + "\n\n")

        table_number = 0
        for table in doc.tables:
            rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]
            if not rows:
                continue
            table_number += 1
            df = _rows_to_dataframe(rows)
            parts.append(f"Table {table_number}:\n{format_table(df)}\n\n")

        with zipfile.ZipFile(fp) as archive:
            for member in archive.namelist():
                if not member.startswith("word/media/"):
                    continue
                try:
                    with Image.open(io.BytesIO(archive.read(member))) as img:
                        ocr_text = ocr_image(img)
                except Exception:
                    # Best-effort: one unreadable media entry must not stop
                    # the remaining images from being processed.
                    continue
                if ocr_text:
                    parts.append(f"[OCR Text from embedded image]:\n{ocr_text}\n\n")
    except Exception as e:
        parts.append(f"\n[Error reading Microsoft Word {fp}: {e}]")
    return "".join(parts).strip()


def extract_excel_content(fp):
    """
    Extract readable table content from Microsoft Excel files (.xlsx, .xls).

    This function:
    - Reads all sheets in the Excel file.
    - Converts each sheet to a formatted table string.
    - Suppresses warnings during reading to avoid clutter.
    - Does not attempt to extract images.
    - Handles exceptions by appending an error message.

    Args:
        fp (str or Path): File path to the Excel workbook.

    Returns:
        str: Combined formatted tables from all sheets in the workbook.
    """
    parts = []
    try:
        # openpyxl cannot read the legacy binary .xls format, so for those
        # files pandas is left to choose a suitable engine itself.
        engine = None if str(fp).lower().endswith(".xls") else 'openpyxl'
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            sheets = pd.read_excel(fp, sheet_name=None, engine=engine)
        for sheet_name, df in sheets.items():
            parts.append(f"Sheet: {sheet_name}\n")
            parts.append(format_table(df) + "\n\n")
    except Exception as e:
        parts.append(f"\n[Error reading Microsoft Excel {fp}: {e}]")
    return "".join(parts).strip()


def _ocr_pptx_picture(shape):
    """Return OCR text for a picture shape, or "" if the shape is not a readable picture."""
    try:
        if shape.shape_type != _PPTX_PICTURE_SHAPE_TYPE:
            return ""
        image = getattr(shape, "image", None)
        if not image:
            return ""
        with Image.open(io.BytesIO(image.blob)) as img:
            return ocr_image(img)
    except Exception:
        # Best-effort: shapes whose type or image cannot be resolved are
        # skipped instead of aborting the whole presentation.
        return ""


def extract_pptx_content(fp):
    """
    Extract text, tables, and OCR text from images in Microsoft PowerPoint (.pptx) files.

    This function:
    - Reads each slide in the presentation.
    - Collects text from shapes that expose a non-empty ``text`` attribute.
    - Performs OCR on picture shapes.
    - Formats tables found in shapes, using the first row as the header.
    - Handles exceptions and appends an error message if reading fails.

    Args:
        fp (str or Path): File path to the PowerPoint presentation.

    Returns:
        str: Combined extracted text, tables, and OCR results from all slides.
    """
    parts = []
    try:
        prs = Presentation(fp)
        for slide_number, slide in enumerate(prs.slides, 1):
            slide_texts = []
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text.strip():
                    slide_texts.append(shape.text.strip())
                ocr_text = _ocr_pptx_picture(shape)
                if ocr_text:
                    slide_texts.append(f"[OCR Text from image]:\n{ocr_text}")

            if slide_texts:
                parts.append(f"Slide {slide_number} Text:\n" + "\n".join(slide_texts) + "\n\n")
            else:
                parts.append(f"Slide {slide_number} Text:\nNo text found on this slide.\n\n")

            for shape in slide.shapes:
                if not getattr(shape, "has_table", False):
                    continue
                rows = [
                    [cell.text.strip() for cell in row.cells]
                    for row in shape.table.rows
                ]
                if rows:
                    df = _rows_to_dataframe(rows)
                    parts.append(f"Table on slide {slide_number}:\n{format_table(df)}\n\n")
    except Exception as e:
        parts.append(f"\n[Error reading Microsoft PowerPoint {fp}: {e}]")
    return "".join(parts).strip()


# File extension -> extractor. The legacy ".doc" and ".ppt" extensions are
# routed to the same extractors as their OOXML successors.
# NOTE(review): python-docx and python-pptx are documented for .docx/.pptx
# only, so legacy files presumably come back as the bracketed error message.
_EXTRACTORS = {
    ".pdf": extract_pdf_content,
    ".doc": extract_docx_content,
    ".docx": extract_docx_content,
    ".xlsx": extract_excel_content,
    ".xls": extract_excel_content,
    ".ppt": extract_pptx_content,
    ".pptx": extract_pptx_content,
}


def extract_file_content(fp):
    """
    Determine the file type based on its extension and extract text content accordingly.

    This function supports:
    - PDF files with text, tables, and OCR on images.
    - Microsoft Word documents with paragraphs, tables, and OCR on embedded images.
    - Microsoft Excel workbooks with formatted sheet tables.
    - Microsoft PowerPoint presentations with slide text, tables, and OCR on images.
    - Any other file type, which is read as plain UTF-8 text and cleaned.

    Args:
        fp (str or Path): File path to the document to extract content from.

    Returns:
        str: Extracted and cleaned text content from the file, or an error message.
    """
    extractor = _EXTRACTORS.get(Path(fp).suffix.lower())
    if extractor is not None:
        return extractor(fp)
    try:
        return clean_text(Path(fp).read_text(encoding="utf-8"))
    except Exception as e:
        return f"\n[Error reading file {fp}: {e}]"