File size: 17,376 Bytes
f99ad65
 
 
 
 
d17e7ef
 
 
 
 
 
 
 
f99ad65
d17e7ef
 
 
 
f99ad65
 
d17e7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f99ad65
d17e7ef
f99ad65
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
d17e7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f99ad65
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
d17e7ef
f99ad65
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
 
 
d17e7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f99ad65
d17e7ef
 
 
f99ad65
d17e7ef
 
 
 
f99ad65
 
 
d17e7ef
f99ad65
 
 
d17e7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
f99ad65
d17e7ef
f99ad65
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
d17e7ef
f99ad65
 
 
 
d17e7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
f99ad65
 
 
 
 
d17e7ef
f99ad65
d17e7ef
f99ad65
d17e7ef
f99ad65
d17e7ef
f99ad65
 
d17e7ef
f99ad65
d17e7ef
f99ad65
d17e7ef
f99ad65
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
 
d17e7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
f99ad65
 
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
d17e7ef
f99ad65
 
 
 
d17e7ef
f99ad65
 
 
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
 
d17e7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
f99ad65
 
 
d17e7ef
f99ad65
d17e7ef
 
f99ad65
d17e7ef
f99ad65
 
 
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
 
d17e7ef
 
 
 
 
 
 
 
 
 
 
 
 
f99ad65
 
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
 
d17e7ef
f99ad65
 
 
 
d17e7ef
f99ad65
 
 
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
 
 
d17e7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
f99ad65
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
d17e7ef
f99ad65
 
 
d17e7ef
f99ad65
d17e7ef
f99ad65
 
d17e7ef
f99ad65
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#

import pdfplumber  # Library to extract text and tables from PDF files
import pytesseract  # OCR tool to extract text from images
import docx  # Library to read Microsoft Word (.docx) files
import zipfile  # To handle zipped archives, used here to access embedded images in Word files
import io  # Provides tools for handling byte streams, used to open images from bytes
import pandas as pd  # Data analysis library, used here to handle tables from Excel and other files
import warnings  # Used to suppress warnings during Excel file reading
import re  # Regular expressions for text cleaning

from openpyxl import load_workbook  # Excel file reading library, used for .xlsx files
from pptx import Presentation  # Library to read Microsoft PowerPoint files
from PIL import Image, ImageEnhance, ImageFilter  # Image processing libraries for OCR preprocessing
from pathlib import Path  # Object-oriented filesystem paths

def clean_text(text):
    """
    Normalize raw extracted text by stripping noise characters and blank lines.

    Processing order:
    - Every character outside ASCII letters, digits, whitespace, and the
      punctuation set . , ? ! ( ) : ; ' " - is deleted.
    - Any single ASCII letter standing alone between word boundaries is deleted
      (this also removes legitimate one-letter words such as "a" or "I").
    - Each remaining line is stripped; lines that end up empty are discarded.

    Args:
        text (str): Raw extracted text from any source.

    Returns:
        str: The surviving lines joined with newline characters.
    """
    # Keep only the whitelisted character set
    whitelisted = re.sub(r'[^a-zA-Z0-9\s.,?!():;\'"-]', '', text)
    # Drop stand-alone single letters, which are treated as OCR noise
    denoised = re.sub(r'\b[a-zA-Z]\b', '', whitelisted)
    kept_lines = []
    for raw_line in denoised.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)
    return "\n".join(kept_lines)

def format_table(df, max_rows=10):
    """
    Render a pandas DataFrame as a compact plain-text table.

    Steps performed:
    - Rows that are entirely NaN are dropped, then columns that are entirely NaN.
    - Remaining NaN cells are replaced with empty strings.
    - Only the first ``max_rows`` rows are rendered, without the index column.
    - When rows were left out, a trailing "... (N more rows)" line is added.

    Args:
        df (pandas.DataFrame): The table data to format.
        max_rows (int): Maximum number of rows to render.

    Returns:
        str: The rendered table, or an empty string when the frame has no data
        before or after the all-NaN rows/columns are removed.
    """
    if df.empty:
        return ""
    # Strip fully-empty rows first, then fully-empty columns
    trimmed = df.dropna(axis=0, how='all')
    trimmed = trimmed.dropna(axis=1, how='all')
    # Blank out whatever NaN cells are left so they do not print as "NaN"
    trimmed = trimmed.fillna('')
    if trimmed.empty:
        return ""
    rendered = trimmed.head(max_rows).to_string(index=False)
    hidden_rows = len(trimmed) - max_rows
    if hidden_rows > 0:
        # Tell the reader how many rows were cut off
        return rendered + f"\n... ({hidden_rows} more rows)"
    return rendered

def preprocess_image(img):
    """
    Enhance an image to improve OCR accuracy by applying several preprocessing steps.

    The preprocessing includes:
    - Converting the image to grayscale to simplify colors.
    - Doubling the contrast to make text stand out more.
    - Applying a median filter to reduce noise.
    - Binarizing the image with a fixed threshold of 140.

    Args:
        img (PIL.Image.Image): The original image to preprocess.

    Returns:
        PIL.Image.Image: The processed image ready for OCR.
        If any step fails, the untouched original image is returned.
    """
    try:
        # Work on a separate name so ``img`` keeps pointing at the caller's
        # image; otherwise a failure in a later step would hand back a
        # half-processed intermediate instead of the original.
        processed = img.convert("L")
        # Enhance contrast by a factor of 2 to make text clearer
        processed = ImageEnhance.Contrast(processed).enhance(2)
        # Median filter smooths out speckle noise
        processed = processed.filter(ImageFilter.MedianFilter())
        # Pixels darker than 140 become black, everything else white (1-bit image)
        return processed.point(lambda x: 0 if x < 140 else 255, '1')
    except Exception:
        # Best effort: OCR can still be attempted on the unmodified input
        return img

def ocr_image(img):
    """
    Run OCR on an image and return the cleaned-up text.

    The image is first passed through preprocess_image, then handed to
    pytesseract using the English language data and page segmentation mode 6
    (a single uniform block of text). The recognized text is finally
    normalized with clean_text.

    Args:
        img (PIL.Image.Image): The image from which to extract text.

    Returns:
        str: The cleaned OCR text, or an empty string if any step raises.
    """
    try:
        prepared = preprocess_image(img)
        # '--psm 6' tells tesseract to assume one uniform block of text
        recognized = pytesseract.image_to_string(prepared, lang='eng', config='--psm 6')
        return clean_text(recognized)
    except Exception:
        # OCR is best effort: any failure simply yields no text
        return ""

def extract_pdf_content(fp):
    """
    Extract text and tables from a PDF file, including OCR on embedded images.

    This function:
    - Opens the PDF file and iterates through each page.
    - Extracts and cleans text from each page.
    - Renders pages that contain images at 300 dpi, crops each image region
      out of the rendering, and runs OCR on it.
    - Extracts tables from pages and formats them as readable text.
    - On any exception, stops and appends an error message to whatever was
      extracted so far.

    Args:
        fp (str or Path): File path to the PDF document.

    Returns:
        str: Combined extracted text, OCR results, and formatted tables from the PDF.
    """
    # Collect fragments and join once at the end instead of repeated string +=
    parts = []
    try:
        with pdfplumber.open(fp) as pdf:
            for i, page in enumerate(pdf.pages, 1):
                # Extract text from the current page, defaulting to empty string if None
                text = page.extract_text() or ""
                parts.append(f"Page {i} Text:\n{clean_text(text)}\n\n")
                if page.images:
                    # Bitmap of the whole page rendered at 300 dpi
                    rendered = page.to_image(resolution=300).original
                    # Image positions are reported in PDF points relative to the
                    # page box, while the bitmap is measured in pixels. The box
                    # must be shifted to the page origin and scaled before
                    # cropping, otherwise the wrong region is OCR'd.
                    page_x0 = float(page.bbox[0])
                    page_top = float(page.bbox[1])
                    scale_x = rendered.width / float(page.width)
                    scale_y = rendered.height / float(page.height)
                    for img in page.images:
                        left = max(0, round((float(img["x0"]) - page_x0) * scale_x))
                        upper = max(0, round((float(img["top"]) - page_top) * scale_y))
                        right = min(rendered.width, round((float(img["x1"]) - page_x0) * scale_x))
                        lower = min(rendered.height, round((float(img["bottom"]) - page_top) * scale_y))
                        # Skip images that fall outside the page or have no area
                        if right <= left or lower <= upper:
                            continue
                        cropped = rendered.crop((left, upper, right, lower))
                        ocr_text = ocr_image(cropped)
                        if ocr_text:
                            parts.append(f"[OCR Text from image on page {i}]:\n{ocr_text}\n\n")
                # Extract tables from the page
                tables = page.extract_tables()
                for idx, table in enumerate(tables, 1):
                    if table:
                        # First row is used as the header, the rest as data
                        df = pd.DataFrame(table[1:], columns=table[0])
                        parts.append(f"Table {idx} on page {i}:\n{format_table(df)}\n\n")
    except Exception as e:
        # Keep what was extracted so far and record why extraction stopped
        parts.append(f"\n[Error reading PDF {fp}: {e}]")
    # Return the combined content with whitespace trimmed
    return "".join(parts).strip()

def extract_docx_content(fp):
    """
    Pull paragraphs, tables, and image OCR text out of a Microsoft Word (.docx) file.

    What happens, in order:
    - Non-empty paragraphs are collected (stripped) under a "Paragraphs:" heading.
    - Every table is turned into a DataFrame, using its first row as the
      header, and rendered with format_table.
    - The file is reopened as a zip archive and every entry under
      ``word/media/`` is opened as an image and run through OCR; images that
      fail to open or process are skipped silently.
    - Any other exception ends extraction and an error message is appended to
      whatever was gathered so far.

    Args:
        fp (str or Path): File path to the Word document.

    Returns:
        str: Combined paragraphs, tables, and OCR text, stripped of surrounding whitespace.
    """
    pieces = []
    try:
        document = docx.Document(fp)

        # Paragraph text, skipping blank paragraphs
        para_texts = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                para_texts.append(stripped)
        if para_texts:
            pieces.append("Paragraphs:\n" + "\n".join(para_texts) + "\n\n")

        # Build every table's DataFrame first, then render them in order
        frames = []
        for table in document.tables:
            grid = [[cell.text.strip() for cell in row.cells] for row in table.rows]
            if grid:
                # First row supplies the column names
                frames.append(pd.DataFrame(grid[1:], columns=grid[0]))
        for number, frame in enumerate(frames, 1):
            pieces.append(f"Table {number}:\n{format_table(frame)}\n\n")

        # A .docx is a zip archive; embedded pictures live under word/media/
        with zipfile.ZipFile(fp) as archive:
            for member in archive.namelist():
                if not member.startswith("word/media/"):
                    continue
                blob = archive.read(member)
                try:
                    picture = Image.open(io.BytesIO(blob))
                    recognized = ocr_image(picture)
                    if recognized:
                        pieces.append(f"[OCR Text from embedded image]:\n{recognized}\n\n")
                except Exception:
                    # A media entry that cannot be read as an image is skipped
                    pass
    except Exception as err:
        pieces.append(f"\n[Error reading Microsoft Word {fp}: {err}]")
    return "".join(pieces).strip()

def extract_excel_content(fp):
    """
    Extract readable table content from Microsoft Excel files (.xlsx, .xls).

    This function:
    - Reads all sheets in the Excel file.
    - Converts each sheet to a formatted table string.
    - Suppresses warnings during reading to avoid clutter.
    - Does not attempt to extract images.
    - Handles exceptions by appending error messages.

    Args:
        fp (str or Path): File path to the Excel workbook.

    Returns:
        str: Combined formatted tables from all sheets in the workbook.
    """
    content = ""
    try:
        # openpyxl cannot read the legacy binary .xls format. For such paths
        # let pandas choose the engine itself; everything else keeps using
        # openpyxl exactly as before.
        is_legacy_xls = isinstance(fp, (str, Path)) and Path(fp).suffix.lower() == ".xls"
        engine = None if is_legacy_xls else 'openpyxl'
        # Suppress warnings emitted while the workbook is being parsed
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # sheet_name=None returns a dict mapping sheet name -> DataFrame
            sheets = pd.read_excel(fp, sheet_name=None, engine=engine)
        # Iterate over each sheet and format its content
        for sheet_name, df in sheets.items():
            content += f"Sheet: {sheet_name}\n"
            content += format_table(df) + "\n\n"
    except Exception as e:
        # Append error message if Excel reading fails
        content += f"\n[Error reading Microsoft Excel {fp}: {e}]"
    # Return combined sheet contents trimmed of whitespace
    return content.strip()

def extract_pptx_content(fp):
    """
    Extract text, tables, and OCR text from images in Microsoft PowerPoint (.pptx) files.

    This function:
    - Reads each slide in the presentation.
    - Extracts text from shapes and tables on each slide.
    - Performs OCR on picture shapes; a picture whose image cannot be
      obtained or processed is skipped without aborting the presentation.
    - Handles other exceptions by appending an error message to the content
      gathered so far.

    Args:
        fp (str or Path): File path to the PowerPoint presentation.

    Returns:
        str: Combined extracted text, tables, and OCR results from all slides.
    """
    content = ""
    # Numeric shape_type value the original code uses to recognize pictures
    picture_shape_type = 13
    try:
        # Load the PowerPoint presentation
        prs = Presentation(fp)
        # Iterate through each slide by index starting at 1
        for i, slide in enumerate(prs.slides, 1):
            slide_texts = []
            for shape in slide.shapes:
                # Collect text from any shape exposing a non-blank ``text`` attribute
                if hasattr(shape, "text") and shape.text.strip():
                    slide_texts.append(shape.text.strip())
                if shape.shape_type == picture_shape_type:
                    try:
                        # Accessing ``shape.image`` can itself raise (e.g. for a
                        # picture with no embedded image), so it must happen
                        # inside the try rather than in the ``if`` condition.
                        image = shape.image
                        if image:
                            img = Image.open(io.BytesIO(image.blob))
                            ocr_text = ocr_image(img)
                            if ocr_text:
                                slide_texts.append(f"[OCR Text from image]:\n{ocr_text}")
                    except Exception:
                        # Best effort: skip this picture and keep processing the slide
                        pass
            # Add slide text or note if no text found
            if slide_texts:
                content += f"Slide {i} Text:\n" + "\n".join(slide_texts) + "\n\n"
            else:
                content += f"Slide {i} Text:\nNo text found on this slide.\n\n"
            # Second pass so tables are listed after the slide's text
            for shape in slide.shapes:
                if shape.has_table:
                    rows = [
                        [cell.text.strip() for cell in row.cells]
                        for row in shape.table.rows
                    ]
                    if rows:
                        # Convert rows to DataFrame using first row as header
                        df = pd.DataFrame(rows[1:], columns=rows[0])
                        content += f"Table on slide {i}:\n{format_table(df)}\n\n"
    except Exception as e:
        # Append error message if PowerPoint reading fails
        content += f"\n[Error reading Microsoft PowerPoint {fp}: {e}]"
    # Return combined slide content trimmed of whitespace
    return content.strip()

def extract_file_content(fp):
    """
    Determine the file type based on its extension and extract text content accordingly.

    Dispatch by lower-cased extension:
    - ``.pdf`` -> extract_pdf_content
    - ``.doc`` / ``.docx`` -> extract_docx_content
    - ``.xlsx`` / ``.xls`` -> extract_excel_content
    - ``.ppt`` / ``.pptx`` -> extract_pptx_content
    - anything else is read as plain UTF-8 text and passed through clean_text.

    NOTE(review): legacy ``.doc`` / ``.ppt`` files are sent to the same readers
    as their modern counterparts; presumably those readers reject them and the
    result is their error message - confirm.

    Args:
        fp (str or Path): File path to the document to extract content from.

    Returns:
        str: Extracted and cleaned text content from the file, or a bracketed
        error message (without surrounding whitespace) if the plain-text
        fallback cannot read the file.
    """
    # Get the file extension in lowercase to identify file type
    ext = Path(fp).suffix.lower()
    if ext == ".pdf":
        return extract_pdf_content(fp)
    if ext in (".doc", ".docx"):
        return extract_docx_content(fp)
    if ext in (".xlsx", ".xls"):
        return extract_excel_content(fp)
    if ext in (".ppt", ".pptx"):
        return extract_pptx_content(fp)
    try:
        # Unknown extension: attempt to read the file as plain UTF-8 text
        return clean_text(Path(fp).read_text(encoding="utf-8"))
    except Exception as e:
        # No leading newline here: nothing precedes the message, and the other
        # extractors also return their error text without surrounding whitespace
        return f"[Error reading file {fp}: {e}]"