# Source: ai/src/main/file_extractors.py
# Uploaded by hadadrjt; commit d17e7ef ("ai: Refactor the code."); file size 17.4 kB
#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#
import pdfplumber # Library to extract text and tables from PDF files
import pytesseract # OCR tool to extract text from images
import docx # Library to read Microsoft Word (.docx) files
import zipfile # To handle zipped archives, used here to access embedded images in Word files
import io # Provides tools for handling byte streams, used to open images from bytes
import pandas as pd # Data analysis library, used here to handle tables from Excel and other files
import warnings # Used to suppress warnings during Excel file reading
import re # Regular expressions for text cleaning
from openpyxl import load_workbook # Excel file reading library, used for .xlsx files
from pptx import Presentation # Library to read Microsoft PowerPoint files
from PIL import Image, ImageEnhance, ImageFilter # Image processing libraries for OCR preprocessing
from pathlib import Path # Object-oriented filesystem paths
def clean_text(text):
    """
    Normalize raw extracted text by dropping noise characters and blank lines.

    The cleaning pipeline is:
    1. Delete every character that is not an ASCII letter, a digit, whitespace,
       or one of the punctuation marks . , ? ! ( ) : ; ' " -
    2. Delete stand-alone single ASCII letters (typical OCR debris).
    3. Strip each line and discard lines that end up empty.

    Args:
        text (str): Raw text coming from any extractor.

    Returns:
        str: The surviving lines joined with newline characters.
    """
    # Step 1: keep only the whitelisted character set
    whitelisted = re.sub(r'[^a-zA-Z0-9\s.,?!():;\'"-]', '', text)
    # Step 2: drop letters that stand alone between word boundaries
    without_strays = re.sub(r'\b[a-zA-Z]\b', '', whitelisted)
    # Step 3: collect the non-empty, stripped lines
    kept_lines = []
    for raw_line in without_strays.splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept_lines.append(stripped)
    return "\n".join(kept_lines)
def format_table(df, max_rows=10):
    """
    Render a pandas DataFrame as a compact plain-text table.

    Fully empty rows and columns are discarded, remaining NaN cells are shown
    as empty strings, and only the leading ``max_rows`` rows are printed. When
    rows are left out, a trailing "... (N more rows)" line reports how many.

    Args:
        df (pandas.DataFrame): Table data to render.
        max_rows (int): Upper bound on the number of rows printed.

    Returns:
        str: The rendered table, or an empty string when nothing remains
        after the empty rows/columns are removed.
    """
    if df.empty:
        return ""
    # Drop all-NaN rows, then all-NaN columns, then blank out leftover NaNs
    trimmed = (
        df.dropna(axis=0, how='all')
        .dropna(axis=1, how='all')
        .fillna('')
    )
    if trimmed.empty:
        return ""
    # Render the leading rows without the index column
    rendered = trimmed.head(max_rows).to_string(index=False)
    # Report how many rows were left out, if any
    hidden = len(trimmed) - max_rows
    if hidden > 0:
        rendered = f"{rendered}\n... ({hidden} more rows)"
    return rendered
def preprocess_image(img):
    """
    Enhance an image to improve OCR accuracy by applying several preprocessing steps.

    The preprocessing includes:
    - Converting the image to grayscale to simplify colors.
    - Increasing contrast (factor 2) to make text stand out more.
    - Applying a median filter to reduce noise.
    - Binarizing the image with a threshold of 140 (black below, white otherwise).

    Args:
        img (PIL.Image.Image): The original image to preprocess.

    Returns:
        PIL.Image.Image: The processed image ready for OCR. If any step fails,
        the untouched input image is returned, never a partially processed
        intermediate.
    """
    try:
        # Each stage gets its own name so that `img` keeps pointing at the
        # caller's image; the error path below can then return it unchanged.
        grayscale = img.convert("L")
        # Enhance contrast by a factor of 2 to make text clearer
        contrasted = ImageEnhance.Contrast(grayscale).enhance(2)
        # Apply median filter to reduce noise and smooth the image
        denoised = contrasted.filter(ImageFilter.MedianFilter())
        # Convert image to 1-bit black and white using a threshold of 140
        return denoised.point(lambda x: 0 if x < 140 else 255, '1')
    except Exception:
        # Best effort: fall back to the original image without changes
        return img
def ocr_image(img):
    """
    Run OCR on an image and return the cleaned text.

    The image is first passed through preprocess_image, then handed to
    pytesseract using the English language data and page segmentation mode 6
    (a single uniform block of text). The raw OCR output is finally
    normalized with clean_text.

    Args:
        img (PIL.Image.Image): Image to read text from.

    Returns:
        str: Cleaned OCR text, or an empty string if any step raises.
    """
    try:
        prepared = preprocess_image(img)
        raw_text = pytesseract.image_to_string(prepared, lang='eng', config='--psm 6')
        return clean_text(raw_text)
    except Exception:
        # OCR is best effort: any failure yields no text
        return ""
def extract_pdf_content(fp):
    """
    Extract text and tables from a PDF file, including OCR on embedded images.

    This function:
    - Opens the PDF file and iterates through each page.
    - Extracts and cleans text from each page.
    - Performs OCR on images embedded in pages to extract any text within images.
    - Extracts tables from pages and formats them as readable text.
    - Handles exceptions by appending an error message to the content.

    Args:
        fp (str or Path): File path to the PDF document.

    Returns:
        str: Combined extracted text, OCR results, and formatted tables from
        the PDF, with surrounding whitespace trimmed. If reading fails, the
        content gathered so far is followed by an "[Error reading PDF ...]" note.
    """
    # Resolution (pixels per inch) used to rasterize a page for OCR crops
    ocr_resolution = 300
    # Image coordinates are reported in PDF points (72 per inch) while the
    # rendered page is in pixels, so boxes must be scaled before cropping.
    scale = ocr_resolution / 72.0
    parts = []
    try:
        with pdfplumber.open(fp) as pdf:
            for page_no, page in enumerate(pdf.pages, 1):
                # Extract text from the current page, defaulting to empty string if None
                text = page.extract_text() or ""
                parts.append(f"Page {page_no} Text:\n{clean_text(text)}\n\n")
                # If there are images on the page, perform OCR on each
                if page.images:
                    # Rasterize the page once and crop every image out of it
                    rendered = page.to_image(resolution=ocr_resolution).original
                    for img in page.images:
                        # Convert the box from points to pixels and clamp it
                        # to the bounds of the rendered page
                        left = max(0, int(float(img["x0"]) * scale))
                        upper = max(0, int(float(img["top"]) * scale))
                        right = min(rendered.width, int(round(float(img["x1"]) * scale)))
                        lower = min(rendered.height, int(round(float(img["bottom"]) * scale)))
                        if right <= left or lower <= upper:
                            # Nothing visible to crop (off-page or zero-area image)
                            continue
                        cropped = rendered.crop((left, upper, right, lower))
                        # Perform OCR on the cropped image
                        ocr_text = ocr_image(cropped)
                        if ocr_text:
                            parts.append(
                                f"[OCR Text from image on page {page_no}]:\n{ocr_text}\n\n"
                            )
                # Extract tables from the page
                tables = page.extract_tables()
                for idx, table in enumerate(tables, 1):
                    if table:
                        # Convert table list to DataFrame using first row as header
                        df = pd.DataFrame(table[1:], columns=table[0])
                        parts.append(
                            f"Table {idx} on page {page_no}:\n{format_table(df)}\n\n"
                        )
    except Exception as e:
        # Keep whatever was extracted and append the failure reason
        parts.append(f"\n[Error reading PDF {fp}: {e}]")
    # Return the combined content with whitespace trimmed
    return "".join(parts).strip()
def extract_docx_content(fp):
    """
    Pull paragraphs, tables, and image text out of a Microsoft Word (.docx) file.

    Processing order:
    1. Non-empty paragraphs are collected under a "Paragraphs:" heading.
    2. Every table is turned into a DataFrame (first row used as header) and
       rendered with format_table.
    3. The file is reopened as a zip archive and each entry under
       "word/media/" is opened as an image and run through OCR.

    Failures on an individual embedded image are ignored; any other failure
    appends an error note to whatever was extracted so far.

    Args:
        fp (str or Path): File path to the Word document.

    Returns:
        str: The combined paragraphs, tables, and OCR text, whitespace-trimmed.
    """
    pieces = []
    try:
        document = docx.Document(fp)

        # 1. Paragraph text, skipping blank paragraphs
        body_lines = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                body_lines.append(stripped)
        if body_lines:
            pieces.append("Paragraphs:\n" + "\n".join(body_lines) + "\n\n")

        # 2. Build all table frames first, then render them in order
        frames = []
        for tbl in document.tables:
            grid = [[cell.text.strip() for cell in row.cells] for row in tbl.rows]
            if grid:
                frames.append(pd.DataFrame(grid[1:], columns=grid[0]))
        for number, frame in enumerate(frames, 1):
            pieces.append(f"Table {number}:\n{format_table(frame)}\n\n")

        # 3. OCR over the media files stored inside the .docx archive
        with zipfile.ZipFile(fp) as archive:
            for member in archive.namelist():
                if not member.startswith("word/media/"):
                    continue
                blob = archive.read(member)
                try:
                    picture = Image.open(io.BytesIO(blob))
                    recognised = ocr_image(picture)
                    if recognised:
                        pieces.append(f"[OCR Text from embedded image]:\n{recognised}\n\n")
                except Exception:
                    # A single unreadable media file must not stop the extraction
                    pass
    except Exception as exc:
        pieces.append(f"\n[Error reading Microsoft Word {fp}: {exc}]")
    return "".join(pieces).strip()
def extract_excel_content(fp):
    """
    Extract readable table content from Microsoft Excel files (.xlsx, .xls).

    This function:
    - Reads all sheets in the Excel file.
    - Converts each sheet to a formatted table string.
    - Suppresses warnings during reading to avoid clutter.
    - Does not attempt to extract images.
    - Handles exceptions by appending an error message.

    Args:
        fp (str or Path): File path to the Excel workbook.

    Returns:
        str: Combined formatted tables from all sheets in the workbook, with
        surrounding whitespace trimmed. If reading fails, an
        "[Error reading Microsoft Excel ...]" note is returned instead.
    """
    parts = []
    try:
        # openpyxl only understands the zip-based formats (.xlsx and friends).
        # For legacy .xls files let pandas pick a suitable engine itself
        # rather than forcing a reader that is guaranteed to fail.
        engine = None if str(fp).lower().endswith(".xls") else "openpyxl"
        # Suppress warnings such as openpyxl deprecation or data type warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # sheet_name=None loads every sheet into a {name: DataFrame} dict
            sheets = pd.read_excel(fp, sheet_name=None, engine=engine)
            # Iterate over each sheet and format its content
            for sheet_name, df in sheets.items():
                parts.append(f"Sheet: {sheet_name}\n")
                parts.append(format_table(df) + "\n\n")
    except Exception as e:
        # Append error message if Excel reading fails
        parts.append(f"\n[Error reading Microsoft Excel {fp}: {e}]")
    # Return combined sheet contents trimmed of whitespace
    return "".join(parts).strip()
def extract_pptx_content(fp):
    """
    Pull slide text, tables, and image text out of a PowerPoint (.pptx) file.

    For every slide (numbered from 1):
    - The stripped text of each shape exposing a ``text`` attribute is collected.
    - Shapes whose ``shape_type`` equals 13 and that carry an image are run
      through OCR; OCR failures on a single image are ignored.
    - A "Slide N Text:" section is emitted, with a placeholder sentence when
      nothing was found.
    - In a second pass, every shape holding a table is converted to a
      DataFrame (first row used as header) and rendered with format_table.

    Any other failure appends an error note to what was extracted so far.

    Args:
        fp (str or Path): File path to the PowerPoint presentation.

    Returns:
        str: The combined slide text, tables, and OCR output, whitespace-trimmed.
    """
    chunks = []
    try:
        deck = Presentation(fp)
        for slide_no, slide in enumerate(deck.slides, 1):
            found = []
            # First pass: plain text and picture OCR
            for shp in slide.shapes:
                if hasattr(shp, "text"):
                    caption = shp.text.strip()
                    if caption:
                        found.append(caption)
                if not (shp.shape_type == 13 and hasattr(shp, "image") and shp.image):
                    continue
                try:
                    picture = Image.open(io.BytesIO(shp.image.blob))
                    recognised = ocr_image(picture)
                    if recognised:
                        found.append(f"[OCR Text from image]:\n{recognised}")
                except Exception:
                    # A single unreadable picture must not stop the slide
                    pass

            # Emit the slide section, falling back to a placeholder sentence
            body = "\n".join(found) if found else "No text found on this slide."
            chunks.append(f"Slide {slide_no} Text:\n{body}\n\n")

            # Second pass: tables
            for shp in slide.shapes:
                if not shp.has_table:
                    continue
                grid = [
                    [cell.text.strip() for cell in row.cells]
                    for row in shp.table.rows
                ]
                if grid:
                    frame = pd.DataFrame(grid[1:], columns=grid[0])
                    chunks.append(f"Table on slide {slide_no}:\n{format_table(frame)}\n\n")
    except Exception as exc:
        chunks.append(f"\n[Error reading Microsoft PowerPoint {fp}: {exc}]")
    return "".join(chunks).strip()
def extract_file_content(fp):
    """
    Route a file to the matching extractor based on its extension.

    Dispatch rules (extension compared case-insensitively):
    - ".pdf"            -> extract_pdf_content
    - ".doc", ".docx"   -> extract_docx_content
    - ".xlsx", ".xls"   -> extract_excel_content
    - ".ppt", ".pptx"   -> extract_pptx_content
    - anything else     -> read as UTF-8 text and pass through clean_text

    Args:
        fp (str or Path): File path to the document to extract content from.

    Returns:
        str: The extracted content. For the plain-text fallback, an
        "[Error reading file ...]" message is returned if reading fails.
    """
    suffix = Path(fp).suffix.lower()

    if suffix == ".pdf":
        return extract_pdf_content(fp)
    if suffix in (".doc", ".docx"):
        return extract_docx_content(fp)
    if suffix in (".xlsx", ".xls"):
        return extract_excel_content(fp)
    if suffix in (".ppt", ".pptx"):
        return extract_pptx_content(fp)

    # Unknown extension: treat the file as plain UTF-8 text
    try:
        raw = Path(fp).read_text(encoding="utf-8")
        return clean_text(raw)
    except Exception as exc:
        return f"\n[Error reading file {fp}: {exc}]"