Spaces:
Running
Running
# | |
# SPDX-FileCopyrightText: Hadad <[email protected]> | |
# SPDX-License-Identifier: Apache-2.0 | |
# | |
import pdfplumber # Library to extract text and tables from PDF files | |
import pytesseract # OCR tool to extract text from images | |
import docx # Library to read Microsoft Word (.docx) files | |
import zipfile # To handle zipped archives, used here to access embedded images in Word files | |
import io # Provides tools for handling byte streams, used to open images from bytes | |
import pandas as pd # Data analysis library, used here to handle tables from Excel and other files | |
import warnings # Used to suppress warnings during Excel file reading | |
import re # Regular expressions for text cleaning | |
from openpyxl import load_workbook # Excel file reading library, used for .xlsx files | |
from pptx import Presentation # Library to read Microsoft PowerPoint files | |
from PIL import Image, ImageEnhance, ImageFilter # Image processing libraries for OCR preprocessing | |
from pathlib import Path # Object-oriented filesystem paths | |
def clean_text(text):
    """
    Normalize raw extracted text by filtering characters and tidying lines.

    The cleaning pipeline runs in this order:
    - Drop every character that is not an ASCII letter, a digit, whitespace,
      or one of the punctuation marks . , ? ! ( ) : ; ' " -
    - Drop every ASCII letter that stands alone between word boundaries
      (this also removes one-letter words).
    - Strip each line and discard lines that end up empty.

    Args:
        text (str): Raw text from any extractor.
    Returns:
        str: The surviving lines joined with newline characters.
    """
    # Step 1: keep only the whitelisted character set
    whitelisted = re.sub(r'[^a-zA-Z0-9\s.,?!():;\'"-]', '', text)
    # Step 2: remove letters that are isolated by word boundaries on both sides
    without_singletons = re.sub(r'\b[a-zA-Z]\b', '', whitelisted)
    # Step 3: trim every line lazily, then keep only the non-empty ones
    trimmed_lines = (raw_line.strip() for raw_line in without_singletons.splitlines())
    return "\n".join(kept for kept in trimmed_lines if kept)
def format_table(df, max_rows=10):
    """
    Render a pandas DataFrame as plain text, truncated to a row limit.

    Rows and columns consisting solely of NaN are dropped, remaining NaN
    values are shown as empty strings, and the row index is omitted. When the
    cleaned table has more than ``max_rows`` rows, a trailing line reports how
    many rows were left out.

    Args:
        df (pandas.DataFrame): Table to render.
        max_rows (int): Number of leading rows to include in the output.
    Returns:
        str: The rendered table, or an empty string when the table has no
        data before or after cleaning.
    """
    if df.empty:
        return ""
    # Drop fully-empty rows first, then fully-empty columns, then blank out leftovers
    trimmed = df.dropna(axis=0, how='all')
    trimmed = trimmed.dropna(axis=1, how='all')
    trimmed = trimmed.fillna('')
    if trimmed.empty:
        return ""
    rendered = trimmed.head(max_rows).to_string(index=False)
    # Number of rows that did not fit within the display limit
    hidden_rows = len(trimmed) - max_rows
    if hidden_rows > 0:
        return rendered + f"\n... ({hidden_rows} more rows)"
    return rendered
def preprocess_image(img, contrast=2, threshold=140):
    """
    Enhance an image for OCR: grayscale, contrast boost, denoise, binarize.

    The steps are applied in order:
    - Convert to grayscale (mode "L").
    - Multiply contrast by ``contrast``.
    - Apply a median filter to reduce noise.
    - Binarize: pixels below ``threshold`` become 0, all others 255.

    Args:
        img (PIL.Image.Image): The image to preprocess.
        contrast (float): Factor handed to the contrast enhancer. Defaults
            to 2, the value previously hard-coded.
        threshold (int): Gray level used as the black/white cut-off. Defaults
            to 140, the value previously hard-coded.
    Returns:
        PIL.Image.Image: The processed image, or the untouched input image if
        any step raises.
    """
    # Keep a handle on the caller's image so a failure part-way through the
    # pipeline returns the original, not a half-processed intermediate.
    original = img
    try:
        gray = img.convert("L")
        boosted = ImageEnhance.Contrast(gray).enhance(contrast)
        denoised = boosted.filter(ImageFilter.MedianFilter())
        # '1' selects a 1-bit output image for the thresholded result
        return denoised.point(lambda level: 0 if level < threshold else 255, '1')
    except Exception:
        # Best-effort: OCR can still be attempted on the unprocessed image
        return original
def ocr_image(img):
    """
    Run OCR on an image and return the cleaned text.

    The image is passed through preprocess_image, recognized by pytesseract
    (English, page segmentation mode 6), and the result is normalized with
    clean_text.

    Args:
        img (PIL.Image.Image): Image to read text from.
    Returns:
        str: Cleaned OCR text, or an empty string if any step raises.
    """
    try:
        prepared = preprocess_image(img)
        recognized = pytesseract.image_to_string(prepared, lang='eng', config='--psm 6')
        return clean_text(recognized)
    except Exception:
        # OCR is best-effort; a failure simply contributes no text
        return ""
def extract_pdf_content(fp):
    """
    Extract text, image OCR text, and tables from every page of a PDF.

    For each page, in order:
    - The page text is cleaned with clean_text and added under a
      "Page N Text:" header.
    - If the page contains images, the page is rendered at 300 dpi and each
      image region is cropped from that bitmap and passed to ocr_image.
    - Each non-empty table is converted to a DataFrame (first row as header)
      and rendered with format_table.

    Any exception stops extraction; the content gathered so far is returned
    with an error note appended.

    Args:
        fp (str or Path): File path to the PDF document.
    Returns:
        str: Combined page text, OCR results, and tables, stripped of
        leading and trailing whitespace.
    """
    render_dpi = 300
    parts = []
    try:
        with pdfplumber.open(fp) as pdf:
            for page_no, page in enumerate(pdf.pages, 1):
                # extract_text() may return None for pages without a text layer
                text = page.extract_text() or ""
                parts.append(f"Page {page_no} Text:\n{clean_text(text)}\n\n")
                if page.images:
                    rendered = page.to_image(resolution=render_dpi).original
                    # page.images reports boxes in PDF points, while the
                    # rendered bitmap is measured in pixels. Scale the box by
                    # the pixels-per-point ratio so the crop lands on the
                    # actual image region.
                    scale_x = rendered.width / float(page.width)
                    scale_y = rendered.height / float(page.height)
                    for img in page.images:
                        bbox = (
                            float(img["x0"]) * scale_x,
                            float(img["top"]) * scale_y,
                            float(img["x1"]) * scale_x,
                            float(img["bottom"]) * scale_y,
                        )
                        ocr_text = ocr_image(rendered.crop(bbox))
                        if ocr_text:
                            parts.append(
                                f"[OCR Text from image on page {page_no}]:\n{ocr_text}\n\n"
                            )
                for idx, table in enumerate(page.extract_tables(), 1):
                    if table:
                        # First row is treated as the header row
                        df = pd.DataFrame(table[1:], columns=table[0])
                        parts.append(
                            f"Table {idx} on page {page_no}:\n{format_table(df)}\n\n"
                        )
    except Exception as e:
        # Keep whatever was extracted and report the failure inline
        parts.append(f"\n[Error reading PDF {fp}: {e}]")
    return "".join(parts).strip()
def extract_docx_content(fp):
    """
    Collect paragraphs, tables, and embedded-image OCR text from a .docx file.

    Output is assembled in three stages:
    - Non-empty paragraphs (whitespace-stripped) under a "Paragraphs:" header.
    - Every table, converted to a DataFrame with its first row as header and
      rendered through format_table.
    - OCR text for each file under word/media/ inside the .docx zip archive;
      images that fail to open or OCR are skipped silently.

    Any other exception stops extraction; the content gathered so far is
    returned with an error note appended.

    Args:
        fp (str or Path): File path to the Word document.
    Returns:
        str: Combined paragraphs, tables, and OCR text, stripped of leading
        and trailing whitespace.
    """
    chunks = []
    try:
        document = docx.Document(fp)

        # Stage 1: paragraphs
        body_lines = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                body_lines.append(stripped)
        if body_lines:
            chunks.append("Paragraphs:\n" + "\n".join(body_lines) + "\n\n")

        # Stage 2: build all DataFrames first, then render them
        frames = []
        for tbl in document.tables:
            grid = [[cell.text.strip() for cell in row.cells] for row in tbl.rows]
            if grid:
                frames.append(pd.DataFrame(grid[1:], columns=grid[0]))
        for number, frame in enumerate(frames, 1):
            chunks.append(f"Table {number}:\n{format_table(frame)}\n\n")

        # Stage 3: embedded media, read straight from the zip container
        with zipfile.ZipFile(fp) as archive:
            for member in archive.namelist():
                if not member.startswith("word/media/"):
                    continue
                payload = archive.read(member)
                try:
                    picture = Image.open(io.BytesIO(payload))
                    recognized = ocr_image(picture)
                    if recognized:
                        chunks.append(f"[OCR Text from embedded image]:\n{recognized}\n\n")
                except Exception:
                    # A media entry that cannot be read as an image is skipped
                    pass
    except Exception as e:
        chunks.append(f"\n[Error reading Microsoft Word {fp}: {e}]")
    return "".join(chunks).strip()
def extract_excel_content(fp):
    """
    Extract every sheet of an Excel workbook as formatted table text.

    Each sheet is read into a DataFrame and rendered with format_table under
    a "Sheet: <name>" header. Warnings raised while reading are suppressed.
    Images are not extracted.

    Files with a ``.xls`` suffix are read with the engine pandas selects
    itself, because openpyxl cannot parse that legacy format (pandas may need
    an extra reader package such as xlrd for it). All other files are read
    with openpyxl, as before.

    Any exception stops extraction; the content gathered so far is returned
    with an error note appended.

    Args:
        fp (str or Path): File path to the Excel workbook.
    Returns:
        str: Combined formatted tables from all sheets, stripped of leading
        and trailing whitespace.
    """
    content = ""
    try:
        # engine=None lets pandas choose the reader from the file itself
        engine = None if Path(fp).suffix.lower() == ".xls" else "openpyxl"
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # sheet_name=None returns a dict mapping sheet names to DataFrames
            sheets = pd.read_excel(fp, sheet_name=None, engine=engine)
        for sheet_name, df in sheets.items():
            content += f"Sheet: {sheet_name}\n"
            content += format_table(df) + "\n\n"
    except Exception as e:
        # Keep whatever was extracted and report the failure inline
        content += f"\n[Error reading Microsoft Excel {fp}: {e}]"
    return content.strip()
def extract_pptx_content(fp):
    """
    Collect slide text, picture OCR text, and tables from a .pptx file.

    Each slide is processed in two passes over its shapes:
    - First pass: gather stripped text from shapes exposing a ``text``
      attribute, and OCR text from shapes whose shape_type equals 13 and that
      carry an image. The result is written under a "Slide N Text:" header,
      or a placeholder sentence when nothing was found.
    - Second pass: every shape with a table is converted to a DataFrame
      (first row as header) and rendered through format_table.

    Picture OCR failures are skipped silently. Any other exception stops
    extraction; the content gathered so far is returned with an error note
    appended.

    Args:
        fp (str or Path): File path to the PowerPoint presentation.
    Returns:
        str: Combined slide text, OCR results, and tables, stripped of
        leading and trailing whitespace.
    """
    pieces = []
    try:
        deck = Presentation(fp)
        for slide_no, slide in enumerate(deck.slides, 1):
            # Pass 1: plain text and OCR text
            fragments = []
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    caption = shape.text.strip()
                    if caption:
                        fragments.append(caption)
                is_picture = shape.shape_type == 13 and hasattr(shape, "image") and shape.image
                if not is_picture:
                    continue
                try:
                    picture = Image.open(io.BytesIO(shape.image.blob))
                    recognized = ocr_image(picture)
                    if recognized:
                        fragments.append(f"[OCR Text from image]:\n{recognized}")
                except Exception:
                    # A picture that cannot be decoded contributes no text
                    pass
            if fragments:
                pieces.append(f"Slide {slide_no} Text:\n" + "\n".join(fragments) + "\n\n")
            else:
                pieces.append(f"Slide {slide_no} Text:\nNo text found on this slide.\n\n")

            # Pass 2: tables, emitted after the slide's text block
            for shape in slide.shapes:
                if not shape.has_table:
                    continue
                grid = [[cell.text.strip() for cell in row.cells] for row in shape.table.rows]
                if grid:
                    frame = pd.DataFrame(grid[1:], columns=grid[0])
                    pieces.append(f"Table on slide {slide_no}:\n{format_table(frame)}\n\n")
    except Exception as e:
        pieces.append(f"\n[Error reading Microsoft PowerPoint {fp}: {e}]")
    return "".join(pieces).strip()
def extract_file_content(fp):
    """
    Route a file to the matching extractor based on its lowercase extension.

    Routing:
    - ``.pdf`` -> extract_pdf_content
    - ``.doc`` / ``.docx`` -> extract_docx_content
    - ``.xlsx`` / ``.xls`` -> extract_excel_content
    - ``.ppt`` / ``.pptx`` -> extract_pptx_content
    - anything else is read as UTF-8 text and passed through clean_text

    Args:
        fp (str or Path): File path to the document.
    Returns:
        str: The extracted content, or an error message if the plain-text
        fallback fails.
    """
    suffix = Path(fp).suffix.lower()

    if suffix == ".pdf":
        return extract_pdf_content(fp)
    if suffix in (".doc", ".docx"):
        return extract_docx_content(fp)
    if suffix in (".xlsx", ".xls"):
        return extract_excel_content(fp)
    if suffix in (".ppt", ".pptx"):
        return extract_pptx_content(fp)

    # Fallback: treat any other extension as plain UTF-8 text
    try:
        raw = Path(fp).read_text(encoding="utf-8")
        return clean_text(raw)
    except Exception as e:
        return f"\n[Error reading file {fp}: {e}]"