Spaces:

hadadrjt
/

ai

Running

App Files Files Community

ai / src /main /file_extractors.py

hadadrjt

ai: Refactor the code.

d17e7ef about 2 months ago

raw

history blame

17.4 kB

	#
	# SPDX-FileCopyrightText: Hadad <[email protected]>
	# SPDX-License-Identifier: Apache-2.0
	#

	import pdfplumber # Library to extract text and tables from PDF files
	import pytesseract # OCR tool to extract text from images
	import docx # Library to read Microsoft Word (.docx) files
	import zipfile # To handle zipped archives, used here to access embedded images in Word files
	import io # Provides tools for handling byte streams, used to open images from bytes
	import pandas as pd # Data analysis library, used here to handle tables from Excel and other files
	import warnings # Used to suppress warnings during Excel file reading
	import re # Regular expressions for text cleaning

	from openpyxl import load_workbook # Excel file reading library, used for .xlsx files
	from pptx import Presentation # Library to read Microsoft PowerPoint files
	from PIL import Image, ImageEnhance, ImageFilter # Image processing libraries for OCR preprocessing
	from pathlib import Path # Object-oriented filesystem paths

	def clean_text(text):
	    """
	    Clean and normalize extracted text to improve readability and remove noise.

	    This function performs several cleaning steps:
	    - Removes characters that are not letters, digits, whitespace, or common
	      punctuation. Letters and digits of any script are kept (for example
	      accented characters); underscores and other symbols are dropped.
	    - Removes isolated single letters, which are often OCR errors or noise.
	      The one-letter English words "a", "A" and "I" are kept, and so is a
	      letter attached to an apostrophe (the "t" in "don't").
	    - Collapses runs of whitespace inside each line (the removals above leave
	      gaps behind), strips each line, and drops empty lines.
	    - Joins cleaned lines back into a single string separated by newlines.

	    Args:
	        text (str): Raw extracted text from any source. None or an empty
	            string produces an empty result.

	    Returns:
	        str: Cleaned and normalized text ready for display or further processing.
	    """
	    if not text:
	        return ""
	    # \w is Unicode-aware for str patterns, so non-English letters survive.
	    # It also matches "_", which is treated as noise and removed explicitly.
	    text = re.sub(r'[^\w\s.,?!():;\'"-]|_', '', text)
	    # A plain \b would treat the apostrophe as a word boundary and eat the
	    # tail of contractions, so the apostrophe is excluded via lookarounds.
	    # The character class deliberately leaves out "a", "A" and "I".
	    text = re.sub(r"(?<![\w'])[b-zB-HJ-Z](?![\w'])", '', text)
	    # split()/join collapses inner whitespace and strips both ends at once
	    lines = (" ".join(line.split()) for line in text.splitlines())
	    return "\n".join(line for line in lines if line)

	def format_table(df, max_rows=10):
	    """
	    Render a pandas DataFrame as a compact, human-readable text table.

	    Steps performed:
	    - Rows and columns that contain only NaN values are discarded.
	    - Remaining NaN cells are shown as empty strings.
	    - At most max_rows rows are rendered, without the row index.
	    - When rows were left out, a trailing note states how many.

	    Args:
	        df (pandas.DataFrame): The table data to format.
	        max_rows (int): Maximum number of rows to display from the table.

	    Returns:
	        str: The rendered table, or an empty string when there is nothing to show.
	    """
	    if df.empty:
	        return ""

	    # Strip fully-empty rows first, then fully-empty columns, then blank out NaN
	    trimmed = df.dropna(axis=0, how='all')
	    trimmed = trimmed.dropna(axis=1, how='all')
	    trimmed = trimmed.fillna('')
	    if trimmed.empty:
	        return ""

	    rendered = trimmed.head(max_rows).to_string(index=False)

	    # Number of rows that did not fit into the preview
	    hidden = len(trimmed) - max_rows
	    if hidden > 0:
	        rendered += f"\n... ({hidden} more rows)"
	    return rendered

	def preprocess_image(img, contrast=2, threshold=140):
	    """
	    Enhance an image to improve OCR accuracy by applying several preprocessing steps.

	    The preprocessing includes:
	    - Converting the image to grayscale to simplify colors.
	    - Increasing contrast to make text stand out more.
	    - Applying a median filter to reduce noise.
	    - Binarizing the image by thresholding to black and white.

	    Args:
	        img (PIL.Image.Image): The original image to preprocess.
	        contrast (float): Factor passed to the contrast enhancer. Defaults to 2.
	        threshold (int): Grayscale value below which a pixel becomes black;
	            all other pixels become white. Defaults to 140.

	    Returns:
	        PIL.Image.Image: The processed image ready for OCR.
	        If any step fails, the untouched original image is returned.
	    """
	    try:
	        # Work on a separate name so a failure halfway through can never
	        # hand back a partially processed image.
	        processed = img.convert("L")
	        processed = ImageEnhance.Contrast(processed).enhance(contrast)
	        processed = processed.filter(ImageFilter.MedianFilter())
	        # Mode '1' yields a pure black-and-white bitmap
	        return processed.point(lambda x: 0 if x < threshold else 255, '1')
	    except Exception:
	        # Best effort: OCR can still be attempted on the unprocessed image
	        return img

	def ocr_image(img):
	    """
	    Run OCR on an image and return the cleaned text.

	    The image is first passed through preprocess_image, then handed to
	    pytesseract using the English language data and page segmentation
	    mode 6, and the recognized text is finally normalized by clean_text.

	    Args:
	        img (PIL.Image.Image): The image from which to extract text.

	    Returns:
	        str: The cleaned OCR-extracted text. Returns empty string if OCR fails.
	    """
	    try:
	        prepared = preprocess_image(img)
	        recognized = pytesseract.image_to_string(prepared, lang='eng', config='--psm 6')
	        return clean_text(recognized)
	    except Exception:
	        # OCR is best effort: any failure simply yields no text
	        return ""

	def _ocr_page_images(page, page_number, resolution=300):
	    """Yield one OCR text section per image on a PDF page that contains readable text."""
	    if not page.images:
	        return
	    # Render the page once and cut every embedded image out of that bitmap
	    rendered = page.to_image(resolution=resolution).original
	    # Image positions are given in PDF points while the bitmap is measured in
	    # pixels, so each box has to be scaled before cropping.
	    # NOTE(review): assumes the page's top-left corner is the coordinate
	    # origin (true for uncropped pages) - confirm for cropped pages.
	    x_scale = rendered.width / float(page.width)
	    y_scale = rendered.height / float(page.height)
	    for img in page.images:
	        pixel_box = (
	            round(float(img["x0"]) * x_scale),
	            round(float(img["top"]) * y_scale),
	            round(float(img["x1"]) * x_scale),
	            round(float(img["bottom"]) * y_scale),
	        )
	        ocr_text = ocr_image(rendered.crop(pixel_box))
	        if ocr_text:
	            yield f"[OCR Text from image on page {page_number}]:\n{ocr_text}\n\n"

	def extract_pdf_content(fp):
	    """
	    Extract text and tables from a PDF file, including OCR on embedded images.

	    This function:
	    - Opens the PDF file and iterates through each page.
	    - Extracts and cleans text from each page.
	    - Performs OCR on images embedded in pages to extract any text within images.
	    - Extracts tables from pages and formats them as readable text.
	    - Handles exceptions by appending an error message to whatever content
	      was collected before the failure.

	    Args:
	        fp (str or Path): File path to the PDF document.

	    Returns:
	        str: Combined extracted text, OCR results, and formatted tables from the PDF.
	    """
	    # Sections are collected in a list and joined once at the end
	    sections = []
	    try:
	        with pdfplumber.open(fp) as pdf:
	            for i, page in enumerate(pdf.pages, 1):
	                # extract_text() may return None for pages without a text layer
	                text = page.extract_text() or ""
	                sections.append(f"Page {i} Text:\n{clean_text(text)}\n\n")
	                sections.extend(_ocr_page_images(page, i))
	                for idx, table in enumerate(page.extract_tables(), 1):
	                    if table:
	                        # The first row of the extracted table serves as the header
	                        df = pd.DataFrame(table[1:], columns=table[0])
	                        sections.append(f"Table {idx} on page {i}:\n{format_table(df)}\n\n")
	    except Exception as e:
	        # Keep what was extracted so far and report the failure inline
	        sections.append(f"\n[Error reading PDF {fp}: {e}]")
	    return "".join(sections).strip()

	def extract_docx_content(fp):
	    """
	    Collect paragraphs, tables, and image OCR text from a Microsoft Word (.docx) file.

	    The document is processed in three passes:
	    - Non-empty paragraphs are gathered (stripped of surrounding whitespace).
	    - Every table is converted to a DataFrame, using its first row as the
	      header, and rendered with format_table.
	    - The file is reopened as a zip archive and each entry under word/media/
	      is run through OCR; entries that cannot be opened as images are skipped.

	    If anything outside the per-image handling fails, an error message is
	    appended to the content gathered up to that point.

	    Args:
	        fp (str or Path): File path to the Word document.

	    Returns:
	        str: Combined extracted paragraphs, tables, and OCR text from embedded images.
	    """
	    parts = []
	    try:
	        document = docx.Document(fp)

	        # Pass 1: paragraphs
	        stripped = (para.text.strip() for para in document.paragraphs)
	        paragraphs = [entry for entry in stripped if entry]
	        if paragraphs:
	            parts.append("Paragraphs:\n" + "\n".join(paragraphs) + "\n\n")

	        # Pass 2: tables - build all frames first, then render them
	        frames = []
	        for table in document.tables:
	            grid = [[cell.text.strip() for cell in row.cells] for row in table.rows]
	            if grid:
	                frames.append(pd.DataFrame(grid[1:], columns=grid[0]))
	        for number, frame in enumerate(frames, 1):
	            parts.append(f"Table {number}:\n{format_table(frame)}\n\n")

	        # Pass 3: embedded media, read straight from the zip container
	        with zipfile.ZipFile(fp) as archive:
	            for member in archive.namelist():
	                if not member.startswith("word/media/"):
	                    continue
	                blob = archive.read(member)
	                try:
	                    picture = Image.open(io.BytesIO(blob))
	                    ocr_text = ocr_image(picture)
	                    if ocr_text:
	                        parts.append(f"[OCR Text from embedded image]:\n{ocr_text}\n\n")
	                except Exception:
	                    # A media entry that is not a readable image must not stop extraction
	                    pass
	    except Exception as err:
	        parts.append(f"\n[Error reading Microsoft Word {fp}: {err}]")
	    return "".join(parts).strip()

	def extract_excel_content(fp):
	    """
	    Extract readable table content from Microsoft Excel files (.xlsx, .xls).

	    This function:
	    - Reads all sheets in the Excel file.
	    - Converts each sheet to a formatted table string.
	    - Suppresses warnings during reading to avoid clutter.
	    - Does not attempt to extract images to avoid errors.
	    - Handles exceptions by appending error messages.

	    Args:
	        fp (str or Path): File path to the Excel workbook.

	    Returns:
	        str: Combined formatted tables from all sheets in the workbook.
	    """
	    parts = []
	    try:
	        # openpyxl cannot open the legacy binary .xls format, so for those
	        # files pandas is left to select a suitable reader on its own.
	        engine = None if Path(fp).suffix.lower() == ".xls" else "openpyxl"
	        # Suppress warnings such as openpyxl deprecation or data type warnings
	        with warnings.catch_warnings():
	            warnings.simplefilter("ignore")
	            # sheet_name=None loads every sheet into a dict of DataFrames
	            sheets = pd.read_excel(fp, sheet_name=None, engine=engine)
	            for sheet_name, df in sheets.items():
	                parts.append(f"Sheet: {sheet_name}\n")
	                parts.append(format_table(df) + "\n\n")
	    except Exception as e:
	        # Keep what was extracted so far and report the failure inline
	        parts.append(f"\n[Error reading Microsoft Excel {fp}: {e}]")
	    return "".join(parts).strip()

	def extract_pptx_content(fp):
	    """
	    Collect slide text, tables, and image OCR text from a PowerPoint (.pptx) file.

	    For every slide, in order:
	    - Text of each shape exposing a non-blank "text" attribute is gathered.
	    - Shapes whose shape_type equals 13 and that carry an image are run
	      through OCR; failures for a single image are ignored.
	    - A "Slide N Text" section is emitted (with a placeholder line when
	      nothing was found), followed by one section per table on the slide.

	    If reading fails, an error message is appended to the content gathered
	    up to that point.

	    Args:
	        fp (str or Path): File path to the PowerPoint presentation.

	    Returns:
	        str: Combined extracted text, tables, and OCR results from all slides.
	    """
	    chunks = []
	    try:
	        presentation = Presentation(fp)
	        for number, slide in enumerate(presentation.slides, 1):
	            found = []
	            for shape in slide.shapes:
	                if hasattr(shape, "text") and shape.text.strip():
	                    found.append(shape.text.strip())
	                if shape.shape_type == 13 and hasattr(shape, "image") and shape.image:
	                    try:
	                        picture = Image.open(io.BytesIO(shape.image.blob))
	                        ocr_text = ocr_image(picture)
	                        if ocr_text:
	                            found.append(f"[OCR Text from image]:\n{ocr_text}")
	                    except Exception:
	                        # One unreadable picture must not abort the slide
	                        pass

	            if not found:
	                chunks.append(f"Slide {number} Text:\nNo text found on this slide.\n\n")
	            else:
	                chunks.append(f"Slide {number} Text:\n" + "\n".join(found) + "\n\n")

	            # Second pass over the shapes: tables come after the slide text
	            for shape in slide.shapes:
	                if not shape.has_table:
	                    continue
	                grid = [[cell.text.strip() for cell in row.cells] for row in shape.table.rows]
	                if grid:
	                    # First row of the table acts as the header
	                    frame = pd.DataFrame(grid[1:], columns=grid[0])
	                    chunks.append(f"Table on slide {number}:\n{format_table(frame)}\n\n")
	    except Exception as err:
	        chunks.append(f"\n[Error reading Microsoft PowerPoint {fp}: {err}]")
	    return "".join(chunks).strip()

	def extract_file_content(fp):
	    """
	    Route a file to the matching extractor based on its (case-insensitive) extension.

	    Routing:
	    - .pdf          -> extract_pdf_content
	    - .doc, .docx   -> extract_docx_content
	    - .xlsx, .xls   -> extract_excel_content
	    - .ppt, .pptx   -> extract_pptx_content
	    - anything else -> read as UTF-8 text and passed through clean_text

	    Args:
	        fp (str or Path): File path to the document to extract content from.

	    Returns:
	        str: Extracted and cleaned text content from the file, or an error message.
	    """
	    suffix = Path(fp).suffix.lower()

	    if suffix == ".pdf":
	        return extract_pdf_content(fp)
	    if suffix in (".doc", ".docx"):
	        return extract_docx_content(fp)
	    if suffix in (".xlsx", ".xls"):
	        return extract_excel_content(fp)
	    if suffix in (".ppt", ".pptx"):
	        return extract_pptx_content(fp)

	    # Fallback for every other extension: treat the file as plain text
	    try:
	        return clean_text(Path(fp).read_text(encoding="utf-8"))
	    except Exception as err:
	        return f"\n[Error reading file {fp}: {err}]"