#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#

import pdfplumber  # Library to extract text and tables from PDF files
import pytesseract  # OCR tool to extract text from images
import docx  # Library to read Microsoft Word (.docx) files
import zipfile  # To handle zipped archives, used here to access embedded images in Word files
import io  # Provides tools for handling byte streams, used to open images from bytes
import pandas as pd  # Data analysis library, used here to handle tables from Excel and other files
import warnings  # Used to suppress warnings during Excel file reading
import re  # Regular expressions for text cleaning

from openpyxl import load_workbook  # Excel file reading library, used for .xlsx files
from pptx import Presentation  # Library to read Microsoft PowerPoint files
from PIL import Image, ImageEnhance, ImageFilter  # Image processing libraries for OCR preprocessing
from pathlib import Path  # Object-oriented filesystem paths

def clean_text(text):
    """
    Normalize extracted text by stripping noise characters and blank lines.

    The text goes through three passes, in this order:
    1. Every character that is not an ASCII letter, a digit, whitespace, or one
       of the punctuation marks . , ? ! ( ) : ; ' " - is deleted. Note that this
       also deletes accented and other non-ASCII letters.
    2. Any single ASCII letter standing alone between word boundaries is
       deleted. This targets OCR noise, but it also removes genuine one-letter
       words such as "a" or "I".
    3. Each line is trimmed and lines that end up empty are discarded.

    Args:
        text (str): Raw extracted text from any source.

    Returns:
        str: The surviving lines joined with newline characters; an empty
        string when nothing survives.
    """
    # Pass 1: keep only the whitelisted character set
    whitelisted = re.sub(r'[^a-zA-Z0-9\s.,?!():;\'"-]', '', text)
    # Pass 2: drop letters that stand alone between two word boundaries
    without_strays = re.sub(r'\b[a-zA-Z]\b', '', whitelisted)
    # Pass 3: trim every line; filter(None, ...) discards the empty results
    trimmed_lines = map(str.strip, without_strays.splitlines())
    return "\n".join(filter(None, trimmed_lines))

def format_table(df, max_rows=10):
    """
    Render a pandas DataFrame as a compact plain-text table.

    Steps applied:
    - Rows that are entirely NaN are dropped, then columns that are entirely NaN.
    - Remaining NaN cells are replaced with empty strings.
    - Only the first ``max_rows`` rows are rendered, without the row index.
    - When rows were left out, a trailing "... (N more rows)" line is added.

    Args:
        df (pandas.DataFrame): The table data to format.
        max_rows (int): Maximum number of rows to display from the table.

    Returns:
        str: The rendered table, or an empty string when the frame is empty
        before or after the all-NaN rows and columns are removed.
    """
    if df.empty:
        return ""

    # Strip fully empty rows first, then fully empty columns
    without_blank_rows = df.dropna(axis=0, how='all')
    trimmed = without_blank_rows.dropna(axis=1, how='all')
    # Blank out the NaN cells that remain so they do not print as "NaN"
    trimmed = trimmed.fillna('')
    if trimmed.empty:
        return ""

    # Render the leading rows only, leaving out the index column
    rendered = trimmed.head(max_rows).to_string(index=False)

    # Tell the reader how many rows were not shown
    hidden_rows = len(trimmed) - max_rows
    if hidden_rows > 0:
        rendered += f"\n... ({hidden_rows} more rows)"
    return rendered

def preprocess_image(img):
    """
    Enhance an image to improve OCR accuracy by applying several preprocessing steps.

    The preprocessing includes:
    - Converting the image to grayscale to simplify colors.
    - Increasing contrast (factor 2) to make text stand out more.
    - Applying a median filter to reduce noise.
    - Binarizing the image: pixels below 140 become black, the rest white.

    Args:
        img (PIL.Image.Image): The original image to preprocess.

    Returns:
        PIL.Image.Image: The processed image ready for OCR.
        If any step fails, the untouched original image is returned, never a
        partially processed intermediate.
    """
    try:
        # Work on separate names so `img` always refers to the caller's image;
        # rebinding it would make the fallback return a half-processed result.
        grayscale = img.convert("L")
        # Enhance contrast by a factor of 2 to make text clearer
        contrasted = ImageEnhance.Contrast(grayscale).enhance(2)
        # Apply median filter to reduce noise and smooth the image
        smoothed = contrasted.filter(ImageFilter.MedianFilter())
        # Convert image to black and white using a threshold of 140
        return smoothed.point(lambda x: 0 if x < 140 else 255, '1')
    except Exception:
        # Best-effort: OCR can still be attempted on the unmodified original
        return img

def ocr_image(img):
    """
    Run OCR on an image and return the cleaned-up text.

    The image is first passed through preprocess_image, then handed to
    pytesseract with the English language model and page segmentation mode 6
    (which assumes a single uniform block of text). The recognized text is
    finally normalized with clean_text.

    Args:
        img (PIL.Image.Image): The image from which to extract text.

    Returns:
        str: The cleaned OCR text, or an empty string when any step raises.
    """
    try:
        # Sharpen the image before recognition
        prepared = preprocess_image(img)
        # English model, PSM 6
        recognized = pytesseract.image_to_string(
            prepared, lang='eng', config='--psm 6'
        )
        # Strip OCR noise from the raw result
        return clean_text(recognized)
    except Exception:
        # OCR is best-effort: any failure yields no text rather than an error
        return ""

def _pdf_image_pixel_box(img, page_origin, scale, canvas_size):
    """Map a pdfplumber image record (PDF points) to a pixel box on the rendered page, or None if unusable."""
    origin_x, origin_y = page_origin
    canvas_width, canvas_height = canvas_size
    try:
        # Shift by the page origin, scale points to pixels, and clamp to the canvas
        left = max(0, round((float(img["x0"]) - origin_x) * scale))
        upper = max(0, round((float(img["top"]) - origin_y) * scale))
        right = min(canvas_width, round((float(img["x1"]) - origin_x) * scale))
        lower = min(canvas_height, round((float(img["bottom"]) - origin_y) * scale))
    except (KeyError, TypeError, ValueError):
        return None
    # A zero-area or inverted box cannot be cropped meaningfully
    if right <= left or lower <= upper:
        return None
    return (left, upper, right, lower)


def extract_pdf_content(fp):
    """
    Extract text and tables from a PDF file, including OCR on embedded images.

    This function:
    - Opens the PDF file and iterates through each page.
    - Extracts and cleans text from each page.
    - Renders pages that contain images at 300 dpi, crops each image region,
      and runs OCR on it. Image coordinates reported by pdfplumber are in PDF
      points (1/72 inch), so they are scaled to pixels before cropping.
    - Extracts tables from pages and formats them as readable text, using the
      first row of each table as the header.
    - Handles exceptions by appending an error message to the content gathered
      so far.

    Args:
        fp (str or Path): File path to the PDF document.

    Returns:
        str: Combined extracted text, OCR results, and formatted tables from the PDF.
    """
    resolution = 300
    # Pixels per PDF point at the chosen render resolution
    scale = resolution / 72.0
    parts = []
    try:
        with pdfplumber.open(fp) as pdf:
            for i, page in enumerate(pdf.pages, 1):
                # Extract text from the current page, defaulting to empty string if None
                text = page.extract_text() or ""
                parts.append(f"Page {i} Text:\n{clean_text(text)}\n\n")
                # If there are images on the page, perform OCR on each
                if page.images:
                    # Render the page once and crop every image region from it
                    rendered = page.to_image(resolution=resolution).original
                    origin = (float(page.bbox[0]), float(page.bbox[1]))
                    for img in page.images:
                        box = _pdf_image_pixel_box(img, origin, scale, rendered.size)
                        if box is None:
                            # Skip unusable regions instead of aborting the whole file
                            continue
                        ocr_text = ocr_image(rendered.crop(box))
                        if ocr_text:
                            # Append OCR text with page reference
                            parts.append(f"[OCR Text from image on page {i}]:\n{ocr_text}\n\n")
                # Extract tables from the page
                for idx, table in enumerate(page.extract_tables(), 1):
                    if table:
                        # Convert table list to DataFrame using first row as header
                        df = pd.DataFrame(table[1:], columns=table[0])
                        parts.append(f"Table {idx} on page {i}:\n{format_table(df)}\n\n")
    except Exception as e:
        # Keep whatever was extracted and report why reading stopped
        parts.append(f"\n[Error reading PDF {fp}: {e}]")
    # Return the combined content with whitespace trimmed
    return "".join(parts).strip()

def extract_docx_content(fp):
    """
    Collect paragraphs, tables, and image OCR text from a Word (.docx) file.

    The output is assembled in three stages:
    1. Non-empty paragraphs (whitespace-trimmed) under a "Paragraphs:" header.
    2. Every table, built with its first row as the header and rendered through
       format_table. All tables are built before any of them is rendered.
    3. OCR text for each archive member under "word/media/", read by opening
       the .docx as a zip file. Images that fail to open or OCR are skipped.

    If anything outside the per-image handling raises, an error note is
    appended to whatever was collected up to that point.

    Args:
        fp (str or Path): File path to the Word document.

    Returns:
        str: The combined text with surrounding whitespace trimmed.
    """
    pieces = []
    try:
        document = docx.Document(fp)

        # Stage 1: paragraphs that still have text after trimming
        body_lines = []
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                body_lines.append(stripped)
        if body_lines:
            pieces.append("Paragraphs:\n" + "\n".join(body_lines) + "\n\n")

        # Stage 2: build every table first, then render them in order
        frames = []
        for tbl in document.tables:
            grid = [[cell.text.strip() for cell in row.cells] for row in tbl.rows]
            if grid:
                # First row supplies the column names
                frames.append(pd.DataFrame(grid[1:], columns=grid[0]))
        for number, frame in enumerate(frames, 1):
            pieces.append(f"Table {number}:\n{format_table(frame)}\n\n")

        # Stage 3: embedded media lives inside the .docx zip container
        with zipfile.ZipFile(fp) as archive:
            for member in archive.namelist():
                if not member.startswith("word/media/"):
                    continue
                payload = archive.read(member)
                try:
                    recognized = ocr_image(Image.open(io.BytesIO(payload)))
                    if recognized:
                        pieces.append(f"[OCR Text from embedded image]:\n{recognized}\n\n")
                except Exception:
                    # A single unreadable image must not stop the extraction
                    continue
    except Exception as e:
        pieces.append(f"\n[Error reading Microsoft Word {fp}: {e}]")
    return "".join(pieces).strip()

def extract_excel_content(fp):
    """
    Extract readable table content from Microsoft Excel files (.xlsx, .xls).

    This function:
    - Reads all sheets in the Excel file.
    - Converts each sheet to a formatted table string under a "Sheet: <name>" header.
    - Suppresses warnings during reading to avoid clutter.
    - Does not attempt to extract images.
    - Handles exceptions by appending an error message to the content gathered
      so far.

    Engine selection: openpyxl is requested explicitly for everything except
    legacy ``.xls`` files. openpyxl cannot read the old binary format, so for
    ``.xls`` the engine is left to pandas, which picks a suitable reader when
    one is installed.

    Args:
        fp (str or Path): File path to the Excel workbook.

    Returns:
        str: Combined formatted tables from all sheets in the workbook.
    """
    # None lets pandas auto-select the reader for legacy .xls workbooks
    engine = None if str(fp).lower().endswith(".xls") else 'openpyxl'
    parts = []
    try:
        # Suppress warnings such as openpyxl deprecation or data type warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # sheet_name=None returns a dict mapping sheet name -> DataFrame
            sheets = pd.read_excel(fp, sheet_name=None, engine=engine)
        # Iterate over each sheet and format its content
        for sheet_name, df in sheets.items():
            parts.append(f"Sheet: {sheet_name}\n")
            parts.append(format_table(df) + "\n\n")
    except Exception as e:
        # Keep whatever was extracted and report why reading stopped
        parts.append(f"\n[Error reading Microsoft Excel {fp}: {e}]")
    # Return combined sheet contents trimmed of whitespace
    return "".join(parts).strip()

def extract_pptx_content(fp):
    """
    Collect slide text, tables, and image OCR text from a PowerPoint (.pptx) file.

    For every slide (numbered from 1) two passes are made over its shapes:
    1. Text pass: the trimmed text of each shape exposing a ``text`` attribute
       is collected, and shapes whose shape_type is 13 (pictures) with an
       image are OCR'd. The results are emitted under "Slide N Text:", or a
       "No text found" note when nothing was collected.
    2. Table pass: each shape reporting ``has_table`` is converted to a
       DataFrame with its first row as the header and rendered via format_table.

    OCR failures on individual pictures are ignored. Any other failure appends
    an error note to what was collected so far.

    Args:
        fp (str or Path): File path to the PowerPoint presentation.

    Returns:
        str: The combined text with surrounding whitespace trimmed.
    """
    pieces = []
    try:
        deck = Presentation(fp)
        for number, slide in enumerate(deck.slides, 1):
            # Pass 1: plain text and picture OCR
            found = []
            for shape in slide.shapes:
                caption = shape.text.strip() if hasattr(shape, "text") else ""
                if caption:
                    found.append(caption)
                # 13 is the shape_type value the file treats as a picture
                is_picture = shape.shape_type == 13 and hasattr(shape, "image") and shape.image
                if not is_picture:
                    continue
                try:
                    # The picture's raw bytes are exposed as image.blob
                    picture = Image.open(io.BytesIO(shape.image.blob))
                    recognized = ocr_image(picture)
                    if recognized:
                        found.append(f"[OCR Text from image]:\n{recognized}")
                except Exception:
                    # A single unreadable picture must not stop the extraction
                    pass
            if found:
                pieces.append(f"Slide {number} Text:\n" + "\n".join(found) + "\n\n")
            else:
                pieces.append(f"Slide {number} Text:\nNo text found on this slide.\n\n")

            # Pass 2: tables held by shapes
            for shape in slide.shapes:
                if not shape.has_table:
                    continue
                grid = [
                    [cell.text.strip() for cell in row.cells]
                    for row in shape.table.rows
                ]
                if grid:
                    # First row supplies the column names
                    frame = pd.DataFrame(grid[1:], columns=grid[0])
                    pieces.append(f"Table on slide {number}:\n{format_table(frame)}\n\n")
    except Exception as e:
        pieces.append(f"\n[Error reading Microsoft PowerPoint {fp}: {e}]")
    return "".join(pieces).strip()

def extract_file_content(fp):
    """
    Route a file to the matching extractor based on its (lower-cased) extension.

    Dispatch rules:
    - ``.pdf``            -> extract_pdf_content
    - ``.doc``, ``.docx`` -> extract_docx_content
    - ``.xlsx``, ``.xls`` -> extract_excel_content
    - ``.ppt``, ``.pptx`` -> extract_pptx_content
    - anything else       -> read as UTF-8 text and pass through clean_text

    Args:
        fp (str or Path): File path to the document to extract content from.

    Returns:
        str: The extracted text, or an error message when the plain-text
        fallback cannot read the file.
    """
    suffix = Path(fp).suffix.lower()

    if suffix == ".pdf":
        return extract_pdf_content(fp)
    if suffix in (".doc", ".docx"):
        return extract_docx_content(fp)
    if suffix in (".xlsx", ".xls"):
        return extract_excel_content(fp)
    if suffix in (".ppt", ".pptx"):
        return extract_pptx_content(fp)

    # Unknown extension: treat the file as plain UTF-8 text
    try:
        raw = Path(fp).read_text(encoding="utf-8")
        return clean_text(raw)
    except Exception as e:
        return f"\n[Error reading file {fp}: {e}]"