Commit 6b28cfa
Parent(s): a265560

Added capabilities to export to and import from Adobe .xfdf files

Files changed:
- app.py +22 -5
- tools/helper_functions.py +5 -0
- tools/redaction_review.py +336 -32
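
The export path writes Adobe-readable XFDF. As a rough, standalone sketch of the annotation structure that the new create_xfdf function (in tools/redaction_review.py below) builds, the snippet here emits a single redact annotation: the attribute names mirror the diff, while the page, rect and text values are made up for illustration.

import uuid
from xml.dom import minidom
from xml.etree.ElementTree import Element, SubElement, tostring

# Root element and annotation container, as in create_xfdf below
xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
annots = SubElement(xfdf, 'annots')

# One redact annotation per review-file row (values here are illustrative)
redact = SubElement(annots, 'redact')
redact.set('name', str(uuid.uuid4()))             # unique annotation id
redact.set('page', '0')                           # 0-based page index
redact.set('rect', '56.70,700.20,200.00,712.50')  # x1,y1,x2,y2 in PDF points
redact.set('title', 'PERSON')                     # redaction label
redact.set('contents', 'Jane Doe')                # the redacted text
redact.set('overlay-text', 'PERSON')
redact.set('interior-color', '0.5,0.5,0.5')
redact.set('opacity', '0.5')

print(minidom.parseString(tostring(xfdf)).toprettyxml(indent="  "))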
app.py
CHANGED
@@ -10,11 +10,11 @@ from datetime import datetime
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData

-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
-from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
+from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities

@@ -154,6 +154,8 @@ with app:
 in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
 duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")

+
+
 ###
 # UI DESIGN
 ###

@@ -255,7 +257,12 @@ with app:
 #with gr.Column(scale=1):
 with gr.Row():
 recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
-recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
+recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
+
+with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
+convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
+adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple')
+convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")

 ###
 # TEXT / TABULAR DATA TAB

@@ -361,7 +368,8 @@ with app:
 ###

 # Upload previous files for modifying redactions
-upload_previous_review_file_btn.click(fn=
+upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+then(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
 then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

@@ -419,7 +427,16 @@ with app:
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
 then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

-
+# Convert review file to xfdf Adobe format
+convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
+then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
+then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
+
+# Convert xfdf Adobe file back to review_file.csv
+convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
+then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
+then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
+
 ###
 # TABULAR DATA REDACTION
 ###
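The new button wiring above follows the same click/then chaining pattern used by the existing events in app.py: each .then() step runs after the previous one finishes, and a step's outputs feed the components that later steps read as inputs. A minimal, self-contained sketch of that pattern, with placeholder components and functions rather than the app's real ones:

import gradio as gr

def count_files(files):
    # First step: summarise the uploaded files
    return f"{len(files or [])} file(s) selected"

def fake_convert(summary):
    # Second step: only runs once the first step has returned
    return f"Converted: {summary}"

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple", label="Upload files")
    summary_box = gr.Textbox(label="Summary")
    result_box = gr.Textbox(label="Result")
    convert_btn = gr.Button("Convert")

    # Chain the steps, mirroring convert_review_file_to_adobe_btn.click(...).then(...).then(...)
    convert_btn.click(fn=count_files, inputs=[in_files], outputs=[summary_box]).\
        then(fn=fake_convert, inputs=[summary_box], outputs=[result_box])

if __name__ == "__main__":
    demo.launch()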
tools/helper_functions.py
CHANGED
@@ -22,6 +22,9 @@ def reset_state_vars():
 interactive=False
 ), [], [], [], pd.DataFrame(), pd.DataFrame()

+def reset_review_vars():
+    return [], pd.DataFrame(), pd.DataFrame()
+
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists
     value = os.environ.get(var_name)

@@ -81,6 +84,8 @@ def detect_file_type(filename):
         return 'jpeg'
     elif filename.endswith('.png'):
         return 'png'
+    elif filename.endswith('.xfdf'):
+        return 'xfdf'
     else:
         raise ValueError("Unsupported file type.")

tools/redaction_review.py
CHANGED
@@ -1,14 +1,14 @@
 import gradio as gr
 import pandas as pd
 import numpy as np
-from xml.etree.ElementTree import Element, SubElement, tostring
+from xml.etree.ElementTree import Element, SubElement, tostring, parse
 from xml.dom import minidom
 import uuid
 from typing import List
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
-from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
-from tools.helper_functions import get_file_path_end, output_folder
+from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
+from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
 from tools.file_redaction import redact_page_with_pymupdf
 import json
 import os

@@ -383,10 +383,46 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
     row_value_page = evt.row_value[0] # This is the page number value
     return row_value_page

+def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
+    '''
+    Converts coordinates from image space to Adobe PDF space.
+
+    Parameters:
+    - pdf_page_width: Width of the PDF page
+    - pdf_page_height: Height of the PDF page
+    - image_width: Width of the source image
+    - image_height: Height of the source image
+    - x1, y1, x2, y2: Coordinates in image space
+
+    Returns:
+    - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
+    '''
+
+    # Calculate scaling factors
+    scale_width = pdf_page_width / image_width
+    scale_height = pdf_page_height / image_height
+
+    # Convert coordinates
+    pdf_x1 = x1 * scale_width
+    pdf_x2 = x2 * scale_width
+
+    # Convert Y coordinates (flip vertical axis)
+    # Adobe coordinates start from bottom-left
+    pdf_y1 = pdf_page_height - (y1 * scale_height)
+    pdf_y2 = pdf_page_height - (y2 * scale_height)
+
+    # Make sure y1 is always less than y2 for Adobe's coordinate system
+    if pdf_y1 > pdf_y2:
+        pdf_y1, pdf_y2 = pdf_y2, pdf_y1
+
+    return pdf_x1, pdf_y1, pdf_x2, pdf_y2


-
-def create_xfdf(df, pdf_path):
+def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
+    '''
+    Create an xfdf file from a review csv file and a pdf
+    '''
+
     # Create root element
     xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")

@@ -397,47 +433,315 @@ def create_xfdf(df, pdf_path):
     # Add annots
     annots = SubElement(xfdf, 'annots')

-    # Process each row in dataframe
     for _, row in df.iterrows():
-
-
+        page_python_format = int(row["page"])-1
+
+        pymupdf_page = pymupdf_doc.load_page(page_python_format)
+
+        pdf_page_height = pymupdf_page.rect.height
+        pdf_page_width = pymupdf_page.rect.width
+
+        image = image_paths[page_python_format]
+
+        #print("image:", image)
+
+        if isinstance(image, str):
+            image = Image.open(image)
+
+        image_page_width, image_page_height = image.size
+
+        # Create redaction annotation
+        redact_annot = SubElement(annots, 'redact')

-        # Generate unique ID
+        # Generate unique ID
         annot_id = str(uuid.uuid4())
-
+        redact_annot.set('name', annot_id)

         # Set page number (subtract 1 as PDF pages are 0-based)
-
+        redact_annot.set('page', str(int(row['page']) - 1))
+
+        # Convert coordinates
+        x1, y1, x2, y2 = convert_image_coords_to_adobe(
+            pdf_page_width,
+            pdf_page_height,
+            image_page_width,
+            image_page_height,
+            row['xmin'],
+            row['ymin'],
+            row['xmax'],
+            row['ymax']
+        )
+
+        if CUSTOM_BOX_COLOUR == "grey":
+            colour_str = "0.5,0.5,0.5"
+        else:
+            colour_str = row['color'].strip('()').replace(' ', '')

-        # Set coordinates
-
-        text_annot.set('rect', f"{row['xmin']},{row['ymin']},{row['xmax']},{row['ymax']}")
+        # Set coordinates
+        redact_annot.set('rect', f"{x1:.2f},{y1:.2f},{x2:.2f},{y2:.2f}")

-        # Set
-
-
+        # Set redaction properties
+        redact_annot.set('title', row['label']) # The type of redaction (e.g., "PERSON")
+        redact_annot.set('contents', row['text']) # The redacted text
+        redact_annot.set('subject', row['label']) # The redacted text
+        redact_annot.set('mimetype', "Form")

-        # Set
-
+        # Set appearance properties
+        redact_annot.set('border-color', colour_str) # Black border
+        redact_annot.set('repeat', 'false')
+        redact_annot.set('interior-color', colour_str)
+        #redact_annot.set('fill-color', colour_str)
+        #redact_annot.set('outline-color', colour_str)
+        redact_annot.set('overlay-color', colour_str)
+        redact_annot.set('overlay-text', row['label'])
+        redact_annot.set('opacity', "0.5")
+
+        # Add appearance dictionary
+        # appearanceDict = SubElement(redact_annot, 'appearancedict')

-        #
-
-
-
+        # # Normal appearance
+        # normal = SubElement(appearanceDict, 'normal')
+        # #normal.set('appearance', 'redact')
+
+        # # Color settings for the mark (before applying redaction)
+        # markAppearance = SubElement(redact_annot, 'markappearance')
+        # markAppearance.set('stroke-color', colour_str) # Red outline
+        # markAppearance.set('fill-color', colour_str) # Light red fill
+        # markAppearance.set('opacity', '0.5') # 50% opacity

+        # # Final redaction appearance (after applying)
+        # redactAppearance = SubElement(redact_annot, 'redactAppearance')
+        # redactAppearance.set('fillColor', colour_str) # Black fill
+        # redactAppearance.set('fontName', 'Helvetica')
+        # redactAppearance.set('fontSize', '12')
+        # redactAppearance.set('textAlignment', 'left')
+        # redactAppearance.set('textColor', colour_str) # White text
+
     # Convert to pretty XML string
     xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent=" ")

     return xml_str

-
-
-
-
-
-
-#
-
-
+def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
+    '''
+    Load in files to convert a review file into an Adobe comment file format
+    '''
+    output_paths = []
+    pdf_name = ""
+
+    if isinstance(input_files, str):
+        file_paths_list = [input_files]
+    else:
+        file_paths_list = input_files
+
+    # Sort the file paths so that the pdfs come first
+    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
+
+    for file in file_paths_list:
+
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
+        file_path_name = get_file_path_end(file_path)
+        file_path_end = detect_file_type(file_path)
+
+        if file_path_end == "pdf":
+            pdf_name = os.path.basename(file_path)
+
+        if file_path_end == "csv":
+            # If no pdf name, just get the name of the file path
+            if not pdf_name:
+                pdf_name = file_path_name
+            # Read CSV file
+            df = pd.read_csv(file_path)
+
+            df.fillna('', inplace=True) # Replace NaN with an empty string
+
+            xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths)
+
+            output_path = output_folder + file_path_name + "_adobe.xfdf"
+
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(xfdf_content)
+
+            output_paths.append(output_path)
+
+    return output_paths
+
+
+### Convert xfdf coordinates back to image for app
+
+def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
+    '''
+    Converts coordinates from Adobe PDF space to image space.
+
+    Parameters:
+    - pdf_page_width: Width of the PDF page
+    - pdf_page_height: Height of the PDF page
+    - image_width: Width of the source image
+    - image_height: Height of the source image
+    - x1, y1, x2, y2: Coordinates in Adobe PDF space
+
+    Returns:
+    - Tuple of converted coordinates (x1, y1, x2, y2) in image space
+    '''
+
+    # Calculate scaling factors
+    scale_width = image_width / pdf_page_width
+    scale_height = image_height / pdf_page_height
+
+    # Convert coordinates
+    image_x1 = x1 * scale_width
+    image_x2 = x2 * scale_width
+
+    # Convert Y coordinates (flip vertical axis)
+    # Adobe coordinates start from bottom-left
+    image_y1 = (pdf_page_height - y1) * scale_height
+    image_y2 = (pdf_page_height - y2) * scale_height
+
+    # Make sure y1 is always less than y2 for image's coordinate system
+    if image_y1 > image_y2:
+        image_y1, image_y2 = image_y2, image_y1
+
+    return image_x1, image_y1, image_x2, image_y2
+
+def parse_xfdf(xfdf_path):
+    '''
+    Parse the XFDF file and extract redaction annotations.
+
+    Parameters:
+    - xfdf_path: Path to the XFDF file
+
+    Returns:
+    - List of dictionaries containing redaction information
+    '''
+    tree = parse(xfdf_path)
+    root = tree.getroot()
+
+    # Define the namespace
+    namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
+
+    redactions = []
+
+    # Find all redact elements using the namespace
+    for redact in root.findall('.//xfdf:redact', namespaces=namespace):
+
+        #print("redact:", redact)
+
+        redaction_info = {
+            'image': '', # Image will be filled in later
+            'page': int(redact.get('page')) + 1, # Convert to 1-based index
+            'xmin': float(redact.get('rect').split(',')[0]),
+            'ymin': float(redact.get('rect').split(',')[1]),
+            'xmax': float(redact.get('rect').split(',')[2]),
+            'ymax': float(redact.get('rect').split(',')[3]),
+            'label': redact.get('title'),
+            'text': redact.get('contents'),
+            'color': redact.get('border-color', '(0, 0, 0)') # Default to black if not specified
+        }
+        redactions.append(redaction_info)
+
+    print("redactions:", redactions)
+
+    return redactions
+
+def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
+    '''
+    Convert redaction annotations from XFDF and associated images into a DataFrame.
+
+    Parameters:
+    - xfdf_path: Path to the XFDF file
+    - pdf_doc: PyMuPDF document object
+    - image_paths: List of PIL Image objects corresponding to PDF pages
+
+    Returns:
+    - DataFrame containing redaction information
+    '''
+    output_paths = []
+    xfdf_paths = []
+    df = pd.DataFrame()
+
+    #print("Image paths:", image_paths)
+
+    # Sort the file paths so that the pdfs come first
+    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
+
+    for file in file_paths_list:
+
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
+        file_path_name = get_file_path_end(file_path)
+        file_path_end = detect_file_type(file_path)
+
+        if file_path_end == "pdf":
+            pdf_name = os.path.basename(file_path)
+            #print("pymupdf_doc:", pymupdf_doc)
+
+            # Add pdf to outputs
+            output_paths.append(file_path)
+
+        if file_path_end == "xfdf":
+
+            if not pdf_name:
+                message = "Original PDF needed to convert from .xfdf format"
+                print(message)
+                raise ValueError(message)
+
+            xfdf_path = file
+
+            # if isinstance(xfdf_paths, str):
+            #     xfdf_path = xfdf_paths.name
+            # else:
+            #     xfdf_path = xfdf_paths[0].name
+
+            file_path_name = get_file_path_end(xfdf_path)
+
+            #print("file_path_name:", file_path_name)
+
+            # Parse the XFDF file
+            redactions = parse_xfdf(xfdf_path)
+
+            # Create a DataFrame from the redaction information
+            df = pd.DataFrame(redactions)
+
+            df.fillna('', inplace=True) # Replace NaN with an empty string
+
+            for _, row in df.iterrows():
+                page_python_format = int(row["page"])-1
+
+                pymupdf_page = pymupdf_doc.load_page(page_python_format)
+
+                pdf_page_height = pymupdf_page.rect.height
+                pdf_page_width = pymupdf_page.rect.width
+
+                image_path = image_paths[page_python_format]
+
+                #print("image_path:", image_path)
+
+                if isinstance(image_path, str):
+                    image = Image.open(image_path)
+
+                image_page_width, image_page_height = image.size
+
+                # Convert to image coordinates
+                image_x1, image_y1, image_x2, image_y2 = convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_page_width, image_page_height, row['xmin'], row['ymin'], row['xmax'], row['ymax'])
+
+                df.loc[_, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]
+
+                # Optionally, you can add the image path or other relevant information
+                #print("Image path:", image_path)
+                df.loc[_, 'image'] = image_path
+
+                #print('row:', row)
+
+            out_file_path = output_folder + file_path_name + "_review_file.csv"
+            df.to_csv(out_file_path, index=None)
+
+            output_paths.append(out_file_path)
+
+    return output_paths
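
Both conversion directions rest on the coordinate mapping implemented by convert_image_coords_to_adobe and convert_adobe_coords_to_image above: page images use a top-left origin while Adobe PDF space uses a bottom-left origin, so boxes are rescaled by the page-to-image ratio and the y-axis is flipped against the page height. A small self-contained check of that mapping, with illustrative numbers:

# Illustrative sizes: an A4 page in PDF points and its image rendered at 2x scale
pdf_w, pdf_h = 595.0, 842.0
img_w, img_h = 1190.0, 1684.0

# A box in image coordinates (top-left origin)
x1, y1, x2, y2 = 100.0, 200.0, 300.0, 250.0

# Image -> Adobe: rescale, then flip y against the page height
scale_w, scale_h = pdf_w / img_w, pdf_h / img_h
ax1, ax2 = x1 * scale_w, x2 * scale_w
ay1, ay2 = sorted((pdf_h - y1 * scale_h, pdf_h - y2 * scale_h))
print((ax1, ay1, ax2, ay2))   # (50.0, 717.0, 150.0, 742.0)

# Adobe -> image: the inverse mapping recovers the original box
bx1, bx2 = ax1 / scale_w, ax2 / scale_w
by1, by2 = sorted(((pdf_h - ay2) / scale_h, (pdf_h - ay1) / scale_h))
print((bx1, by1, bx2, by2))   # (100.0, 200.0, 300.0, 250.0)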