Commit 6b28cfa
Parent(s): a265560

Added capabilities to export to and import from Adobe .xfdf files

Files changed:
- app.py +22 -5
- tools/helper_functions.py +5 -0
- tools/redaction_review.py +336 -32
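
The export path writes Adobe-readable XFDF. As a rough, standalone sketch of the annotation structure that the new create_xfdf function (in tools/redaction_review.py below) builds, the snippet here emits a single redact annotation: the attribute names mirror the diff, while the page, rect and text values are made up for illustration.

import uuid
from xml.dom import minidom
from xml.etree.ElementTree import Element, SubElement, tostring

# Root element and annotation container, as in create_xfdf below
xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
annots = SubElement(xfdf, 'annots')

# One redact annotation per review-file row (values here are illustrative)
redact = SubElement(annots, 'redact')
redact.set('name', str(uuid.uuid4()))             # unique annotation id
redact.set('page', '0')                           # 0-based page index
redact.set('rect', '56.70,700.20,200.00,712.50')  # x1,y1,x2,y2 in PDF points
redact.set('title', 'PERSON')                     # redaction label
redact.set('contents', 'Jane Doe')                # the redacted text
redact.set('overlay-text', 'PERSON')
redact.set('interior-color', '0.5,0.5,0.5')
redact.set('opacity', '0.5')

print(minidom.parseString(tostring(xfdf)).toprettyxml(indent="  "))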
app.py
CHANGED
@@ -10,11 +10,11 @@ from datetime import datetime
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData

-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
+from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
-from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
+from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities

@@ -154,6 +154,8 @@ with app:
 in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
 duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")

+
+
 ###
 # UI DESIGN
 ###

@@ -255,7 +257,12 @@ with app:
 #with gr.Column(scale=1):
 with gr.Row():
 recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
-recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
+recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
+
+with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
+convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
+adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple')
+convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")

 ###
 # TEXT / TABULAR DATA TAB

@@ -361,7 +368,8 @@ with app:
 ###

 # Upload previous files for modifying redactions
-upload_previous_review_file_btn.click(fn=
+upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+then(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
 then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

@@ -419,7 +427,16 @@ with app:
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
 then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])

-
+# Convert review file to xfdf Adobe format
+convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
+then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
+then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
+
+# Convert xfdf Adobe file back to review_file.csv
+convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
+then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
+then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
+
 ###
 # TABULAR DATA REDACTION
 ###
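The new button wiring above follows the same click/then chaining pattern used by the existing events in app.py: each .then() step runs after the previous one finishes, and a step's outputs feed the components that later steps read as inputs. A minimal, self-contained sketch of that pattern, with placeholder components and functions rather than the app's real ones:

import gradio as gr

def count_files(files):
    # First step: summarise the uploaded files
    return f"{len(files or [])} file(s) selected"

def fake_convert(summary):
    # Second step: only runs once the first step has returned
    return f"Converted: {summary}"

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple", label="Upload files")
    summary_box = gr.Textbox(label="Summary")
    result_box = gr.Textbox(label="Result")
    convert_btn = gr.Button("Convert")

    # Chain the steps, mirroring convert_review_file_to_adobe_btn.click(...).then(...).then(...)
    convert_btn.click(fn=count_files, inputs=[in_files], outputs=[summary_box]).\
        then(fn=fake_convert, inputs=[summary_box], outputs=[result_box])

if __name__ == "__main__":
    demo.launch()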
tools/helper_functions.py
CHANGED
@@ -22,6 +22,9 @@ def reset_state_vars():
 interactive=False
 ), [], [], [], pd.DataFrame(), pd.DataFrame()

+def reset_review_vars():
+    return [], pd.DataFrame(), pd.DataFrame()
+
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists
     value = os.environ.get(var_name)

@@ -81,6 +84,8 @@ def detect_file_type(filename):
         return 'jpeg'
     elif filename.endswith('.png'):
         return 'png'
+    elif filename.endswith('.xfdf'):
+        return 'xfdf'
     else:
         raise ValueError("Unsupported file type.")

tools/redaction_review.py
CHANGED
@@ -1,14 +1,14 @@
 import gradio as gr
 import pandas as pd
 import numpy as np
-from xml.etree.ElementTree import Element, SubElement, tostring
+from xml.etree.ElementTree import Element, SubElement, tostring, parse
 from xml.dom import minidom
 import uuid
 from typing import List
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
-from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
-from tools.helper_functions import get_file_path_end, output_folder
+from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
+from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
 from tools.file_redaction import redact_page_with_pymupdf
 import json
 import os

@@ -383,10 +383,46 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
     row_value_page = evt.row_value[0] # This is the page number value
     return row_value_page

+def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
+    '''
+    Converts coordinates from image space to Adobe PDF space.
+
+    Parameters:
+    - pdf_page_width: Width of the PDF page
+    - pdf_page_height: Height of the PDF page
+    - image_width: Width of the source image
+    - image_height: Height of the source image
+    - x1, y1, x2, y2: Coordinates in image space
+
+    Returns:
+    - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
+    '''
+
+    # Calculate scaling factors
+    scale_width = pdf_page_width / image_width
+    scale_height = pdf_page_height / image_height
+
+    # Convert coordinates
+    pdf_x1 = x1 * scale_width
+    pdf_x2 = x2 * scale_width
+
+    # Convert Y coordinates (flip vertical axis)
+    # Adobe coordinates start from bottom-left
+    pdf_y1 = pdf_page_height - (y1 * scale_height)
+    pdf_y2 = pdf_page_height - (y2 * scale_height)
+
+    # Make sure y1 is always less than y2 for Adobe's coordinate system
+    if pdf_y1 > pdf_y2:
+        pdf_y1, pdf_y2 = pdf_y2, pdf_y1
+
+    return pdf_x1, pdf_y1, pdf_x2, pdf_y2


-
-def create_xfdf(df, pdf_path):
+def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
+    '''
+    Create an xfdf file from a review csv file and a pdf
+    '''
+
     # Create root element
     xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")

@@ -397,47 +433,315 @@ def create_xfdf(df, pdf_path):
     # Add annots
     annots = SubElement(xfdf, 'annots')

-    # Process each row in dataframe
     for _, row in df.iterrows():
-
-
+        page_python_format = int(row["page"])-1
+
+        pymupdf_page = pymupdf_doc.load_page(page_python_format)
+
+        pdf_page_height = pymupdf_page.rect.height
+        pdf_page_width = pymupdf_page.rect.width
+
+        image = image_paths[page_python_format]
+
+        #print("image:", image)
+
+        if isinstance(image, str):
+            image = Image.open(image)
+
+        image_page_width, image_page_height = image.size
+
+        # Create redaction annotation
+        redact_annot = SubElement(annots, 'redact')

-        # Generate unique ID
+        # Generate unique ID
         annot_id = str(uuid.uuid4())
-
+        redact_annot.set('name', annot_id)

         # Set page number (subtract 1 as PDF pages are 0-based)
-
+        redact_annot.set('page', str(int(row['page']) - 1))
+
+        # Convert coordinates
+        x1, y1, x2, y2 = convert_image_coords_to_adobe(
+            pdf_page_width,
+            pdf_page_height,
+            image_page_width,
+            image_page_height,
+            row['xmin'],
+            row['ymin'],
+            row['xmax'],
+            row['ymax']
+        )
+
+        if CUSTOM_BOX_COLOUR == "grey":
+            colour_str = "0.5,0.5,0.5"
+        else:
+            colour_str = row['color'].strip('()').replace(' ', '')

-        # Set coordinates
-
-        text_annot.set('rect', f"{row['xmin']},{row['ymin']},{row['xmax']},{row['ymax']}")
+        # Set coordinates
+        redact_annot.set('rect', f"{x1:.2f},{y1:.2f},{x2:.2f},{y2:.2f}")

-        # Set
-
-
+        # Set redaction properties
+        redact_annot.set('title', row['label']) # The type of redaction (e.g., "PERSON")
+        redact_annot.set('contents', row['text']) # The redacted text
+        redact_annot.set('subject', row['label']) # The redacted text
+        redact_annot.set('mimetype', "Form")

-        # Set
-
+        # Set appearance properties
+        redact_annot.set('border-color', colour_str) # Black border
+        redact_annot.set('repeat', 'false')
+        redact_annot.set('interior-color', colour_str)
+        #redact_annot.set('fill-color', colour_str)
+        #redact_annot.set('outline-color', colour_str)
+        redact_annot.set('overlay-color', colour_str)
+        redact_annot.set('overlay-text', row['label'])
+        redact_annot.set('opacity', "0.5")
+
+        # Add appearance dictionary
+        # appearanceDict = SubElement(redact_annot, 'appearancedict')

-        #
-
-
-
+        # # Normal appearance
+        # normal = SubElement(appearanceDict, 'normal')
+        # #normal.set('appearance', 'redact')
+
+        # # Color settings for the mark (before applying redaction)
+        # markAppearance = SubElement(redact_annot, 'markappearance')
+        # markAppearance.set('stroke-color', colour_str) # Red outline
+        # markAppearance.set('fill-color', colour_str) # Light red fill
+        # markAppearance.set('opacity', '0.5') # 50% opacity

+        # # Final redaction appearance (after applying)
+        # redactAppearance = SubElement(redact_annot, 'redactAppearance')
+        # redactAppearance.set('fillColor', colour_str) # Black fill
+        # redactAppearance.set('fontName', 'Helvetica')
+        # redactAppearance.set('fontSize', '12')
+        # redactAppearance.set('textAlignment', 'left')
+        # redactAppearance.set('textColor', colour_str) # White text
+
     # Convert to pretty XML string
     xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent=" ")

     return xml_str

-
-
-
-
-
-
-#
-
-
+def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
+    '''
+    Load in files to convert a review file into an Adobe comment file format
+    '''
+    output_paths = []
+    pdf_name = ""
+
+    if isinstance(input_files, str):
+        file_paths_list = [input_files]
+    else:
+        file_paths_list = input_files
+
+    # Sort the file paths so that the pdfs come first
+    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
+
+    for file in file_paths_list:
+
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
+        file_path_name = get_file_path_end(file_path)
+        file_path_end = detect_file_type(file_path)
+
+        if file_path_end == "pdf":
+            pdf_name = os.path.basename(file_path)
+
+        if file_path_end == "csv":
+            # If no pdf name, just get the name of the file path
+            if not pdf_name:
+                pdf_name = file_path_name
+            # Read CSV file
+            df = pd.read_csv(file_path)
+
+            df.fillna('', inplace=True) # Replace NaN with an empty string
+
+            xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths)
+
+            output_path = output_folder + file_path_name + "_adobe.xfdf"
+
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(xfdf_content)
+
+            output_paths.append(output_path)
+
+    return output_paths
+
+
+### Convert xfdf coordinates back to image for app
+
+def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
+    '''
+    Converts coordinates from Adobe PDF space to image space.
+
+    Parameters:
+    - pdf_page_width: Width of the PDF page
+    - pdf_page_height: Height of the PDF page
+    - image_width: Width of the source image
+    - image_height: Height of the source image
+    - x1, y1, x2, y2: Coordinates in Adobe PDF space
+
+    Returns:
+    - Tuple of converted coordinates (x1, y1, x2, y2) in image space
+    '''
+
+    # Calculate scaling factors
+    scale_width = image_width / pdf_page_width
+    scale_height = image_height / pdf_page_height
+
+    # Convert coordinates
+    image_x1 = x1 * scale_width
+    image_x2 = x2 * scale_width
+
+    # Convert Y coordinates (flip vertical axis)
+    # Adobe coordinates start from bottom-left
+    image_y1 = (pdf_page_height - y1) * scale_height
+    image_y2 = (pdf_page_height - y2) * scale_height
+
+    # Make sure y1 is always less than y2 for image's coordinate system
+    if image_y1 > image_y2:
+        image_y1, image_y2 = image_y2, image_y1
+
+    return image_x1, image_y1, image_x2, image_y2
+
+def parse_xfdf(xfdf_path):
+    '''
+    Parse the XFDF file and extract redaction annotations.
+
+    Parameters:
+    - xfdf_path: Path to the XFDF file
+
+    Returns:
+    - List of dictionaries containing redaction information
+    '''
+    tree = parse(xfdf_path)
+    root = tree.getroot()
+
+    # Define the namespace
+    namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
+
+    redactions = []
+
+    # Find all redact elements using the namespace
+    for redact in root.findall('.//xfdf:redact', namespaces=namespace):
+
+        #print("redact:", redact)
+
+        redaction_info = {
+            'image': '', # Image will be filled in later
+            'page': int(redact.get('page')) + 1, # Convert to 1-based index
+            'xmin': float(redact.get('rect').split(',')[0]),
+            'ymin': float(redact.get('rect').split(',')[1]),
+            'xmax': float(redact.get('rect').split(',')[2]),
+            'ymax': float(redact.get('rect').split(',')[3]),
+            'label': redact.get('title'),
+            'text': redact.get('contents'),
+            'color': redact.get('border-color', '(0, 0, 0)') # Default to black if not specified
+        }
+        redactions.append(redaction_info)
+
+    print("redactions:", redactions)
+
+    return redactions
+
+def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
+    '''
+    Convert redaction annotations from XFDF and associated images into a DataFrame.
+
+    Parameters:
+    - xfdf_path: Path to the XFDF file
+    - pdf_doc: PyMuPDF document object
+    - image_paths: List of PIL Image objects corresponding to PDF pages
+
+    Returns:
+    - DataFrame containing redaction information
+    '''
+    output_paths = []
+    xfdf_paths = []
+    df = pd.DataFrame()
+
+    #print("Image paths:", image_paths)
+
+    # Sort the file paths so that the pdfs come first
+    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
+
+    for file in file_paths_list:
+
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
+        file_path_name = get_file_path_end(file_path)
+        file_path_end = detect_file_type(file_path)
+
+        if file_path_end == "pdf":
+            pdf_name = os.path.basename(file_path)
+            #print("pymupdf_doc:", pymupdf_doc)
+
+            # Add pdf to outputs
+            output_paths.append(file_path)
+
+        if file_path_end == "xfdf":
+
+            if not pdf_name:
+                message = "Original PDF needed to convert from .xfdf format"
+                print(message)
+                raise ValueError(message)
+
+            xfdf_path = file
+
+            # if isinstance(xfdf_paths, str):
+            #     xfdf_path = xfdf_paths.name
+            # else:
+            #     xfdf_path = xfdf_paths[0].name
+
+            file_path_name = get_file_path_end(xfdf_path)
+
+            #print("file_path_name:", file_path_name)
+
+            # Parse the XFDF file
+            redactions = parse_xfdf(xfdf_path)
+
+            # Create a DataFrame from the redaction information
+            df = pd.DataFrame(redactions)
+
+            df.fillna('', inplace=True) # Replace NaN with an empty string
+
+            for _, row in df.iterrows():
+                page_python_format = int(row["page"])-1
+
+                pymupdf_page = pymupdf_doc.load_page(page_python_format)
+
+                pdf_page_height = pymupdf_page.rect.height
+                pdf_page_width = pymupdf_page.rect.width
+
+                image_path = image_paths[page_python_format]
+
+                #print("image_path:", image_path)
+
+                if isinstance(image_path, str):
+                    image = Image.open(image_path)
+
+                image_page_width, image_page_height = image.size
+
+                # Convert to image coordinates
+                image_x1, image_y1, image_x2, image_y2 = convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_page_width, image_page_height, row['xmin'], row['ymin'], row['xmax'], row['ymax'])
+
+                df.loc[_, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]
+
+                # Optionally, you can add the image path or other relevant information
+                #print("Image path:", image_path)
+                df.loc[_, 'image'] = image_path
+
+                #print('row:', row)
+
+            out_file_path = output_folder + file_path_name + "_review_file.csv"
+            df.to_csv(out_file_path, index=None)
+
+            output_paths.append(out_file_path)
+
+    return output_paths
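
Both conversion directions rest on the coordinate mapping implemented by convert_image_coords_to_adobe and convert_adobe_coords_to_image above: page images use a top-left origin while Adobe PDF space uses a bottom-left origin, so boxes are rescaled by the page-to-image ratio and the y-axis is flipped against the page height. A small self-contained check of that mapping, with illustrative numbers:

# Illustrative sizes: an A4 page in PDF points and its image rendered at 2x scale
pdf_w, pdf_h = 595.0, 842.0
img_w, img_h = 1190.0, 1684.0

# A box in image coordinates (top-left origin)
x1, y1, x2, y2 = 100.0, 200.0, 300.0, 250.0

# Image -> Adobe: rescale, then flip y against the page height
scale_w, scale_h = pdf_w / img_w, pdf_h / img_h
ax1, ax2 = x1 * scale_w, x2 * scale_w
ay1, ay2 = sorted((pdf_h - y1 * scale_h, pdf_h - y2 * scale_h))
print((ax1, ay1, ax2, ay2))   # (50.0, 717.0, 150.0, 742.0)

# Adobe -> image: the inverse mapping recovers the original box
bx1, bx2 = ax1 / scale_w, ax2 / scale_w
by1, by2 = sorted(((pdf_h - ay2) / scale_h, (pdf_h - ay1) / scale_h))
print((bx1, by1, bx2, by2))   # (100.0, 200.0, 300.0, 250.0)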