"""Gradio viewer that shows a random DocLayNet page with its annotations."""

import collections
import random
from operator import itemgetter

import gradio as gr
import numpy as np
import pandas as pd
from datasets import concatenate_datasets
from datasets import load_dataset
from PIL import Image, ImageDraw, ImageFont

# download datasets
dataset_small = load_dataset("pierreguillou/DocLayNet-small")
dataset_base = load_dataset("pierreguillou/DocLayNet-base")

# class id <-> label name, taken from the "categories" feature of the dataset
id2label = dict(enumerate(dataset_small["train"].features["categories"].feature.names))
label2id = {label: idx for idx, label in id2label.items()}
labels = list(id2label.values())

# sort key for boxes stored as [left, top, ...]: top first, then left
_BOX_TOP_LEFT = itemgetter(1, 0)

# labels whose lines are joined with a space; every other label uses a newline
_SPACE_JOINED_LABELS = frozenset({"Text", "Caption", "Footnote"})


def convert_box(box):
    """Turn a (left, top, width, height) box into [left, top, right, bottom]."""
    x, y, w, h = tuple(box)
    return [x, y, x + w, y + h]


def original_box(box, original_width, original_height, coco_width, coco_height):
    """Rescale a box from the COCO page size to the original page size.

    Each coordinate is scaled by original/coco along its axis and truncated
    to an int.
    """
    return [
        int(original_width * (box[0] / coco_width)),
        int(original_height * (box[1] / coco_height)),
        int(original_width * (box[2] / coco_width)),
        int(original_height * (box[3] / coco_height)),
    ]


def _reading_order(bboxes):
    """Return the positions of ``bboxes`` sorted top-to-bottom, then left-to-right.

    Sorting positions instead of boxes keeps identical boxes distinct, so the
    caller can reorder parallel lists (categories, texts) without mixing them up.
    """
    return sorted(range(len(bboxes)), key=lambda i: _BOX_TOP_LEFT(bboxes[i]))


def get_sorted_boxes(bboxes):
    """Return ``bboxes`` sorted from page top to bottom, ties from left to right.

    The sort is stable: boxes sharing both top and left keep their input order.
    """
    return sorted(bboxes, key=_BOX_TOP_LEFT)


# categories colors
label2color = {
    'Caption': 'brown',
    'Footnote': 'orange',
    'Formula': 'gray',
    'List-item': 'yellow',
    'Page-footer': 'red',
    'Page-header': 'red',
    'Picture': 'violet',
    'Section-header': 'orange',
    'Table': 'green',
    'Text': 'blue',
    'Title': 'pink',
}

# image without content (returned together with an error message)
examples_dir = 'samples/'
images_wo_content = examples_dir + "wo_content.png"
df_paragraphs_wo_content = pd.DataFrame({
    "paragraphs": [0],
    "categories": ["no content"],
    "texts": ["no content"],
    "bounding boxes": ["no content"],
})
df_lines_wo_content = pd.DataFrame({
    "lines": [0],
    "categories": ["no content"],
    "texts": ["no content"],
    "bounding boxes": ["no content"],
})

# lists
font = ImageFont.load_default()
dataset_names = ["small", "base"]
splits = ["all", "train", "validation", "test"]
domains = ["all", "Financial Reports", "Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"]
domains_names = [domain_name.lower().replace(" ", "_") for domain_name in domains]
categories = labels + ["all"]


def _select_examples(dataset_name, split, domain, category):
    """Filter DocLayNet down to the rows matching the user's selections.

    Returns:
        ``(dataset, "")`` on success, or ``(dict(), message)`` when no row
        matches the requested split / domain / category.
    """
    dataset = dataset_small if dataset_name == "small" else dataset_base

    # get split
    if split == "all":
        dataset = concatenate_datasets([dataset["train"], dataset["validation"], dataset["test"]])
    else:
        dataset = dataset[split]

    # get domain
    domain_name = domains_names[domains.index(domain)]
    if domain_name != "all":
        dataset = dataset.filter(lambda row: row["doc_category"] == domain_name)
    if len(dataset) == 0:
        msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters ("{domain}" domain / "DocLayNet {dataset_name}" dataset splitted into "{split}").'
        return dict(), msg_error

    # get category
    if category != "all":
        # Rows store integer class ids (they are mapped through id2label when
        # displayed), so the selected label name must be converted before the
        # membership test. Dataset.select() takes row positions, hence enumerate().
        category_id = label2id.get(category)
        matching_rows = [
            row_idx
            for row_idx, category_ids in enumerate(dataset["categories"])
            if category_id in category_ids
        ]
        if not matching_rows:
            msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (category: "{category}" / domain: "{domain}" / dataset: "DocLayNet {dataset_name}" / split: "{split}").'
            return dict(), msg_error
        dataset = dataset.select(matching_rows)

    return dataset, ""


def _group_lines_by_block(block_boxes):
    """Map each distinct block box to the positions of the lines it contains.

    ``block_boxes`` holds one block box per line. Keys are box tuples in order
    of first appearance; values are the line positions sharing that box.
    """
    groups = {}
    for line_idx, box in enumerate(block_boxes):
        groups.setdefault(tuple(box), []).append(line_idx)
    return groups


def generate_annotated_image(dataset_name, split, domain, category):
    """Pick a random DocLayNet page and return it annotated, with its data.

    Args:
        dataset_name: "small" selects DocLayNet small, anything else DocLayNet base.
        split: "all" or the name of a dataset split.
        domain: one of ``domains``.
        category: a label name, or "all" for no category filter.

    Returns:
        A 5-tuple ``(message, paragraphs_image, lines_image, paragraphs_df,
        lines_df)``. When nothing matches the selection, the message explains
        why and the placeholder image path / placeholder DataFrames are returned.
    """
    examples, msg_error = _select_examples(dataset_name, split, domain, category)
    if msg_error:
        return msg_error, images_wo_content, images_wo_content, df_paragraphs_wo_content, df_lines_wo_content

    # randrange() excludes its upper bound (randint() includes it and could
    # index one past the end). The row is read once instead of once per field.
    page = examples[random.randrange(len(examples))]

    coco_width, coco_height = page["coco_width"], page["coco_height"]
    original_width, original_height = page["original_width"], page["original_height"]
    original_filename = page["original_filename"]
    page_no = page["page_no"]
    domain = page["doc_category"]

    # resize image to original
    image = page["image"].resize((original_width, original_height))

    # per-line annotations (parallel lists)
    texts = page["texts"]
    category_ids = page["categories"]

    # convert boxes to original
    original_bboxes_block = [
        original_box(convert_box(box), original_width, original_height, coco_width, coco_height)
        for box in page["bboxes_block"]
    ]
    original_bboxes_line = [
        original_box(convert_box(box), original_width, original_height, coco_width, coco_height)
        for box in page["bboxes_line"]
    ]

    ##### block boxes #####
    # one entry per distinct block box: category of its first line, and the
    # texts of all its lines joined together
    block_boxes, block_categories, block_texts = [], [], []
    for box_key, line_indexes in _group_lines_by_block(original_bboxes_block).items():
        block_category = category_ids[line_indexes[0]]
        separator = ' ' if id2label[block_category] in _SPACE_JOINED_LABELS else '\n'
        block_boxes.append(list(box_key))
        block_categories.append(block_category)
        block_texts.append(separator.join(texts[i] for i in line_indexes))

    # sort data from y = 0 to end of page (and after, x=0 to end of page when necessary)
    block_order = _reading_order(block_boxes)
    sorted_block_boxes = [block_boxes[i] for i in block_order]
    sorted_block_categories = [block_categories[i] for i in block_order]
    sorted_block_texts = [block_texts[i] for i in block_order]

    ##### line boxes ####
    line_order = _reading_order(original_bboxes_line)
    sorted_line_boxes = [original_bboxes_line[i] for i in line_order]
    sorted_line_categories = [category_ids[i] for i in line_order]
    sorted_line_texts = [texts[i] for i in line_order]

    # draw the boxes on two copies of the page: one per annotation level
    annotated = {}
    for view_name, boxes in (("paragraphs", original_bboxes_block), ("lines", original_bboxes_line)):
        canvas = image.copy()
        draw = ImageDraw.Draw(canvas)
        for box, label_idx in zip(boxes, category_ids):
            label = id2label[label_idx]
            color = label2color[label]
            draw.rectangle(box, outline=color)
            draw.text((box[0] + 10, box[1] - 10), text=label, fill=color, font=font)
        annotated[view_name] = canvas

    df_paragraphs = pd.DataFrame({
        "paragraphs": list(range(len(sorted_block_boxes))),
        "categories": [id2label[label_idx] for label_idx in sorted_block_categories],
        "texts": sorted_block_texts,
        "bounding boxes": [str(bbox) for bbox in sorted_block_boxes],
    })
    df_lines = pd.DataFrame({
        "lines": list(range(len(sorted_line_boxes))),
        "categories": [id2label[label_idx] for label_idx in sorted_line_categories],
        "texts": sorted_line_texts,
        "bounding boxes": [str(bbox) for bbox in sorted_line_boxes],
    })

    msg = f'The page {page_no} of PDF "{original_filename}" (domain "{domain}") matches your parameters.'

    return msg, annotated["paragraphs"], annotated["lines"], df_paragraphs, df_lines


# gradio APP
with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
    # NOTE(review): the banner text below carries no HTML tags, so it will
    # presumably render as one run of text - confirm whether markup was lost.
    gr.HTML("""

DocLayNet image viewer

(01/29/2023) This APP is an image viewer of the DocLayNet dataset.

It uses the datasets DocLayNet small and DocLayNet base.

Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.

""")
    with gr.Row():
        with gr.Column():
            dataset_name_gr = gr.Radio(dataset_names, value="small", label="DocLayNet dataset")
        with gr.Column():
            split_gr = gr.Dropdown(splits, value="all", label="Split")
        with gr.Column():
            domain_gr = gr.Dropdown(domains, value="all", label="Domain")
        with gr.Column():
            category_gr = gr.Dropdown(categories, value="all", label="Category")
    btn = gr.Button("Display PDF image")
    with gr.Row():
        output_msg = gr.Textbox(label="Output message")
    with gr.Row():
        with gr.Column():
            img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs")
        with gr.Column():
            img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines")
    with gr.Row():
        with gr.Column():
            df_paragraphs = gr.Dataframe(
                headers=["paragraphs", "categories", "texts", "bounding boxes"],
                datatype=["number", "str", "str", "str"],
                col_count=(4, "fixed"),
                interactive=False,
                label="Paragraphs data",
                type="pandas",
                wrap=True
            )
        with gr.Column():
            df_lines = gr.Dataframe(
                headers=["lines", "categories", "texts", "bounding boxes"],
                datatype=["number", "str", "str", "str"],
                col_count=(4, "fixed"),
                interactive=False,
                label="Lines data",
                type="pandas",
                wrap=True
            )
    btn.click(
        generate_annotated_image,
        inputs=[dataset_name_gr, split_gr, domain_gr, category_gr],
        outputs=[output_msg, img_paragraphs, img_lines, df_paragraphs, df_lines],
    )

    gr.Markdown("## Example")
    gr.Examples(
        [["small", "all", "all", "all"]],
        [dataset_name_gr, split_gr, domain_gr, category_gr],
        [output_msg, img_paragraphs, img_lines, df_paragraphs, df_lines],
        fn=generate_annotated_image,
        cache_examples=True,
    )

demo.launch()