pierreguillou committed
Commit ec64fd3 · 1 Parent(s): 26691c8

Update app.py

Files changed (1)
  1. app.py +51 -43
app.py CHANGED
@@ -11,7 +11,7 @@ import collections
 from datasets import load_dataset
 
 dataset_small = load_dataset("pierreguillou/DocLayNet-small")
-# dataset_base = load_dataset("pierreguillou/DocLayNet-base")
+dataset_base = load_dataset("pierreguillou/DocLayNet-base")
 
 id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
 label2id = {label:idx for idx,label in id2label.items()}
@@ -89,7 +89,7 @@ font = ImageFont.load_default()
 dataset_names = ["small", "base"]
 splits = ["all", "train", "validation", "test"]
 domains = ["all", "Financial Reports", "Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"]
-domains_names = [domain_name.lower().replace(" ", "_") for domain_name in domains]
+domains_names = [domain_name.lower().replace(" ", "_").replace("&", "and") for domain_name in domains]
 categories = labels + ["all"]
 
 # function to get a rendom image and all data from DocLayNet
@@ -113,7 +113,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
     if domain_name != "all":
         example = example.filter(lambda example: example["doc_category"] == domain_name)
         if len(example) == 0:
-            msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (dataset: "DocLayNet {dataset_name}" / domain: "{domain}" / split: "{split}").'
+            msg_error = f'There is no image with at least one labeled bounding box that matches your settings (dataset: "DocLayNet {dataset_name}" / domain: "{domain}" / split: "{split}").'
             example = dict()
 
     # get category
@@ -125,16 +125,23 @@ def generate_annotated_image(dataset_name, split, domain, category):
         if len(idx_list) > 0:
             example = example.select(idx_list)
         else:
-            msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (dataset: "DocLayNet {dataset_name}" / split: "{split}" / domain: "{domain}" / category: "{category}").'
+            msg_error = f'There is no image with at least one labeled bounding box that matches your settings (dataset: "DocLayNet {dataset_name}" / split: "{split}" / domain: "{domain}" / category: "{category}").'
             example = dict()
 
     if len(msg_error) > 0:
-        # save
+        # save PDF
+        rgba = Image.open(images_wo_content)
+        rgb = Image.new('RGB', rgba.size, (255, 255, 255)) # white background
+        rgb.paste(rgba, mask=rgba.split()[3]) # paste using alpha channel as mask
+        rgb.save("wo_content.pdf", 'PDF', resolution=100.0)
+        # save image files
         Image.open(images_wo_content).save("img_paragraphs.png")
         Image.open(images_wo_content).save("img_lines.png")
+        # save csv files
         df_paragraphs_wo_content.to_csv("paragraphs.csv", encoding="utf-8", index=False)
         df_lines_wo_content.to_csv("lines.csv", encoding="utf-8", index=False)
-        return msg_error, images_wo_content, images_wo_content, "img_paragraphs.png", "img_lines.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs.csv", visible=False), gr.File.update(value="lines.csv", visible=False)
+
+        return msg_error, "wo_content.pdf", images_wo_content, images_wo_content, "img_paragraphs.png", "img_lines.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs.csv", visible=False), gr.File.update(value="lines.csv", visible=False)
     else:
         # get random image & PDF data
         index = random.randint(0, len(example))
@@ -148,6 +155,9 @@ def generate_annotated_image(dataset_name, split, domain, category):
         # resize image to original
         image = image.resize((original_width, original_height))
 
+        # get pdf of image
+        image.save(original_filename)
+
         # get corresponding annotations
         texts = example[index]["texts"]
         bboxes_block = example[index]["bboxes_block"]
@@ -155,10 +165,6 @@ def generate_annotated_image(dataset_name, split, domain, category):
         categories = example[index]["categories"]
         domain = example[index]["doc_category"]
 
-        # get list of categories
-        categories_unique = sorted(list(set([categories_list for categories_list in categories])))
-        categories_unique = [id2label[idx] for idx in categories_unique]
-
         # convert boxes to original
         original_bboxes_block = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_block]
         original_bboxes_line = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_line]
@@ -207,7 +213,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
         sorted_category_line_list = np.array(category_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
         sorted_text_line_list = np.array(text_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
 
-        # setup images & PDf data
+        # setup images & PDF data
         columns = 2
         images = [image.copy(), image.copy()]
         num_imgs = len(images)
@@ -252,77 +258,79 @@ def generate_annotated_image(dataset_name, split, domain, category):
         # save
         df_lines.to_csv("lines.csv", encoding="utf-8", index=False)
 
-        msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your parameters.'
+        msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your settings.'
 
-        return msg, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
+        return msg, original_filename, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
 
 # gradio APP
 with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
     gr.HTML("""
     <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>DocLayNet image viewer</h1></div>
-    <div style="margin-top: 20px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset.</p></div>
-    <div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
-    <div><p>Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
-    <div><p>More informations about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
-    <div><ul><li><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li><a href="" target="_blank">Document AI | DocLayNet image viewer APP</a></li></ul></div>
+    <div style="margin-top: 40px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset and a data extraction tool.</p></div>
+    <div><p>It uses the datasets <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> (you can also run this APP in Google Colab by running this <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>).</p></div>
+    <div><p>Make your settings and the output will show 2 images of a randomly selected PDF with labeled bounding boxes, one of paragraphs and the other of lines, and their corresponding tables of texts with their labels.</p></div>
+    <div><p>For example, if you select the domain "laws_and_regulations" and the category "Caption", you will get a random PDF that corresponds to these settings (ie, it will have at least one bounding box labeled with "Caption" in the PDF).</p></div>
+    <div style="margin-top: 20px"><p>More information about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
+    <div><ul><li>- <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li>- <a href="" target="_blank">(01/31/2023) Document AI | DocLayNet image viewer APP</a></li></ul></div>
     """)
     with gr.Row():
         with gr.Column():
-            dataset_name_gr = gr.Radio(["small"], value="small", label="DocLayNet dataset")
+            dataset_name_gr = gr.Radio(dataset_names, value="small", label="DocLayNet dataset")
         with gr.Column():
             split_gr = gr.Dropdown(splits, value="all", label="Split")
         with gr.Column():
             domain_gr = gr.Dropdown(domains, value="all", label="Domain")
         with gr.Column():
             category_gr = gr.Dropdown(categories, value="all", label="Category")
-    btn = gr.Button("Display PDF image")
+    btn = gr.Button("Display labeled PDF image & data")
     with gr.Row():
-        output_msg = gr.Textbox(label="Output message")
+        with gr.Column():
+            output_msg = gr.Textbox(label="Output message")
+        with gr.Column():
+            pdf_file = gr.File(visible=True, label="PDF file (original)")
     with gr.Row():
-        # with gr.Column():
-            # json = gr.JSON(label="JSON")
         with gr.Column():
-            img_paragraphs_file = gr.File(interactive=False, visible=True)
+            img_paragraphs_file = gr.File(visible=True, label="Image file (labeled paragraphs)")
             img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs", visible=True)
         with gr.Column():
-            img_lines_file = gr.File(interactive=False, visible=True)
+            img_lines_file = gr.File(visible=True, label="Image file (labeled lines)")
             img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines", visible=True)
     with gr.Row():
         with gr.Column():
-            csv_paragraphs = gr.File(interactive=False, visible=False)
-            df_paragraphs = gr.Dataframe(
-                headers=["paragraphs", "categories", "texts", "bounding boxes"],
-                datatype=["number", "str", "str", "str"],
-                # row_count='dynamic',
-                col_count=(4, "fixed"),
-                interactive=False,
-                visible=True,
-                label="Paragraphs data",
-                type="pandas",
-                wrap=True
-            )
+            with gr.Row():
+                csv_paragraphs = gr.File(visible=False, label="CSV file (paragraphs)")
+            with gr.Row():
+                df_paragraphs = gr.Dataframe(
+                    headers=["paragraphs", "categories", "texts", "bounding boxes"],
+                    datatype=["number", "str", "str", "str"],
+                    col_count=(4, "fixed"),
+                    visible=True,
+                    label="Paragraphs data",
+                    type="pandas",
+                    wrap=True
+                )
         with gr.Column():
-            csv_lines = gr.File(interactive=False, visible=False)
+            with gr.Row():
+                csv_lines = gr.File(visible=False, label="CSV file (lines)")
+            with gr.Row():
                 df_lines = gr.Dataframe(
                     headers=["lines", "categories", "texts", "bounding boxes"],
                     datatype=["number", "str", "str", "str"],
-                    # row_count='dynamic',
                     col_count=(4, "fixed"),
-                    interactive=False,
                     visible=True,
                     label="Lines data",
                     type="pandas",
                     wrap=True
                 )
-    btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
+    btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, pdf_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
 
     gr.Markdown("## Example")
     gr.Examples(
         [["small", "all", "all", "all"]],
         [dataset_name_gr, split_gr, domain_gr, category_gr],
-        [output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
+        [output_msg, pdf_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
         fn=generate_annotated_image,
         cache_examples=True,
     )
 
-demo.launch()
+demo.launch()
 
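A note on the domains_names change: DocLayNet stores the domain of a page in its doc_category field as a lowercase, underscore-separated value (for example "laws_and_regulations", as the new help text shows), so the human-readable UI labels have to be normalized before they can be compared with it. A minimal standalone sketch of what the old and the new list comprehension produce, with the domains list copied from the diff:

# sketch only: compare the old and the new normalization of the domain labels
domains = ["all", "Financial Reports", "Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"]

# old: spaces become underscores but "&" is kept, so "Laws & Regulations" -> "laws_&_regulations"
old_names = [d.lower().replace(" ", "_") for d in domains]

# new: "&" is also mapped to "and", so "Laws & Regulations" -> "laws_and_regulations",
# which matches the doc_category values the dataset filter compares against
new_names = [d.lower().replace(" ", "_").replace("&", "and") for d in domains]

print(old_names)
print(new_names)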
 
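The error messages above come from the same filtering logic that picks a page: a datasets filter() call keeps only pages of the requested doc_category, and select() keeps only pages whose label list contains the requested category. A self-contained sketch of that pattern, assuming the column names shown in the diff; the split, domain and category values below are illustrative choices, not taken from app.py:

# sketch of the filter/select pattern used by generate_annotated_image
from datasets import load_dataset

dataset = load_dataset("pierreguillou/DocLayNet-small")
examples = dataset["validation"]

# keep only pages of one domain (doc_category holds lowercase, underscore-separated values)
examples = examples.filter(lambda ex: ex["doc_category"] == "laws_and_regulations")

# keep only pages that contain at least one bounding box labeled "Caption"
labels = dataset["train"].features["categories"].feature.names
caption_id = labels.index("Caption")
idx_list = [i for i, cats in enumerate(examples["categories"]) if caption_id in cats]
examples = examples.select(idx_list)

print(f"{len(examples)} matching pages")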
 
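The new "save PDF" block in the error branch flattens the placeholder RGBA image onto a white background before writing it out, because Pillow's PDF writer does not accept an alpha channel. The other new line, image.save(original_filename), lets Pillow pick the output format from the file extension of original_filename (presumably a .pdf name, given the "# get pdf of image" comment). A small self-contained sketch of the same conversion, with placeholder file names:

# sketch of the RGBA -> PDF conversion used in the error branch (placeholder file names)
from PIL import Image

rgba = Image.open("placeholder.png").convert("RGBA")  # make sure there is an alpha band
rgb = Image.new("RGB", rgba.size, (255, 255, 255))    # opaque white background
rgb.paste(rgba, mask=rgba.split()[3])                 # use the alpha band as the paste mask
rgb.save("placeholder.pdf", "PDF", resolution=100.0)  # write a one-page PDF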
 
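With the new pdf_file component, generate_annotated_image now returns ten values (message, PDF file, the two annotated images, their PNG files, the two dataframes and the two CSV files), and the outputs lists of btn.click and gr.Examples are updated to the same order. A minimal sketch of this Gradio 3.x pattern, where gr.File.update(...) is returned from the callback to set both the value and the visibility of a File component; the component names and CSV content below are illustrative, not from app.py:

# minimal sketch: one return value per component in outputs, with gr.File.update
# (the Gradio 3.x idiom also used in app.py) toggling a file component's visibility
import gradio as gr

def run(name):
    with open("hello.csv", "w", encoding="utf-8") as f:  # illustrative file
        f.write(f"greeting\nhello {name}\n")
    msg = f"CSV generated for {name}"
    return msg, gr.File.update(value="hello.csv", visible=True)

with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    btn = gr.Button("Run")
    output_msg = gr.Textbox(label="Output message")
    csv_file = gr.File(visible=False, label="CSV file")
    # the outputs list must match the callback's return tuple, in order
    btn.click(run, inputs=[name], outputs=[output_msg, csv_file])

demo.launch()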