Spaces:

pierreguillou
/

DocLayNet-image-viewer

Runtime error

App Files Files Community

pierreguillou committed on Jan 30, 2023

Commit

d768dd3

1 Parent(s): 230d25f

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -8

app.py CHANGED Viewed

@@ -129,7 +129,12 @@ def generate_annotated_image(dataset_name, split, domain, category):
       example = dict()
   if len(msg_error) > 0:
-    return msg_error, images_wo_content, images_wo_content, df_paragraphs_wo_content, df_lines_wo_content
   else:
     # get random image & PDF data
     index = random.randint(0, len(example))
@@ -221,23 +226,35 @@ def generate_annotated_image(dataset_name, split, domain, category):
         if i == 0:
           imgs["paragraphs"] = img
           df_paragraphs["paragraphs"] = list(range(len(sorted_original_bboxes_block_list)))
           df_paragraphs["categories"] = [id2label[label_idx] for label_idx in sorted_category_block_list]
           df_paragraphs["texts"] = sorted_text_block_list
           df_paragraphs["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_block_list]
         else:
           imgs["lines"] = img
           df_lines["lines"] = list(range(len(sorted_original_bboxes_line_list)))
           df_lines["categories"] = [id2label[label_idx] for label_idx in sorted_category_line_list]
           df_lines["texts"] = sorted_text_line_list
           df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
     msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your parameters.'
-    return msg, imgs["paragraphs"], imgs["lines"], df_paragraphs, df_lines
 # gradio APP
 with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
@@ -247,7 +264,7 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
     <div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
     <div><p>Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
     <div><p>More informations about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
-    <div><ul><li>- <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li>- <a href="" target="_blank">Document AI | DocLayNet image viewer APP</a></li></ul></div>
     """)
     with gr.Row():
         with gr.Column():
@@ -265,40 +282,45 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
         # with gr.Column():
         #   json = gr.JSON(label="JSON")
         with gr.Column():
-          img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs")
         with gr.Column():
-          img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines")
     with gr.Row():
       with gr.Column():
         df_paragraphs = gr.Dataframe(
             headers=["paragraphs", "categories", "texts", "bounding boxes"],
             datatype=["number", "str", "str", "str"],
             # row_count='dynamic',
             col_count=(4, "fixed"),
             interactive=False,
             label="Paragraphs data",
             type="pandas",
             wrap=True
           )
       with gr.Column():
           df_lines = gr.Dataframe(
               headers=["lines", "categories", "texts", "bounding boxes"],
               datatype=["number", "str", "str", "str"],
               # row_count='dynamic',
               col_count=(4, "fixed"),
               interactive=False,
               label="Lines data",
               type="pandas",
               wrap=True
             )
-    btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_paragraphs, img_lines, df_paragraphs, df_lines])
     gr.Markdown("## Example")
     gr.Examples(
         [["small", "all", "all", "all"]],
         [dataset_name_gr, split_gr, domain_gr, category_gr],
-        [output_msg, img_paragraphs, img_lines, df_paragraphs, df_lines],
         fn=generate_annotated_image,
         cache_examples=True,
     )

       example = dict()
   if len(msg_error) > 0:
+    # save
+    images_wo_content.save("img_paragraphs.png")
+    images_wo_content.save("img_lines.png")
+    df_paragraphs_wo_content.to_csv("paragraphs.csv", encoding="utf-8", index=False)
+    df_lines_wo_content.to_csv("lines.csv", encoding="utf-8", index=False)
+    return msg_error, images_wo_content, images_wo_content, "img_paragraphs.png", "img_lines.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs.csv", visible=False), gr.File.update(value="lines.csv", visible=False)
   else:
     # get random image & PDF data
     index = random.randint(0, len(example))
         if i == 0:
           imgs["paragraphs"] = img
+          # save
+          img.save("img_paragraphs.png")
           df_paragraphs["paragraphs"] = list(range(len(sorted_original_bboxes_block_list)))
           df_paragraphs["categories"] = [id2label[label_idx] for label_idx in sorted_category_block_list]
           df_paragraphs["texts"] = sorted_text_block_list
           df_paragraphs["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_block_list]
+          # save
+          df_paragraphs.to_csv("paragraphs.csv", encoding="utf-8", index=False)
         else:
           imgs["lines"] = img
+          # save
+          img.save("img_lines.png")
           df_lines["lines"] = list(range(len(sorted_original_bboxes_line_list)))
           df_lines["categories"] = [id2label[label_idx] for label_idx in sorted_category_line_list]
           df_lines["texts"] = sorted_text_line_list
           df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
+          # save
+          df_lines.to_csv("lines.csv", encoding="utf-8", index=False)
     msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your parameters.'
+    return msg, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
 # gradio APP
 with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
     <div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
     <div><p>Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
     <div><p>More informations about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
+    <div><ul><li><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li><a href="" target="_blank">Document AI | DocLayNet image viewer APP</a></li></ul></div>
     """)
     with gr.Row():
         with gr.Column():
         # with gr.Column():
         #   json = gr.JSON(label="JSON")
         with gr.Column():
+          img_paragraphs_file = gr.File(interactive=False, visible=True)
+          img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs", visible=True)
         with gr.Column():
+          img_lines_file = gr.File(interactive=False, visible=True)
+          img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines", visible=True)
     with gr.Row():
       with gr.Column():
+        csv_paragraphs = gr.File(interactive=False, visible=False)
         df_paragraphs = gr.Dataframe(
             headers=["paragraphs", "categories", "texts", "bounding boxes"],
             datatype=["number", "str", "str", "str"],
             # row_count='dynamic',
             col_count=(4, "fixed"),
             interactive=False,
+            visible=True,
             label="Paragraphs data",
             type="pandas",
             wrap=True
           )
       with gr.Column():
+          csv_lines = gr.File(interactive=False, visible=False)
           df_lines = gr.Dataframe(
               headers=["lines", "categories", "texts", "bounding boxes"],
               datatype=["number", "str", "str", "str"],
               # row_count='dynamic',
               col_count=(4, "fixed"),
               interactive=False,
+              visible=True,
               label="Lines data",
               type="pandas",
               wrap=True
             )
+    btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
     gr.Markdown("## Example")
     gr.Examples(
         [["small", "all", "all", "all"]],
         [dataset_name_gr, split_gr, domain_gr, category_gr],
+        [output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
         fn=generate_annotated_image,
         cache_examples=True,
     )