Spaces:

pierreguillou
/

DocLayNet-image-viewer

Runtime error

App Files Community

pierreguillou committed on Jan 30, 2023

Commit

17b7e45

1 Parent(s): 44970cd

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -11

app.py CHANGED Viewed

@@ -11,9 +11,10 @@ import collections
 from datasets import load_dataset
 dataset_small = load_dataset("pierreguillou/DocLayNet-small")
-#dataset_base = load_dataset("pierreguillou/DocLayNet-base")
 id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
 labels = [label for idx, label in id2label.items()]
 # need to change the coordinates format
@@ -112,27 +113,27 @@ def generate_annotated_image(dataset_name, split, domain, category):
   if domain_name != "all":
     example = example.filter(lambda example: example["doc_category"] == domain_name)
     if len(example) == 0:
-      msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (domain: "{domain}" / dataset: "DocLayNet {dataset_name}" / split: "{split}").'
       example = dict()
   # get category
   idx_list = list()
   if category != "all":
-    for idx, categories_list in zip(example["id"], example["categories"]):
-      if category in categories_list:
         idx_list.append(idx)
-    example = example.select(idx_list)
-    if len(example) == 0:
-      msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (category: "{category}" / domain: "{domain}" / dataset: "DocLayNet {dataset_name}" / split: "{split}").'
       example = dict()
   if len(msg_error) > 0:
     return msg_error, images_wo_content, images_wo_content, df_paragraphs_wo_content, df_lines_wo_content
   else:
     # get random image & PDF data
-    image_files = example["image"]
-    index = random.randint(0, len(image_files))
-    image = image_files[index] # original image
     coco_width, coco_height = example[index]["coco_width"], example[index]["coco_height"]
     original_width, original_height = example[index]["original_width"], example[index]["original_height"]
     original_filename = example[index]["original_filename"]
@@ -234,7 +235,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
           df_lines["texts"] = sorted_text_line_list
           df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
-    msg = f'The page {page_no} of PDF "{original_filename}" (domain "{domain}") matches your parameters.'
     return msg, imgs["paragraphs"], imgs["lines"], df_paragraphs, df_lines
@@ -245,6 +246,8 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
     <div style="margin-top: 20px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset.</p></div>
     <div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
     <div><p>Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
     """)
     with gr.Row():
         with gr.Column():

 from datasets import load_dataset
 dataset_small = load_dataset("pierreguillou/DocLayNet-small")
+# dataset_base = load_dataset("pierreguillou/DocLayNet-base")
 id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
+label2id = {label:idx for idx,label in id2label.items()}
 labels = [label for idx, label in id2label.items()]
 # need to change the coordinates format
   if domain_name != "all":
     example = example.filter(lambda example: example["doc_category"] == domain_name)
     if len(example) == 0:
+      msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (dataset: "DocLayNet {dataset_name}" / domain: "{domain}" /  split: "{split}").'
       example = dict()
   # get category
   idx_list = list()
   if category != "all":
+    for idx, categories_list in enumerate(example["categories"]):
+      if int(label2id[category]) in categories_list:
         idx_list.append(idx)
+    if len(idx_list) > 0:
+      example = example.select(idx_list)
+    else:
+      msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (dataset: "DocLayNet {dataset_name}" / split: "{split}" / domain: "{domain}" / category: "{category}").'
       example = dict()
   if len(msg_error) > 0:
     return msg_error, images_wo_content, images_wo_content, df_paragraphs_wo_content, df_lines_wo_content
   else:
     # get random image & PDF data
+    index = random.randint(0, len(example))
+    image = example[index]["image"] # original image
     coco_width, coco_height = example[index]["coco_width"], example[index]["coco_height"]
     original_width, original_height = example[index]["original_width"], example[index]["original_height"]
     original_filename = example[index]["original_filename"]
           df_lines["texts"] = sorted_text_line_list
           df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
+    msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your parameters.'
     return msg, imgs["paragraphs"], imgs["lines"], df_paragraphs, df_lines
     <div style="margin-top: 20px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset.</p></div>
     <div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
     <div><p>Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
+    <div><p>More informations about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
+    <div><ul><li>- <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li>- <a href="" target="_blank">Document AI | DocLayNet image viewer APP</a></li></ul></div>
     """)
     with gr.Row():
         with gr.Column():