pierreguillou committed on
Commit
17b7e45
·
1 Parent(s): 44970cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -11
app.py CHANGED
@@ -11,9 +11,10 @@ import collections
11
  from datasets import load_dataset
12
 
13
  dataset_small = load_dataset("pierreguillou/DocLayNet-small")
14
- #dataset_base = load_dataset("pierreguillou/DocLayNet-base")
15
 
16
  id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
 
17
  labels = [label for idx, label in id2label.items()]
18
 
19
  # need to change the coordinates format
@@ -112,27 +113,27 @@ def generate_annotated_image(dataset_name, split, domain, category):
112
  if domain_name != "all":
113
  example = example.filter(lambda example: example["doc_category"] == domain_name)
114
  if len(example) == 0:
115
- msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (domain: "{domain}" / dataset: "DocLayNet {dataset_name}" / split: "{split}").'
116
  example = dict()
117
 
118
  # get category
119
  idx_list = list()
120
  if category != "all":
121
- for idx, categories_list in zip(example["id"], example["categories"]):
122
- if category in categories_list:
123
  idx_list.append(idx)
124
- example = example.select(idx_list)
125
- if len(example) == 0:
126
- msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (category: "{category}" / domain: "{domain}" / dataset: "DocLayNet {dataset_name}" / split: "{split}").'
 
127
  example = dict()
128
 
129
  if len(msg_error) > 0:
130
  return msg_error, images_wo_content, images_wo_content, df_paragraphs_wo_content, df_lines_wo_content
131
  else:
132
  # get random image & PDF data
133
- image_files = example["image"]
134
- index = random.randint(0, len(image_files))
135
- image = image_files[index] # original image
136
  coco_width, coco_height = example[index]["coco_width"], example[index]["coco_height"]
137
  original_width, original_height = example[index]["original_width"], example[index]["original_height"]
138
  original_filename = example[index]["original_filename"]
@@ -234,7 +235,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
234
  df_lines["texts"] = sorted_text_line_list
235
  df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
236
 
237
- msg = f'The page {page_no} of PDF "{original_filename}" (domain "{domain}") matches your parameters.'
238
 
239
  return msg, imgs["paragraphs"], imgs["lines"], df_paragraphs, df_lines
240
 
@@ -245,6 +246,8 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
245
  <div style="margin-top: 20px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset.</p></div>
246
  <div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
247
  <div><p>Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
 
 
248
  """)
249
  with gr.Row():
250
  with gr.Column():
 
11
  from datasets import load_dataset
12
 
13
  dataset_small = load_dataset("pierreguillou/DocLayNet-small")
14
+ # dataset_base = load_dataset("pierreguillou/DocLayNet-base")
15
 
16
  id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
17
+ label2id = {label:idx for idx,label in id2label.items()}
18
  labels = [label for idx, label in id2label.items()]
19
 
20
  # need to change the coordinates format
 
113
  if domain_name != "all":
114
  example = example.filter(lambda example: example["doc_category"] == domain_name)
115
  if len(example) == 0:
116
+ msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (dataset: "DocLayNet {dataset_name}" / domain: "{domain}" / split: "{split}").'
117
  example = dict()
118
 
119
  # get category
120
  idx_list = list()
121
  if category != "all":
122
+ for idx, categories_list in enumerate(example["categories"]):
123
+ if int(label2id[category]) in categories_list:
124
  idx_list.append(idx)
125
+ if len(idx_list) > 0:
126
+ example = example.select(idx_list)
127
+ else:
128
+ msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (dataset: "DocLayNet {dataset_name}" / split: "{split}" / domain: "{domain}" / category: "{category}").'
129
  example = dict()
130
 
131
  if len(msg_error) > 0:
132
  return msg_error, images_wo_content, images_wo_content, df_paragraphs_wo_content, df_lines_wo_content
133
  else:
134
  # get random image & PDF data
135
+ index = random.randint(0, len(example))
136
+ image = example[index]["image"] # original image
 
137
  coco_width, coco_height = example[index]["coco_width"], example[index]["coco_height"]
138
  original_width, original_height = example[index]["original_width"], example[index]["original_height"]
139
  original_filename = example[index]["original_filename"]
 
235
  df_lines["texts"] = sorted_text_line_list
236
  df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
237
 
238
+ msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your parameters.'
239
 
240
  return msg, imgs["paragraphs"], imgs["lines"], df_paragraphs, df_lines
241
 
 
246
  <div style="margin-top: 20px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset.</p></div>
247
  <div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
248
  <div><p>Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
249
+ <div><p>More informations about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
250
+ <div><ul><li>- <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li>- <a href="" target="_blank">Document AI | DocLayNet image viewer APP</a></li></ul></div>
251
  """)
252
  with gr.Row():
253
  with gr.Column():