pierreguillou committed
Commit ec64fd3 · 1 Parent(s): 26691c8

Update app.py

Files changed (1)
  1. app.py +51 -43
app.py CHANGED
@@ -11,7 +11,7 @@ import collections
 from datasets import load_dataset
 
 dataset_small = load_dataset("pierreguillou/DocLayNet-small")
-# dataset_base = load_dataset("pierreguillou/DocLayNet-base")
+dataset_base = load_dataset("pierreguillou/DocLayNet-base")
 
 id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
 label2id = {label:idx for idx,label in id2label.items()}
@@ -89,7 +89,7 @@ font = ImageFont.load_default()
 dataset_names = ["small", "base"]
 splits = ["all", "train", "validation", "test"]
 domains = ["all", "Financial Reports", "Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"]
-domains_names = [domain_name.lower().replace(" ", "_") for domain_name in domains]
+domains_names = [domain_name.lower().replace(" ", "_").replace("&", "and") for domain_name in domains]
 categories = labels + ["all"]
 
 # function to get a rendom image and all data from DocLayNet
@@ -113,7 +113,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
     if domain_name != "all":
         example = example.filter(lambda example: example["doc_category"] == domain_name)
         if len(example) == 0:
-            msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (dataset: "DocLayNet {dataset_name}" / domain: "{domain}" / split: "{split}").'
+            msg_error = f'There is no image with at least one labeled bounding box that matches your settings (dataset: "DocLayNet {dataset_name}" / domain: "{domain}" / split: "{split}").'
             example = dict()
 
     # get category
@@ -125,16 +125,23 @@ def generate_annotated_image(dataset_name, split, domain, category):
         if len(idx_list) > 0:
             example = example.select(idx_list)
         else:
-            msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (dataset: "DocLayNet {dataset_name}" / split: "{split}" / domain: "{domain}" / category: "{category}").'
+            msg_error = f'There is no image with at least one labeled bounding box that matches your settings (dataset: "DocLayNet {dataset_name}" / split: "{split}" / domain: "{domain}" / category: "{category}").'
             example = dict()
 
     if len(msg_error) > 0:
-        # save
+        # save PDF
+        rgba = Image.open(images_wo_content)
+        rgb = Image.new('RGB', rgba.size, (255, 255, 255)) # white background
+        rgb.paste(rgba, mask=rgba.split()[3]) # paste using alpha channel as mask
+        rgb.save("wo_content.pdf", 'PDF', resolution=100.0)
+        # save image files
         Image.open(images_wo_content).save("img_paragraphs.png")
         Image.open(images_wo_content).save("img_lines.png")
+        # save csv files
         df_paragraphs_wo_content.to_csv("paragraphs.csv", encoding="utf-8", index=False)
         df_lines_wo_content.to_csv("lines.csv", encoding="utf-8", index=False)
-        return msg_error, images_wo_content, images_wo_content, "img_paragraphs.png", "img_lines.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs.csv", visible=False), gr.File.update(value="lines.csv", visible=False)
+
+        return msg_error, "wo_content.pdf", images_wo_content, images_wo_content, "img_paragraphs.png", "img_lines.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs.csv", visible=False), gr.File.update(value="lines.csv", visible=False)
     else:
         # get random image & PDF data
         index = random.randint(0, len(example))
@@ -148,6 +155,9 @@ def generate_annotated_image(dataset_name, split, domain, category):
         # resize image to original
         image = image.resize((original_width, original_height))
 
+        # get pdf of image
+        image.save(original_filename)
+
         # get corresponding annotations
         texts = example[index]["texts"]
         bboxes_block = example[index]["bboxes_block"]
@@ -155,10 +165,6 @@ def generate_annotated_image(dataset_name, split, domain, category):
         categories = example[index]["categories"]
         domain = example[index]["doc_category"]
 
-        # get list of categories
-        categories_unique = sorted(list(set([categories_list for categories_list in categories])))
-        categories_unique = [id2label[idx] for idx in categories_unique]
-
         # convert boxes to original
         original_bboxes_block = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_block]
         original_bboxes_line = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_line]
@@ -207,7 +213,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
         sorted_category_line_list = np.array(category_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
         sorted_text_line_list = np.array(text_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
 
-        # setup images & PDf data
+        # setup images & PDF data
         columns = 2
         images = [image.copy(), image.copy()]
         num_imgs = len(images)
@@ -252,77 +258,79 @@ def generate_annotated_image(dataset_name, split, domain, category):
         # save
         df_lines.to_csv("lines.csv", encoding="utf-8", index=False)
 
-        msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your parameters.'
+        msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your settings.'
 
-        return msg, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
+        return msg, original_filename, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
 
 # gradio APP
 with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
     gr.HTML("""
     <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>DocLayNet image viewer</h1></div>
-    <div style="margin-top: 20px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset.</p></div>
-    <div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
-    <div><p>Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
-    <div><p>More informations about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
-    <div><ul><li><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li><a href="" target="_blank">Document AI | DocLayNet image viewer APP</a></li></ul></div>
+    <div style="margin-top: 40px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset and a data extraction tool.</p></div>
+    <div><p>It uses the datasets <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> (you can also run this APP in Google Colab by running this <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>).</p></div>
+    <div><p>Make your settings and the output will show 2 images of a randomly selected PDF with labeled bounding boxes, one of paragraphs and the other of lines, and their corresponding tables of texts with their labels.</p></div>
+    <div><p>For example, if you select the domain "laws_and_regulations" and the category "Caption", you will get a random PDF that corresponds to these settings (ie, it will have at least one bounding box labeled with "Caption" in the PDF).</p></div>
+    <div style="margin-top: 20px"><p>More information about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
+    <div><ul><li>- <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li>- <a href="" target="_blank">(01/31/2023) Document AI | DocLayNet image viewer APP</a></li></ul></div>
     """)
     with gr.Row():
         with gr.Column():
-            dataset_name_gr = gr.Radio(["small"], value="small", label="DocLayNet dataset")
+            dataset_name_gr = gr.Radio(dataset_names, value="small", label="DocLayNet dataset")
         with gr.Column():
             split_gr = gr.Dropdown(splits, value="all", label="Split")
         with gr.Column():
             domain_gr = gr.Dropdown(domains, value="all", label="Domain")
         with gr.Column():
             category_gr = gr.Dropdown(categories, value="all", label="Category")
-    btn = gr.Button("Display PDF image")
+    btn = gr.Button("Display labeled PDF image & data")
     with gr.Row():
-        output_msg = gr.Textbox(label="Output message")
+        with gr.Column():
+            output_msg = gr.Textbox(label="Output message")
+        with gr.Column():
+            pdf_file = gr.File(visible=True, label="PDF file (original)")
     with gr.Row():
-        # with gr.Column():
-            # json = gr.JSON(label="JSON")
         with gr.Column():
-            img_paragraphs_file = gr.File(interactive=False, visible=True)
+            img_paragraphs_file = gr.File(visible=True, label="Image file (labeled paragraphs)")
             img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs", visible=True)
         with gr.Column():
-            img_lines_file = gr.File(interactive=False, visible=True)
+            img_lines_file = gr.File(visible=True, label="Image file (labeled lines)")
             img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines", visible=True)
     with gr.Row():
         with gr.Column():
-            csv_paragraphs = gr.File(interactive=False, visible=False)
-            df_paragraphs = gr.Dataframe(
-                headers=["paragraphs", "categories", "texts", "bounding boxes"],
-                datatype=["number", "str", "str", "str"],
-                # row_count='dynamic',
-                col_count=(4, "fixed"),
-                interactive=False,
-                visible=True,
-                label="Paragraphs data",
-                type="pandas",
-                wrap=True
-            )
+            with gr.Row():
+                csv_paragraphs = gr.File(visible=False, label="CSV file (paragraphs)")
+            with gr.Row():
+                df_paragraphs = gr.Dataframe(
+                    headers=["paragraphs", "categories", "texts", "bounding boxes"],
+                    datatype=["number", "str", "str", "str"],
+                    col_count=(4, "fixed"),
+                    visible=True,
+                    label="Paragraphs data",
+                    type="pandas",
+                    wrap=True
+                )
         with gr.Column():
-            csv_lines = gr.File(interactive=False, visible=False)
+            with gr.Row():
+                csv_lines = gr.File(visible=False, label="CSV file (lines)")
+            with gr.Row():
                 df_lines = gr.Dataframe(
                     headers=["lines", "categories", "texts", "bounding boxes"],
                     datatype=["number", "str", "str", "str"],
-                    # row_count='dynamic',
                     col_count=(4, "fixed"),
-                    interactive=False,
                     visible=True,
                     label="Lines data",
                     type="pandas",
                     wrap=True
                 )
-    btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
+    btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, pdf_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
 
     gr.Markdown("## Example")
     gr.Examples(
         [["small", "all", "all", "all"]],
         [dataset_name_gr, split_gr, domain_gr, category_gr],
-        [output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
+        [output_msg, pdf_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
         fn=generate_annotated_image,
         cache_examples=True,
     )
 
-demo.launch()
+demo.launch()
 
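A note on the domains_names change: DocLayNet stores the domain of a page in its doc_category field as a lowercase, underscore-separated value (for example "laws_and_regulations", as the new help text shows), so the human-readable UI labels have to be normalized before they can be compared with it. A minimal standalone sketch of what the old and the new list comprehension produce, with the domains list copied from the diff:

# sketch only: compare the old and the new normalization of the domain labels
domains = ["all", "Financial Reports", "Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"]

# old: spaces become underscores but "&" is kept, so "Laws & Regulations" -> "laws_&_regulations"
old_names = [d.lower().replace(" ", "_") for d in domains]

# new: "&" is also mapped to "and", so "Laws & Regulations" -> "laws_and_regulations",
# which matches the doc_category values the dataset filter compares against
new_names = [d.lower().replace(" ", "_").replace("&", "and") for d in domains]

print(old_names)
print(new_names)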
 
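The error messages above come from the same filtering logic that picks a page: a datasets filter() call keeps only pages of the requested doc_category, and select() keeps only pages whose label list contains the requested category. A self-contained sketch of that pattern, assuming the column names shown in the diff; the split, domain and category values below are illustrative choices, not taken from app.py:

# sketch of the filter/select pattern used by generate_annotated_image
from datasets import load_dataset

dataset = load_dataset("pierreguillou/DocLayNet-small")
examples = dataset["validation"]

# keep only pages of one domain (doc_category holds lowercase, underscore-separated values)
examples = examples.filter(lambda ex: ex["doc_category"] == "laws_and_regulations")

# keep only pages that contain at least one bounding box labeled "Caption"
labels = dataset["train"].features["categories"].feature.names
caption_id = labels.index("Caption")
idx_list = [i for i, cats in enumerate(examples["categories"]) if caption_id in cats]
examples = examples.select(idx_list)

print(f"{len(examples)} matching pages")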
 
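The new "save PDF" block in the error branch flattens the placeholder RGBA image onto a white background before writing it out, because Pillow's PDF writer does not accept an alpha channel. The other new line, image.save(original_filename), lets Pillow pick the output format from the file extension of original_filename (presumably a .pdf name, given the "# get pdf of image" comment). A small self-contained sketch of the same conversion, with placeholder file names:

# sketch of the RGBA -> PDF conversion used in the error branch (placeholder file names)
from PIL import Image

rgba = Image.open("placeholder.png").convert("RGBA")  # make sure there is an alpha band
rgb = Image.new("RGB", rgba.size, (255, 255, 255))    # opaque white background
rgb.paste(rgba, mask=rgba.split()[3])                 # use the alpha band as the paste mask
rgb.save("placeholder.pdf", "PDF", resolution=100.0)  # write a one-page PDF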
 
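With the new pdf_file component, generate_annotated_image now returns ten values (message, PDF file, the two annotated images, their PNG files, the two dataframes and the two CSV files), and the outputs lists of btn.click and gr.Examples are updated to the same order. A minimal sketch of this Gradio 3.x pattern, where gr.File.update(...) is returned from the callback to set both the value and the visibility of a File component; the component names and CSV content below are illustrative, not from app.py:

# minimal sketch: one return value per component in outputs, with gr.File.update
# (the Gradio 3.x idiom also used in app.py) toggling a file component's visibility
import gradio as gr

def run(name):
    with open("hello.csv", "w", encoding="utf-8") as f:  # illustrative file
        f.write(f"greeting\nhello {name}\n")
    msg = f"CSV generated for {name}"
    return msg, gr.File.update(value="hello.csv", visible=True)

with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    btn = gr.Button("Run")
    output_msg = gr.Textbox(label="Output message")
    csv_file = gr.File(visible=False, label="CSV file")
    # the outputs list must match the callback's return tuple, in order
    btn.click(run, inputs=[name], outputs=[output_msg, csv_file])

demo.launch()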