pierreguillou committed on
Commit
7e13339
·
1 Parent(s): ec64fd3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -24
app.py CHANGED
@@ -129,19 +129,13 @@ def generate_annotated_image(dataset_name, split, domain, category):
129
  example = dict()
130
 
131
  if len(msg_error) > 0:
132
- # save PDF
133
- rgba = Image.open(images_wo_content)
134
- rgb = Image.new('RGB', rgba.size, (255, 255, 255)) # white background
135
- rgb.paste(rgba, mask=rgba.split()[3]) # paste using alpha channel as mask
136
- rgb.save("wo_content.pdf", 'PDF', resolution=100.0)
137
  # save image files
138
- Image.open(images_wo_content).save("img_paragraphs.png")
139
- Image.open(images_wo_content).save("img_lines.png")
140
  # save csv files
141
- df_paragraphs_wo_content.to_csv("paragraphs.csv", encoding="utf-8", index=False)
142
- df_lines_wo_content.to_csv("lines.csv", encoding="utf-8", index=False)
143
 
144
- return msg_error, "wo_content.pdf", images_wo_content, images_wo_content, "img_paragraphs.png", "img_lines.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs.csv", visible=False), gr.File.update(value="lines.csv", visible=False)
145
  else:
146
  # get random image & PDF data
147
  index = random.randint(0, len(example))
@@ -155,8 +149,9 @@ def generate_annotated_image(dataset_name, split, domain, category):
155
  # resize image to original
156
  image = image.resize((original_width, original_height))
157
 
158
- # get pdf of image
159
- image.save(original_filename)
 
160
 
161
  # get corresponding annotations
162
  texts = example[index]["texts"]
@@ -233,8 +228,9 @@ def generate_annotated_image(dataset_name, split, domain, category):
233
  if i == 0:
234
  imgs["paragraphs"] = img
235
 
236
- # save
237
- img.save("img_paragraphs.png")
 
238
 
239
  df_paragraphs["paragraphs"] = list(range(len(sorted_original_bboxes_block_list)))
240
  df_paragraphs["categories"] = [id2label[label_idx] for label_idx in sorted_category_block_list]
@@ -242,13 +238,15 @@ def generate_annotated_image(dataset_name, split, domain, category):
242
  df_paragraphs["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_block_list]
243
 
244
  # save
245
- df_paragraphs.to_csv("paragraphs.csv", encoding="utf-8", index=False)
 
246
 
247
  else:
248
  imgs["lines"] = img
249
 
250
  # save
251
- img.save("img_lines.png")
 
252
 
253
  df_lines["lines"] = list(range(len(sorted_original_bboxes_line_list)))
254
  df_lines["categories"] = [id2label[label_idx] for label_idx in sorted_category_line_list]
@@ -256,11 +254,12 @@ def generate_annotated_image(dataset_name, split, domain, category):
256
  df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
257
 
258
  # save
259
- df_lines.to_csv("lines.csv", encoding="utf-8", index=False)
 
260
 
261
  msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your settings.'
262
 
263
- return msg, original_filename, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
264
 
265
  # gradio APP
266
  with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
@@ -270,8 +269,7 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
270
  <div><p>It uses the datasets <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> (you can also run this APP in Google Colab by running this <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>).</p></div>
271
  <div><p>Make your settings and the output will show 2 images of a randomly selected PDF with labeled bounding boxes, one of paragraphs and the other of lines, and their corresponding tables of texts with their labels.</p></div>
272
  <div><p>For example, if you select the domain "laws_and_regulations" and the category "Caption", you will get a random PDF that corresponds to these settings (ie, it will have at least one bounding box labeled with "Caption" in the PDF).</p></div>
273
- <div style="margin-top: 20px"><p>More information about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
274
- <div><ul><li>- <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li>- <a href="" target="_blank">(01/31/2023) Document AI | DocLayNet image viewer APP</a></li></ul></div>
275
  """)
276
  with gr.Row():
277
  with gr.Column():
@@ -287,7 +285,7 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
287
  with gr.Column():
288
  output_msg = gr.Textbox(label="Output message")
289
  with gr.Column():
290
- pdf_file = gr.File(visible=True, label="PDF file (original)")
291
  with gr.Row():
292
  with gr.Column():
293
  img_paragraphs_file = gr.File(visible=True, label="Image file (labeled paragraphs)")
@@ -322,15 +320,15 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
322
  type="pandas",
323
  wrap=True
324
  )
325
- btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, pdf_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
326
 
327
  gr.Markdown("## Example")
328
  gr.Examples(
329
  [["small", "all", "all", "all"]],
330
  [dataset_name_gr, split_gr, domain_gr, category_gr],
331
- [output_msg, pdf_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
332
  fn=generate_annotated_image,
333
  cache_examples=True,
334
  )
335
 
336
- demo.launch()
 
129
  example = dict()
130
 
131
  if len(msg_error) > 0:
 
 
 
 
 
132
  # save image files
133
+ Image.open(images_wo_content).save("wo_content.png")
 
134
  # save csv files
135
+ df_paragraphs_wo_content.to_csv("paragraphs_wo_content.csv", encoding="utf-8", index=False)
136
+ df_lines_wo_content.to_csv("lines_wo_content.csv", encoding="utf-8", index=False)
137
 
138
+ return msg_error, "wo_content.png", images_wo_content, images_wo_content, "wo_content.png", "wo_content.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs_wo_content.csv", visible=False), gr.File.update(value="lines_wo_content.csv", visible=False)
139
  else:
140
  # get random image & PDF data
141
  index = random.randint(0, len(example))
 
149
  # resize image to original
150
  image = image.resize((original_width, original_height))
151
 
152
+ # get image of PDF without bounding boxes
153
+ img_file = original_filename.replace(".pdf", ".png")
154
+ image.save(img_file)
155
 
156
  # get corresponding annotations
157
  texts = example[index]["texts"]
 
228
  if i == 0:
229
  imgs["paragraphs"] = img
230
 
231
+ # save
232
+ img_paragraphs = "img_paragraphs_" + original_filename.replace(".pdf", ".png")
233
+ img.save(img_paragraphs)
234
 
235
  df_paragraphs["paragraphs"] = list(range(len(sorted_original_bboxes_block_list)))
236
  df_paragraphs["categories"] = [id2label[label_idx] for label_idx in sorted_category_block_list]
 
238
  df_paragraphs["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_block_list]
239
 
240
  # save
241
+ csv_paragraphs = "csv_paragraphs_" + original_filename.replace(".pdf", ".csv")
242
+ df_paragraphs.to_csv(csv_paragraphs, encoding="utf-8", index=False)
243
 
244
  else:
245
  imgs["lines"] = img
246
 
247
  # save
248
+ img_lines = "img_lines_" + original_filename.replace(".pdf", ".png")
249
+ img.save(img_lines)
250
 
251
  df_lines["lines"] = list(range(len(sorted_original_bboxes_line_list)))
252
  df_lines["categories"] = [id2label[label_idx] for label_idx in sorted_category_line_list]
 
254
  df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
255
 
256
  # save
257
+ csv_lines = "csv_lines_" + original_filename.replace(".pdf", ".csv")
258
+ df_lines.to_csv(csv_lines, encoding="utf-8", index=False)
259
 
260
  msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your settings.'
261
 
262
+ return msg, img_file, imgs["paragraphs"], imgs["lines"], img_paragraphs, img_lines, df_paragraphs, df_lines, gr.File.update(value=csv_paragraphs, visible=True), gr.File.update(value=csv_lines, visible=True)
263
 
264
  # gradio APP
265
  with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
 
269
  <div><p>It uses the datasets <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> (you can also run this APP in Google Colab by running this <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>).</p></div>
270
  <div><p>Make your settings and the output will show 2 images of a randomly selected PDF with labeled bounding boxes, one of paragraphs and the other of lines, and their corresponding tables of texts with their labels.</p></div>
271
  <div><p>For example, if you select the domain "laws_and_regulations" and the category "Caption", you will get a random PDF that corresponds to these settings (ie, it will have at least one bounding box labeled with "Caption" in the PDF).</p></div>
272
+ <div style="margin-top: 20px"><p>More information about the DocLayNet datasets and this APP in the following blog post: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></div>
 
273
  """)
274
  with gr.Row():
275
  with gr.Column():
 
285
  with gr.Column():
286
  output_msg = gr.Textbox(label="Output message")
287
  with gr.Column():
288
+ img_file = gr.File(visible=True, label="Image file of the PDF")
289
  with gr.Row():
290
  with gr.Column():
291
  img_paragraphs_file = gr.File(visible=True, label="Image file (labeled paragraphs)")
 
320
  type="pandas",
321
  wrap=True
322
  )
323
+ btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
324
 
325
  gr.Markdown("## Example")
326
  gr.Examples(
327
  [["small", "all", "all", "all"]],
328
  [dataset_name_gr, split_gr, domain_gr, category_gr],
329
+ [output_msg, img_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
330
  fn=generate_annotated_image,
331
  cache_examples=True,
332
  )
333
 
334
+ demo.launch()