Spaces:
Runtime error
Runtime error
Commit
·
ec64fd3
1
Parent(s):
26691c8
Update app.py
Browse files
app.py
CHANGED
@@ -11,7 +11,7 @@ import collections
|
|
11 |
from datasets import load_dataset
|
12 |
|
13 |
dataset_small = load_dataset("pierreguillou/DocLayNet-small")
|
14 |
-
|
15 |
|
16 |
id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
|
17 |
label2id = {label:idx for idx,label in id2label.items()}
|
@@ -89,7 +89,7 @@ font = ImageFont.load_default()
|
|
89 |
dataset_names = ["small", "base"]
|
90 |
splits = ["all", "train", "validation", "test"]
|
91 |
domains = ["all", "Financial Reports", "Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"]
|
92 |
-
domains_names = [domain_name.lower().replace(" ", "_") for domain_name in domains]
|
93 |
categories = labels + ["all"]
|
94 |
|
95 |
# function to get a random image and all data from DocLayNet
|
@@ -113,7 +113,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
113 |
if domain_name != "all":
|
114 |
example = example.filter(lambda example: example["doc_category"] == domain_name)
|
115 |
if len(example) == 0:
|
116 |
-
msg_error = f'There is no image with at least one
|
117 |
example = dict()
|
118 |
|
119 |
# get category
|
@@ -125,16 +125,23 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
125 |
if len(idx_list) > 0:
|
126 |
example = example.select(idx_list)
|
127 |
else:
|
128 |
-
msg_error = f'There is no image with at least one
|
129 |
example = dict()
|
130 |
|
131 |
if len(msg_error) > 0:
|
132 |
-
# save
|
|
|
|
|
|
|
|
|
|
|
133 |
Image.open(images_wo_content).save("img_paragraphs.png")
|
134 |
Image.open(images_wo_content).save("img_lines.png")
|
|
|
135 |
df_paragraphs_wo_content.to_csv("paragraphs.csv", encoding="utf-8", index=False)
|
136 |
df_lines_wo_content.to_csv("lines.csv", encoding="utf-8", index=False)
|
137 |
-
|
|
|
138 |
else:
|
139 |
# get random image & PDF data
|
140 |
index = random.randint(0, len(example))
|
@@ -148,6 +155,9 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
148 |
# resize image to original
|
149 |
image = image.resize((original_width, original_height))
|
150 |
|
|
|
|
|
|
|
151 |
# get corresponding annotations
|
152 |
texts = example[index]["texts"]
|
153 |
bboxes_block = example[index]["bboxes_block"]
|
@@ -155,10 +165,6 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
155 |
categories = example[index]["categories"]
|
156 |
domain = example[index]["doc_category"]
|
157 |
|
158 |
-
# get list of categories
|
159 |
-
categories_unique = sorted(list(set([categories_list for categories_list in categories])))
|
160 |
-
categories_unique = [id2label[idx] for idx in categories_unique]
|
161 |
-
|
162 |
# convert boxes to original
|
163 |
original_bboxes_block = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_block]
|
164 |
original_bboxes_line = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_line]
|
@@ -207,7 +213,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
207 |
sorted_category_line_list = np.array(category_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
|
208 |
sorted_text_line_list = np.array(text_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
|
209 |
|
210 |
-
# setup images &
|
211 |
columns = 2
|
212 |
images = [image.copy(), image.copy()]
|
213 |
num_imgs = len(images)
|
@@ -252,77 +258,79 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
252 |
# save
|
253 |
df_lines.to_csv("lines.csv", encoding="utf-8", index=False)
|
254 |
|
255 |
-
msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your
|
256 |
|
257 |
-
return msg, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
|
258 |
|
259 |
# gradio APP
|
260 |
with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
261 |
gr.HTML("""
|
262 |
<div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>DocLayNet image viewer</h1></div>
|
263 |
-
<div style="margin-top:
|
264 |
-
<div><p>It uses the
|
265 |
-
<div><p>Make your
|
266 |
-
<div><p>
|
267 |
-
<div
|
|
|
268 |
""")
|
269 |
with gr.Row():
|
270 |
with gr.Column():
|
271 |
-
dataset_name_gr = gr.Radio(
|
272 |
with gr.Column():
|
273 |
split_gr = gr.Dropdown(splits, value="all", label="Split")
|
274 |
with gr.Column():
|
275 |
domain_gr = gr.Dropdown(domains, value="all", label="Domain")
|
276 |
with gr.Column():
|
277 |
category_gr = gr.Dropdown(categories, value="all", label="Category")
|
278 |
-
btn = gr.Button("Display PDF image")
|
279 |
with gr.Row():
|
280 |
-
|
|
|
|
|
|
|
281 |
with gr.Row():
|
282 |
-
# with gr.Column():
|
283 |
-
# json = gr.JSON(label="JSON")
|
284 |
with gr.Column():
|
285 |
-
img_paragraphs_file = gr.File(
|
286 |
img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs", visible=True)
|
287 |
with gr.Column():
|
288 |
-
img_lines_file = gr.File(
|
289 |
img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines", visible=True)
|
290 |
with gr.Row():
|
291 |
with gr.Column():
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
with gr.Column():
|
305 |
-
|
|
|
|
|
306 |
df_lines = gr.Dataframe(
|
307 |
headers=["lines", "categories", "texts", "bounding boxes"],
|
308 |
datatype=["number", "str", "str", "str"],
|
309 |
-
# row_count='dynamic',
|
310 |
col_count=(4, "fixed"),
|
311 |
-
interactive=False,
|
312 |
visible=True,
|
313 |
label="Lines data",
|
314 |
type="pandas",
|
315 |
wrap=True
|
316 |
)
|
317 |
-
btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
|
318 |
|
319 |
gr.Markdown("## Example")
|
320 |
gr.Examples(
|
321 |
[["small", "all", "all", "all"]],
|
322 |
[dataset_name_gr, split_gr, domain_gr, category_gr],
|
323 |
-
[output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
|
324 |
fn=generate_annotated_image,
|
325 |
cache_examples=True,
|
326 |
)
|
327 |
|
328 |
-
demo.launch()
|
|
|
11 |
from datasets import load_dataset
|
12 |
|
13 |
dataset_small = load_dataset("pierreguillou/DocLayNet-small")
|
14 |
+
dataset_base = load_dataset("pierreguillou/DocLayNet-base")
|
15 |
|
16 |
id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
|
17 |
label2id = {label:idx for idx,label in id2label.items()}
|
|
|
89 |
dataset_names = ["small", "base"]
|
90 |
splits = ["all", "train", "validation", "test"]
|
91 |
domains = ["all", "Financial Reports", "Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"]
|
92 |
+
domains_names = [domain_name.lower().replace(" ", "_").replace("&", "and") for domain_name in domains]
|
93 |
categories = labels + ["all"]
|
94 |
|
95 |
# function to get a random image and all data from DocLayNet
|
|
|
113 |
if domain_name != "all":
|
114 |
example = example.filter(lambda example: example["doc_category"] == domain_name)
|
115 |
if len(example) == 0:
|
116 |
+
msg_error = f'There is no image with at least one labeled bounding box that matches your settings (dataset: "DocLayNet {dataset_name}" / domain: "{domain}" / split: "{split}").'
|
117 |
example = dict()
|
118 |
|
119 |
# get category
|
|
|
125 |
if len(idx_list) > 0:
|
126 |
example = example.select(idx_list)
|
127 |
else:
|
128 |
+
msg_error = f'There is no image with at least one labeled bounding box that matches your settings (dataset: "DocLayNet {dataset_name}" / split: "{split}" / domain: "{domain}" / category: "{category}").'
|
129 |
example = dict()
|
130 |
|
131 |
if len(msg_error) > 0:
|
132 |
+
# save PDF
|
133 |
+
rgba = Image.open(images_wo_content)
|
134 |
+
rgb = Image.new('RGB', rgba.size, (255, 255, 255)) # white background
|
135 |
+
rgb.paste(rgba, mask=rgba.split()[3]) # paste using alpha channel as mask
|
136 |
+
rgb.save("wo_content.pdf", 'PDF', resolution=100.0)
|
137 |
+
# save image files
|
138 |
Image.open(images_wo_content).save("img_paragraphs.png")
|
139 |
Image.open(images_wo_content).save("img_lines.png")
|
140 |
+
# save csv files
|
141 |
df_paragraphs_wo_content.to_csv("paragraphs.csv", encoding="utf-8", index=False)
|
142 |
df_lines_wo_content.to_csv("lines.csv", encoding="utf-8", index=False)
|
143 |
+
|
144 |
+
return msg_error, "wo_content.pdf", images_wo_content, images_wo_content, "img_paragraphs.png", "img_lines.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs.csv", visible=False), gr.File.update(value="lines.csv", visible=False)
|
145 |
else:
|
146 |
# get random image & PDF data
|
147 |
index = random.randint(0, len(example))
|
|
|
155 |
# resize image to original
|
156 |
image = image.resize((original_width, original_height))
|
157 |
|
158 |
+
# get pdf of image
|
159 |
+
image.save(original_filename)
|
160 |
+
|
161 |
# get corresponding annotations
|
162 |
texts = example[index]["texts"]
|
163 |
bboxes_block = example[index]["bboxes_block"]
|
|
|
165 |
categories = example[index]["categories"]
|
166 |
domain = example[index]["doc_category"]
|
167 |
|
|
|
|
|
|
|
|
|
168 |
# convert boxes to original
|
169 |
original_bboxes_block = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_block]
|
170 |
original_bboxes_line = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_line]
|
|
|
213 |
sorted_category_line_list = np.array(category_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
|
214 |
sorted_text_line_list = np.array(text_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
|
215 |
|
216 |
+
# setup images & PDF data
|
217 |
columns = 2
|
218 |
images = [image.copy(), image.copy()]
|
219 |
num_imgs = len(images)
|
|
|
258 |
# save
|
259 |
df_lines.to_csv("lines.csv", encoding="utf-8", index=False)
|
260 |
|
261 |
+
msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your settings.'
|
262 |
|
263 |
+
return msg, original_filename, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
|
264 |
|
265 |
# gradio APP
|
266 |
with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
267 |
gr.HTML("""
|
268 |
<div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>DocLayNet image viewer</h1></div>
|
269 |
+
<div style="margin-top: 40px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset and a data extraction tool.</p></div>
|
270 |
+
<div><p>It uses the datasets <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> (you can also run this APP in Google Colab by running this <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>).</p></div>
|
271 |
+
<div><p>Make your settings and the output will show 2 images of a randomly selected PDF with labeled bounding boxes, one of paragraphs and the other of lines, and their corresponding tables of texts with their labels.</p></div>
|
272 |
+
<div><p>For example, if you select the domain "laws_and_regulations" and the category "Caption", you will get a random PDF that corresponds to these settings (ie, it will have at least one bounding box labeled with "Caption" in the PDF).</p></div>
|
273 |
+
<div style="margin-top: 20px"><p>More information about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
|
274 |
+
<div><ul><li>- <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li>- <a href="" target="_blank">(01/31/2023) Document AI | DocLayNet image viewer APP</a></li></ul></div>
|
275 |
""")
|
276 |
with gr.Row():
|
277 |
with gr.Column():
|
278 |
+
dataset_name_gr = gr.Radio(dataset_names, value="small", label="DocLayNet dataset")
|
279 |
with gr.Column():
|
280 |
split_gr = gr.Dropdown(splits, value="all", label="Split")
|
281 |
with gr.Column():
|
282 |
domain_gr = gr.Dropdown(domains, value="all", label="Domain")
|
283 |
with gr.Column():
|
284 |
category_gr = gr.Dropdown(categories, value="all", label="Category")
|
285 |
+
btn = gr.Button("Display labeled PDF image & data")
|
286 |
with gr.Row():
|
287 |
+
with gr.Column():
|
288 |
+
output_msg = gr.Textbox(label="Output message")
|
289 |
+
with gr.Column():
|
290 |
+
pdf_file = gr.File(visible=True, label="PDF file (original)")
|
291 |
with gr.Row():
|
|
|
|
|
292 |
with gr.Column():
|
293 |
+
img_paragraphs_file = gr.File(visible=True, label="Image file (labeled paragraphs)")
|
294 |
img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs", visible=True)
|
295 |
with gr.Column():
|
296 |
+
img_lines_file = gr.File(visible=True, label="Image file (labeled lines)")
|
297 |
img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines", visible=True)
|
298 |
with gr.Row():
|
299 |
with gr.Column():
|
300 |
+
with gr.Row():
|
301 |
+
csv_paragraphs = gr.File(visible=False, label="CSV file (paragraphs)")
|
302 |
+
with gr.Row():
|
303 |
+
df_paragraphs = gr.Dataframe(
|
304 |
+
headers=["paragraphs", "categories", "texts", "bounding boxes"],
|
305 |
+
datatype=["number", "str", "str", "str"],
|
306 |
+
col_count=(4, "fixed"),
|
307 |
+
visible=True,
|
308 |
+
label="Paragraphs data",
|
309 |
+
type="pandas",
|
310 |
+
wrap=True
|
311 |
+
)
|
312 |
with gr.Column():
|
313 |
+
with gr.Row():
|
314 |
+
csv_lines = gr.File(visible=False, label="CSV file (lines)")
|
315 |
+
with gr.Row():
|
316 |
df_lines = gr.Dataframe(
|
317 |
headers=["lines", "categories", "texts", "bounding boxes"],
|
318 |
datatype=["number", "str", "str", "str"],
|
|
|
319 |
col_count=(4, "fixed"),
|
|
|
320 |
visible=True,
|
321 |
label="Lines data",
|
322 |
type="pandas",
|
323 |
wrap=True
|
324 |
)
|
325 |
+
btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, pdf_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
|
326 |
|
327 |
gr.Markdown("## Example")
|
328 |
gr.Examples(
|
329 |
[["small", "all", "all", "all"]],
|
330 |
[dataset_name_gr, split_gr, domain_gr, category_gr],
|
331 |
+
[output_msg, pdf_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
|
332 |
fn=generate_annotated_image,
|
333 |
cache_examples=True,
|
334 |
)
|
335 |
|
336 |
+
demo.launch()
|