Spaces:
Runtime error
Runtime error
Commit
·
17b7e45
1
Parent(s):
44970cd
Update app.py
Browse files
app.py
CHANGED
@@ -11,9 +11,10 @@ import collections
|
|
11 |
from datasets import load_dataset
|
12 |
|
13 |
dataset_small = load_dataset("pierreguillou/DocLayNet-small")
|
14 |
-
#dataset_base = load_dataset("pierreguillou/DocLayNet-base")
|
15 |
|
16 |
id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
|
|
|
17 |
labels = [label for idx, label in id2label.items()]
|
18 |
|
19 |
# need to change the coordinates format
|
@@ -112,27 +113,27 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
112 |
if domain_name != "all":
|
113 |
example = example.filter(lambda example: example["doc_category"] == domain_name)
|
114 |
if len(example) == 0:
|
115 |
-
msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (
|
116 |
example = dict()
|
117 |
|
118 |
# get category
|
119 |
idx_list = list()
|
120 |
if category != "all":
|
121 |
-
for idx, categories_list in
|
122 |
-
if category in categories_list:
|
123 |
idx_list.append(idx)
|
124 |
-
|
125 |
-
|
126 |
-
|
|
|
127 |
example = dict()
|
128 |
|
129 |
if len(msg_error) > 0:
|
130 |
return msg_error, images_wo_content, images_wo_content, df_paragraphs_wo_content, df_lines_wo_content
|
131 |
else:
|
132 |
# get random image & PDF data
|
133 |
-
|
134 |
-
|
135 |
-
image = image_files[index] # original image
|
136 |
coco_width, coco_height = example[index]["coco_width"], example[index]["coco_height"]
|
137 |
original_width, original_height = example[index]["original_width"], example[index]["original_height"]
|
138 |
original_filename = example[index]["original_filename"]
|
@@ -234,7 +235,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
234 |
df_lines["texts"] = sorted_text_line_list
|
235 |
df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
|
236 |
|
237 |
-
msg = f'The page {page_no} of PDF "{original_filename}" (domain "{domain}") matches your parameters.'
|
238 |
|
239 |
return msg, imgs["paragraphs"], imgs["lines"], df_paragraphs, df_lines
|
240 |
|
@@ -245,6 +246,8 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
|
245 |
<div style="margin-top: 20px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset.</p></div>
|
246 |
<div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
|
247 |
<div><p>Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
|
|
|
|
|
248 |
""")
|
249 |
with gr.Row():
|
250 |
with gr.Column():
|
|
|
11 |
from datasets import load_dataset
|
12 |
|
13 |
dataset_small = load_dataset("pierreguillou/DocLayNet-small")
|
14 |
+
# dataset_base = load_dataset("pierreguillou/DocLayNet-base")
|
15 |
|
16 |
id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
|
17 |
+
label2id = {label:idx for idx,label in id2label.items()}
|
18 |
labels = [label for idx, label in id2label.items()]
|
19 |
|
20 |
# need to change the coordinates format
|
|
|
113 |
if domain_name != "all":
|
114 |
example = example.filter(lambda example: example["doc_category"] == domain_name)
|
115 |
if len(example) == 0:
|
116 |
+
msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (dataset: "DocLayNet {dataset_name}" / domain: "{domain}" / split: "{split}").'
|
117 |
example = dict()
|
118 |
|
119 |
# get category
|
120 |
idx_list = list()
|
121 |
if category != "all":
|
122 |
+
for idx, categories_list in enumerate(example["categories"]):
|
123 |
+
if int(label2id[category]) in categories_list:
|
124 |
idx_list.append(idx)
|
125 |
+
if len(idx_list) > 0:
|
126 |
+
example = example.select(idx_list)
|
127 |
+
else:
|
128 |
+
msg_error = f'There is no image with at least one annotated bounding box that matches to your parameters (dataset: "DocLayNet {dataset_name}" / split: "{split}" / domain: "{domain}" / category: "{category}").'
|
129 |
example = dict()
|
130 |
|
131 |
if len(msg_error) > 0:
|
132 |
return msg_error, images_wo_content, images_wo_content, df_paragraphs_wo_content, df_lines_wo_content
|
133 |
else:
|
134 |
# get random image & PDF data
|
135 |
+
# random.randint() includes BOTH endpoints, so randint(0, len(example)) can
# return len(example) and make example[index] raise IndexError; randrange()
# excludes the upper bound and always yields a valid row index.
index = random.randrange(len(example))
|
136 |
+
image = example[index]["image"] # original image
|
|
|
137 |
coco_width, coco_height = example[index]["coco_width"], example[index]["coco_height"]
|
138 |
original_width, original_height = example[index]["original_width"], example[index]["original_height"]
|
139 |
original_filename = example[index]["original_filename"]
|
|
|
235 |
df_lines["texts"] = sorted_text_line_list
|
236 |
df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
|
237 |
|
238 |
+
msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your parameters.'
|
239 |
|
240 |
return msg, imgs["paragraphs"], imgs["lines"], df_paragraphs, df_lines
|
241 |
|
|
|
246 |
<div style="margin-top: 20px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset.</p></div>
|
247 |
<div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
|
248 |
<div><p>Make your parameters selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
|
249 |
+
<div><p>More information about the DocLayNet datasets and this APP can be found in the following 2 blog posts:</p></div>
|
250 |
+
<div><ul><li>- <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li>- <a href="" target="_blank">Document AI | DocLayNet image viewer APP</a></li></ul></div>
|
251 |
""")
|
252 |
with gr.Row():
|
253 |
with gr.Column():
|