Spaces:
Runtime error
Runtime error
Commit
·
d768dd3
1
Parent(s):
230d25f
Update app.py
Browse files
app.py
CHANGED
@@ -129,7 +129,12 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
129 |
example = dict()
|
130 |
|
131 |
if len(msg_error) > 0:
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
133 |
else:
|
134 |
# get random image & PDF data
|
135 |
index = random.randint(0, len(example))
|
@@ -221,23 +226,35 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
221 |
|
222 |
if i == 0:
|
223 |
imgs["paragraphs"] = img
|
|
|
|
|
|
|
224 |
|
225 |
df_paragraphs["paragraphs"] = list(range(len(sorted_original_bboxes_block_list)))
|
226 |
df_paragraphs["categories"] = [id2label[label_idx] for label_idx in sorted_category_block_list]
|
227 |
df_paragraphs["texts"] = sorted_text_block_list
|
228 |
df_paragraphs["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_block_list]
|
229 |
|
|
|
|
|
|
|
230 |
else:
|
231 |
imgs["lines"] = img
|
232 |
|
|
|
|
|
|
|
233 |
df_lines["lines"] = list(range(len(sorted_original_bboxes_line_list)))
|
234 |
df_lines["categories"] = [id2label[label_idx] for label_idx in sorted_category_line_list]
|
235 |
df_lines["texts"] = sorted_text_line_list
|
236 |
df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
|
|
|
|
|
|
|
237 |
|
238 |
msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your parameters.'
|
239 |
|
240 |
-
return msg, imgs["paragraphs"], imgs["lines"], df_paragraphs, df_lines
|
241 |
|
242 |
# gradio APP
|
243 |
with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
@@ -247,7 +264,7 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
|
247 |
<div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
|
248 |
<div><p>Make your parameter selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
|
249 |
<div><p>More information about the DocLayNet datasets and this APP can be found in the following 2 blog posts:</p></div>
|
250 |
-
<div><ul><li
|
251 |
""")
|
252 |
with gr.Row():
|
253 |
with gr.Column():
|
@@ -265,40 +282,45 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
|
265 |
# with gr.Column():
|
266 |
# json = gr.JSON(label="JSON")
|
267 |
with gr.Column():
|
268 |
-
|
|
|
269 |
with gr.Column():
|
270 |
-
|
271 |
-
|
272 |
with gr.Row():
|
273 |
with gr.Column():
|
|
|
274 |
df_paragraphs = gr.Dataframe(
|
275 |
headers=["paragraphs", "categories", "texts", "bounding boxes"],
|
276 |
datatype=["number", "str", "str", "str"],
|
277 |
# row_count='dynamic',
|
278 |
col_count=(4, "fixed"),
|
279 |
interactive=False,
|
|
|
280 |
label="Paragraphs data",
|
281 |
type="pandas",
|
282 |
wrap=True
|
283 |
)
|
284 |
with gr.Column():
|
|
|
285 |
df_lines = gr.Dataframe(
|
286 |
headers=["lines", "categories", "texts", "bounding boxes"],
|
287 |
datatype=["number", "str", "str", "str"],
|
288 |
# row_count='dynamic',
|
289 |
col_count=(4, "fixed"),
|
290 |
interactive=False,
|
|
|
291 |
label="Lines data",
|
292 |
type="pandas",
|
293 |
wrap=True
|
294 |
)
|
295 |
-
btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_paragraphs, img_lines, df_paragraphs, df_lines])
|
296 |
|
297 |
gr.Markdown("## Example")
|
298 |
gr.Examples(
|
299 |
[["small", "all", "all", "all"]],
|
300 |
[dataset_name_gr, split_gr, domain_gr, category_gr],
|
301 |
-
[output_msg, img_paragraphs, img_lines, df_paragraphs, df_lines],
|
302 |
fn=generate_annotated_image,
|
303 |
cache_examples=True,
|
304 |
)
|
|
|
129 |
example = dict()
|
130 |
|
131 |
if len(msg_error) > 0:
|
132 |
+
# save
|
133 |
+
images_wo_content.save("img_paragraphs.png")
|
134 |
+
images_wo_content.save("img_lines.png")
|
135 |
+
df_paragraphs_wo_content.to_csv("paragraphs.csv", encoding="utf-8", index=False)
|
136 |
+
df_lines_wo_content.to_csv("lines.csv", encoding="utf-8", index=False)
|
137 |
+
return msg_error, images_wo_content, images_wo_content, "img_paragraphs.png", "img_lines.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs.csv", visible=False), gr.File.update(value="lines.csv", visible=False)
|
138 |
else:
|
139 |
# get random image & PDF data
|
140 |
index = random.randint(0, len(example))
|
|
|
226 |
|
227 |
if i == 0:
|
228 |
imgs["paragraphs"] = img
|
229 |
+
|
230 |
+
# save
|
231 |
+
img.save("img_paragraphs.png")
|
232 |
|
233 |
df_paragraphs["paragraphs"] = list(range(len(sorted_original_bboxes_block_list)))
|
234 |
df_paragraphs["categories"] = [id2label[label_idx] for label_idx in sorted_category_block_list]
|
235 |
df_paragraphs["texts"] = sorted_text_block_list
|
236 |
df_paragraphs["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_block_list]
|
237 |
|
238 |
+
# save
|
239 |
+
df_paragraphs.to_csv("paragraphs.csv", encoding="utf-8", index=False)
|
240 |
+
|
241 |
else:
|
242 |
imgs["lines"] = img
|
243 |
|
244 |
+
# save
|
245 |
+
img.save("img_lines.png")
|
246 |
+
|
247 |
df_lines["lines"] = list(range(len(sorted_original_bboxes_line_list)))
|
248 |
df_lines["categories"] = [id2label[label_idx] for label_idx in sorted_category_line_list]
|
249 |
df_lines["texts"] = sorted_text_line_list
|
250 |
df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
|
251 |
+
|
252 |
+
# save
|
253 |
+
df_lines.to_csv("lines.csv", encoding="utf-8", index=False)
|
254 |
|
255 |
msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your parameters.'
|
256 |
|
257 |
+
return msg, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
|
258 |
|
259 |
# gradio APP
|
260 |
with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
|
|
264 |
<div><p>It uses the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> (in the corresponding <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>, it is possible to use the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> as well if the notebook is run in a system setup more powerful than Google Colab).</p></div>
|
265 |
<div><p>Make your parameter selections and the output will show 2 images of a randomly selected PDF with annotated bounding boxes, one of paragraphs and the other of lines, and a table of texts with their labels.</p></div>
|
266 |
<div><p>More information about the DocLayNet datasets and this APP can be found in the following 2 blog posts:</p></div>
|
267 |
+
<div><ul><li><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li><a href="" target="_blank">Document AI | DocLayNet image viewer APP</a></li></ul></div>
|
268 |
""")
|
269 |
with gr.Row():
|
270 |
with gr.Column():
|
|
|
282 |
# with gr.Column():
|
283 |
# json = gr.JSON(label="JSON")
|
284 |
with gr.Column():
|
285 |
+
img_paragraphs_file = gr.File(interactive=False, visible=True)
|
286 |
+
img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs", visible=True)
|
287 |
with gr.Column():
|
288 |
+
img_lines_file = gr.File(interactive=False, visible=True)
|
289 |
+
img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines", visible=True)
|
290 |
with gr.Row():
|
291 |
with gr.Column():
|
292 |
+
csv_paragraphs = gr.File(interactive=False, visible=False)
|
293 |
df_paragraphs = gr.Dataframe(
|
294 |
headers=["paragraphs", "categories", "texts", "bounding boxes"],
|
295 |
datatype=["number", "str", "str", "str"],
|
296 |
# row_count='dynamic',
|
297 |
col_count=(4, "fixed"),
|
298 |
interactive=False,
|
299 |
+
visible=True,
|
300 |
label="Paragraphs data",
|
301 |
type="pandas",
|
302 |
wrap=True
|
303 |
)
|
304 |
with gr.Column():
|
305 |
+
csv_lines = gr.File(interactive=False, visible=False)
|
306 |
df_lines = gr.Dataframe(
|
307 |
headers=["lines", "categories", "texts", "bounding boxes"],
|
308 |
datatype=["number", "str", "str", "str"],
|
309 |
# row_count='dynamic',
|
310 |
col_count=(4, "fixed"),
|
311 |
interactive=False,
|
312 |
+
visible=True,
|
313 |
label="Lines data",
|
314 |
type="pandas",
|
315 |
wrap=True
|
316 |
)
|
317 |
+
btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
|
318 |
|
319 |
gr.Markdown("## Example")
|
320 |
gr.Examples(
|
321 |
[["small", "all", "all", "all"]],
|
322 |
[dataset_name_gr, split_gr, domain_gr, category_gr],
|
323 |
+
[output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
|
324 |
fn=generate_annotated_image,
|
325 |
cache_examples=True,
|
326 |
)
|