Spaces:

Duplicated from pierreguillou/Inference-APP-Document-Understanding-at-linelevel-LiLT-base-LayoutXLM-base-v1

pierreguillou
/

Inference-APP-Document-Understanding-at-paragraphlevel-v3

Runtime error

App Files Files Community

pierreguillou committed on Apr 4, 2023

Commit

c41516f

·

1 Parent(s): ce38f80

Update files/functions.py

Files changed (1) hide show

files/functions.py +16 -5

files/functions.py CHANGED Viewed

@@ -25,8 +25,7 @@ import pypdf
 from pypdf import PdfReader
 from pypdf.errors import PdfReadError
-import pdf2image
-from pdf2image import convert_from_path
 import langdetect
 from langdetect import detect_langs
@@ -109,7 +108,7 @@ from huggingface_hub import hf_hub_download
 files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
 for file_name in files:
     path_to_file = hf_hub_download(
-        repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-LiLT-base-LayoutXLM-base-v1",
         filename = "files/" + file_name,
         repo_type = "space"
         )
@@ -424,7 +423,7 @@ def pdf_to_images(uploaded_pdf):
     else:
         # path to the uploaded PDF
         path_to_file = uploaded_pdf.name
-        filename = path_to_file.replace("/tmp/","")
         try:
             PdfReader(path_to_file)
@@ -435,7 +434,19 @@ def pdf_to_images(uploaded_pdf):
             images = [Image.open(image_blank)]
         else:
             try:
-                images = convert_from_path(path_to_file, last_page=max_imgboxes)
                 num_imgs = len(images)
                 msg = f'The PDF "{filename}" was converted into {num_imgs} images.'
             except:

 from pypdf import PdfReader
 from pypdf.errors import PdfReadError
+import pypdfium2 as pdfium
 import langdetect
 from langdetect import detect_langs
 files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
 for file_name in files:
     path_to_file = hf_hub_download(
+        repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v3",
         filename = "files/" + file_name,
         repo_type = "space"
         )
     else:
         # path to the uploaded PDF
         path_to_file = uploaded_pdf.name
+        filename = path_to_file# .replace("/tmp/","")
         try:
             PdfReader(path_to_file)
             images = [Image.open(image_blank)]
         else:
             try:
+                # images = convert_from_path(path_to_file, last_page=max_imgboxes)
+                pdf = pdfium.PdfDocument(str(filename))
+                version = pdf.get_version()  # get the PDF standard version
+                n_pages = len(pdf)  # get the number of pages in the document
+                last_page = max_imgboxes
+                page_indices = [i for i in range(last_page)]  # pages until last_page
+                images = list(pdf.render(
+                    pdfium.PdfBitmap.to_pil,
+                    page_indices = page_indices,
+                    scale = 300/72,  # 300dpi resolution
+                ))
                 num_imgs = len(images)
                 msg = f'The PDF "{filename}" was converted into {num_imgs} images.'
             except: