Commit · c41516f
1 Parent(s): ce38f80
Update files/functions.py
Browse files
- files/functions.py +16 -5
files/functions.py
CHANGED
@@ -25,8 +25,7 @@ import pypdf
|
|
25 |
from pypdf import PdfReader
|
26 |
from pypdf.errors import PdfReadError
|
27 |
|
28 |
-
import
|
29 |
-
from pdf2image import convert_from_path
|
30 |
import langdetect
|
31 |
from langdetect import detect_langs
|
32 |
|
@@ -109,7 +108,7 @@ from huggingface_hub import hf_hub_download
|
|
109 |
files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
|
110 |
for file_name in files:
|
111 |
path_to_file = hf_hub_download(
|
112 |
-
repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-
|
113 |
filename = "files/" + file_name,
|
114 |
repo_type = "space"
|
115 |
)
|
@@ -424,7 +423,7 @@ def pdf_to_images(uploaded_pdf):
|
|
424 |
else:
|
425 |
# path to the uploaded PDF
|
426 |
path_to_file = uploaded_pdf.name
|
427 |
-
filename = path_to_file.replace("/tmp/","")
|
428 |
|
429 |
try:
|
430 |
PdfReader(path_to_file)
|
@@ -435,7 +434,19 @@ def pdf_to_images(uploaded_pdf):
|
|
435 |
images = [Image.open(image_blank)]
|
436 |
else:
|
437 |
try:
|
438 |
-
images = convert_from_path(path_to_file, last_page=max_imgboxes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
439 |
num_imgs = len(images)
|
440 |
msg = f'The PDF "{filename}" was converted into {num_imgs} images.'
|
441 |
except:
|
|
|
25 |
from pypdf import PdfReader
|
26 |
from pypdf.errors import PdfReadError
|
27 |
|
28 |
+
import pypdfium2 as pdfium
|
|
|
29 |
import langdetect
|
30 |
from langdetect import detect_langs
|
31 |
|
|
|
108 |
files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
|
109 |
for file_name in files:
|
110 |
path_to_file = hf_hub_download(
|
111 |
+
repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v3",
|
112 |
filename = "files/" + file_name,
|
113 |
repo_type = "space"
|
114 |
)
|
|
|
423 |
else:
|
424 |
# path to the uploaded PDF
|
425 |
path_to_file = uploaded_pdf.name
|
426 |
+
filename = path_to_file# .replace("/tmp/","")
|
427 |
|
428 |
try:
|
429 |
PdfReader(path_to_file)
|
|
|
434 |
images = [Image.open(image_blank)]
|
435 |
else:
|
436 |
try:
|
437 |
+
# images = convert_from_path(path_to_file, last_page=max_imgboxes)
|
438 |
+
|
439 |
+
pdf = pdfium.PdfDocument(str(filename))
|
440 |
+
version = pdf.get_version() # get the PDF standard version
|
441 |
+
n_pages = len(pdf) # get the number of pages in the document
|
442 |
+
last_page = max_imgboxes
|
443 |
+
page_indices = [i for i in range(last_page)] # pages until last_page
|
444 |
+
images = list(pdf.render(
|
445 |
+
pdfium.PdfBitmap.to_pil,
|
446 |
+
page_indices = page_indices,
|
447 |
+
scale = 300/72, # 300dpi resolution
|
448 |
+
))
|
449 |
+
|
450 |
num_imgs = len(images)
|
451 |
msg = f'The PDF "{filename}" was converted into {num_imgs} images.'
|
452 |
except:
|