pierreguillou committed on
Commit
c41516f
·
1 Parent(s): ce38f80

Update files/functions.py

Browse files
Files changed (1) hide show
  1. files/functions.py +16 -5
files/functions.py CHANGED
@@ -25,8 +25,7 @@ import pypdf
25
  from pypdf import PdfReader
26
  from pypdf.errors import PdfReadError
27
 
28
- import pdf2image
29
- from pdf2image import convert_from_path
30
  import langdetect
31
  from langdetect import detect_langs
32
 
@@ -109,7 +108,7 @@ from huggingface_hub import hf_hub_download
109
  files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
110
  for file_name in files:
111
  path_to_file = hf_hub_download(
112
- repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-LiLT-base-LayoutXLM-base-v1",
113
  filename = "files/" + file_name,
114
  repo_type = "space"
115
  )
@@ -424,7 +423,7 @@ def pdf_to_images(uploaded_pdf):
424
  else:
425
  # path to the uploaded PDF
426
  path_to_file = uploaded_pdf.name
427
- filename = path_to_file.replace("/tmp/","")
428
 
429
  try:
430
  PdfReader(path_to_file)
@@ -435,7 +434,19 @@ def pdf_to_images(uploaded_pdf):
435
  images = [Image.open(image_blank)]
436
  else:
437
  try:
438
- images = convert_from_path(path_to_file, last_page=max_imgboxes)
 
 
 
 
 
 
 
 
 
 
 
 
439
  num_imgs = len(images)
440
  msg = f'The PDF "{filename}" was converted into {num_imgs} images.'
441
  except:
 
25
  from pypdf import PdfReader
26
  from pypdf.errors import PdfReadError
27
 
28
+ import pypdfium2 as pdfium
 
29
  import langdetect
30
  from langdetect import detect_langs
31
 
 
108
  files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
109
  for file_name in files:
110
  path_to_file = hf_hub_download(
111
+ repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v3",
112
  filename = "files/" + file_name,
113
  repo_type = "space"
114
  )
 
423
  else:
424
  # path to the uploaded PDF
425
  path_to_file = uploaded_pdf.name
426
+ filename = path_to_file# .replace("/tmp/","")
427
 
428
  try:
429
  PdfReader(path_to_file)
 
434
  images = [Image.open(image_blank)]
435
  else:
436
  try:
437
+ # images = convert_from_path(path_to_file, last_page=max_imgboxes)
438
+
439
+ pdf = pdfium.PdfDocument(str(filename))
440
+ version = pdf.get_version() # get the PDF standard version
441
+ n_pages = len(pdf) # get the number of pages in the document
442
+ last_page = max_imgboxes
443
+ page_indices = [i for i in range(last_page)] # pages until last_page
444
+ images = list(pdf.render(
445
+ pdfium.PdfBitmap.to_pil,
446
+ page_indices = page_indices,
447
+ scale = 300/72, # 300dpi resolution
448
+ ))
449
+
450
  num_imgs = len(images)
451
  msg = f'The PDF "{filename}" was converted into {num_imgs} images.'
452
  except: