File size: 5,581 Bytes
1ceb840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import os
from io import BytesIO
from tqdm import tqdm
import numpy as np
from typing import Callable, Dict, List  # , Literal, NamedTuple, Optional, Tuple, Union
from PIL import Image as PIL_Image
from PIL.Image import Image

from datasets import logging

logger = logging.get_logger(__name__)
import PyPDF2

# Upper bound on how many images are collected for a single document.
MAX_PAGES = 50
# Inputs whose byte length exceeds this value are skipped (100 MB, decimal).
MAX_PDF_SIZE = 100_000_000
# Size thresholds (PIL width/height) used when filtering extracted images.
MIN_WIDTH = 150
MIN_HEIGHT = 150
import pdf2image


def pdf2image_image_extraction(pdf_stream):
    """Convert a PDF byte stream into PIL images using ``pdf2image``.

    Args:
        pdf_stream: Raw PDF content, forwarded unchanged to
            ``pdf2image.convert_from_bytes``.

    Returns:
        The list of images returned by ``pdf2image``, or ``None`` when the
        conversion raised. The exception is logged as a warning and is not
        propagated to the caller.
    """
    try:
        rendered: List[Image] = pdf2image.convert_from_bytes(pdf_stream)
    except Exception as err:
        # Best-effort helper: report the failure and let the caller decide.
        logger.warning(f"{err}")
        return None
    else:
        return rendered


def pdf_to_pixelvalues_extractor(example, feature_extractor, inference_method):
    """Fill ``example`` with pixel values taken from the images inside a PDF.

    Embedded images are first read with PyPDF2 (``page.images``). When that
    yields nothing usable, or raises, the whole document is rendered with
    ``pdf2image_image_extraction`` as a fallback.

    Args:
        example: Mapping with the raw PDF bytes under ``"file"`` (and
            optionally ``"id"``, used only in diagnostics). It is mutated in
            place: ``"pages"`` and ``"pixel_values"`` are always written.
        feature_extractor: Callable invoked as
            ``feature_extractor([rgb_image])["pixel_values"][0]``.
        inference_method: Object exposing a string ``scope`` and a
            ``get_page_scope(sequence)`` method. For scope ``"sample-grid"``
            the result of ``get_page_scope`` must support ``.convert("RGB")``
            (presumably a PIL image -- confirm against the implementation).

    Returns:
        The same ``example``. On any failure path ``"pixel_values"`` stays
        ``None`` and ``"pages"`` is ``0`` (except when only the embedded-image
        pass was attempted and the reader itself could be opened); otherwise
        ``"pixel_values"`` holds ``np.array(...)`` of the extracted values.
    """
    example["pages"] = 0
    example["pixel_values"] = None
    pixel_values = []
    if len(example["file"]) > MAX_PDF_SIZE:
        logger.warning(f"too large file {len(example['file'])}")
        return example
    try:
        reader = PyPDF2.PdfReader(BytesIO(example["file"]))
    except Exception as e:
        logger.warning(f"read_pdf {e}")
        return example
    example["pages"] = len(reader.pages)

    scope = inference_method.scope
    is_grid = scope == "sample-grid"
    # "sample*" scopes other than the grid operate on one selected page only.
    if "sample" in scope and not is_grid:
        page_iterator = [inference_method.get_page_scope(reader.pages)]
    else:
        page_iterator = reader.pages

    try:
        for page in page_iterator:
            # Stop before touching further pages once the cap is reached.
            if len(pixel_values) == MAX_PAGES:
                break
            for image in page.images:
                if len(pixel_values) == MAX_PAGES:
                    break
                im = PIL_Image.open(BytesIO(image.data))
                # NOTE(review): this drops an image only when BOTH sides are
                # below the minimum, whereas nativepdf_to_pixelvalues_extractor
                # requires both sides to reach it -- confirm which is intended.
                if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
                    continue
                # Grid mode keeps raw images; the grid is featurized at the end.
                if not is_grid:
                    im = feature_extractor([im.convert("RGB")])["pixel_values"][0]
                pixel_values.append(im)
    except Exception as e:
        print(f"{example.get('id')} PyPDF get_images {e}")
        # Discard partial results so the fallback starts from a clean slate.
        pixel_values = []

    if not pixel_values:
        # at least try with another API
        try:
            images = pdf2image_image_extraction(example["file"])
        except Exception as e:
            print(f"{example.get('id')} pdf2image get_images {e}")
            images = []

        if not images:
            print(f"{example.get('id')} pdf2image has no images")
            example["pages"] = 0
            return example

        # got lucky with pdf2image
        example["pages"] = len(images)
        # NOTE(review): for single-page "sample" scopes this path does not call
        # get_page_scope; the first accepted render is used below -- confirm.
        for im in images:
            if len(pixel_values) == MAX_PAGES:
                break
            if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
                continue
            if not is_grid:
                im = feature_extractor([im.convert("RGB")])["pixel_values"][0]
            pixel_values.append(im)

    if not pixel_values:
        # Every fallback image was rejected by the size filter. Without this
        # guard the "sample" branch below would raise IndexError.
        print(f"{example.get('id')} pdf2image has no valid images")
        example["pages"] = 0
        return example

    if is_grid:
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in scope:
        pixel_values = pixel_values[0]
    example["pixel_values"] = np.array(pixel_values)
    return example


def nativepdf_to_pixelvalues_extractor(example, feature_extractor, inference_method):
    """Render a PDF with ``pdf2image`` and store its pixel values in ``example``.

    Args:
        example: Mapping with the raw PDF bytes under ``"file"`` (and
            optionally ``"id"``). Mutated in place: ``"pages"`` and
            ``"pixel_values"`` are always written.
        feature_extractor: Callable invoked as
            ``feature_extractor([rgb_image])["pixel_values"][0]``.
        inference_method: Object exposing a string ``scope`` and a
            ``get_page_scope(sequence)`` method.

    Returns:
        The same ``example``. When nothing usable could be produced,
        ``"pages"`` is ``0`` and ``"pixel_values"`` is ``None``; otherwise
        ``"pixel_values"`` is ``np.array(...)`` of the extracted values.
    """
    # Document ids that are skipped when the scope is "sample-grid".
    IMPOSSIBLE = ["6483941-Letter-to-John-Campbell.pdf", "7276809-Ocoee-Newspaper-Pages.pdf"]
    example["pages"] = 0
    example["pixel_values"] = None

    file_size = len(example["file"])
    if file_size > MAX_PDF_SIZE:
        logger.warning(f"too large file {file_size}")
        return example

    try:
        rendered = pdf2image_image_extraction(example["file"])
    except Exception as err:
        print(f"{example.get('id')} pdf2image get_images {err}")
        rendered = []

    if not rendered:
        print(f"{example.get('id')} pdf2image has no images")
        example["pages"] = 0
        return example

    # Keep only renders that reach the minimum size on both sides.
    usable = [im for im in rendered if not (im.width < MIN_WIDTH or im.height < MIN_HEIGHT)]

    if not usable or (example.get("id") in IMPOSSIBLE and inference_method.scope == "sample-grid"):
        print(f"{example.get('id')} pdf2image has no images")
        example["pages"] = 0
        return example

    example["pages"] = len(usable)
    scope = inference_method.scope
    is_grid = scope == "sample-grid"
    if "sample" in scope and not is_grid:
        # Single-page sampling: let the inference method pick the page.
        candidates = [inference_method.get_page_scope(usable)]
    else:
        candidates = usable

    pixel_values = []
    # Slicing enforces the MAX_PAGES cap on the number of collected entries.
    for page_image in candidates[:MAX_PAGES]:
        if is_grid:
            # Raw images are kept; the assembled grid is featurized below.
            pixel_values.append(page_image)
        else:
            features = feature_extractor([page_image.convert("RGB")])
            pixel_values.append(features["pixel_values"][0])

    if not pixel_values:
        print(f"{example.get('id')} pdf2image has no valid images")
        example["pages"] = 0
        return example

    if is_grid:
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in scope:
        pixel_values = pixel_values[0]
    example["pixel_values"] = np.array(pixel_values)
    return example