|
import fitz |
|
|
|
from PIL import Image |
|
from langchain_community.document_loaders import PyMuPDFLoader |
|
|
|
|
|
class ImagePDFLoader(PyMuPDFLoader):
    """Loader that rasterizes each page of a PDF into a PIL image.

    ``load`` returns one ``PIL.Image.Image`` per page, in page order.
    Pages are rendered at ``DEFAULT_DPI``; a page whose rendered width or
    height exceeds ``MAX_SIDE_PX`` is re-rendered at the identity scale
    instead, which keeps very large pages from producing huge bitmaps.
    """

    # Resolution requested by ``load`` for the first render attempt.
    DEFAULT_DPI = 250
    # Largest width/height (in pixels) accepted from the high-DPI render.
    MAX_SIDE_PX = 3000

    def load_pdf_page(self, page: fitz.Page, dpi: int) -> Image.Image:
        """Render a single page to an RGB image.

        Args:
            page: The PyMuPDF page to rasterize.
            dpi: Target resolution for the first render attempt.

        Returns:
            The page as an RGB ``PIL.Image.Image``. If rendering at ``dpi``
            yields a pixmap wider or taller than ``MAX_SIDE_PX``, the image
            comes from a render with ``fitz.Matrix(1, 1)`` instead.
        """
        # The scale factor is expressed relative to 72, the base resolution
        # the original code divides by.
        zoom = dpi / 72
        # alpha=False is requested explicitly on both renders so the samples
        # are always 3 bytes per pixel, matching the "RGB" decode below.
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

        if pix.width > self.MAX_SIDE_PX or pix.height > self.MAX_SIDE_PX:
            # Too large at the requested DPI: fall back to the unscaled render.
            pix = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

        # Build the PIL image only once, from whichever pixmap was kept.
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    def load(self) -> list[Image.Image]:
        """Render every page of ``self.file_path`` to an image.

        Returns:
            A list with one RGB image per page, in document order (empty
            when the document has no pages).
        """
        # The context manager closes the document even if a page fails to
        # render; the original left the handle open.
        with fitz.open(self.file_path) as doc:
            return [
                self.load_pdf_page(page, dpi=self.DEFAULT_DPI) for page in doc
            ]
|
|