pdfextract / app.py
GAS17's picture
Update app.py
41311bb verified
raw
history blame
1.46 kB
import gradio as gr
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import fitz # PyMuPDF
import io
from PIL import Image
# Load the pretrained docTR OCR predictor once, at import time, so that every
# perform_ocr call reuses the same model instance instead of rebuilding it.
model = ocr_predictor(pretrained=True)
def perform_ocr(file):
    """Extract text from an uploaded PDF or image with the docTR model.

    Args:
        file: The upload handed over by the Gradio ``File`` input. Either an
            object exposing its on-disk path as ``.name`` or a plain path
            string; ``None`` when nothing was uploaded.

    Returns:
        The recognised text. For a PDF, each page is OCR'd separately, the
        page texts are joined with a blank line and surrounding whitespace
        is stripped. For any other file the rendered text of the image is
        returned unchanged. An empty string is returned when ``file`` is
        ``None``.
    """
    if file is None:
        # Submitting without an upload passes None; return no text instead
        # of failing on the path lookup below.
        return ""

    # Accept both wrapper objects (path stored in ``.name``) and bare path
    # strings, so the function works regardless of how the path is delivered.
    path = file if isinstance(file, str) else file.name

    if not path.lower().endswith(".pdf"):
        # Anything that is not a PDF is handed to docTR as an image file.
        return model(DocumentFile.from_images(path)).render()

    page_texts = []
    # The context manager closes the PDF even if OCR fails part-way through.
    with fitz.open(path) as pdf_document:
        for page in pdf_document:
            # Rasterise the page and encode it straight to PNG bytes, which
            # DocumentFile.from_images accepts directly.
            # NOTE(review): get_pixmap() uses PyMuPDF's default rendering
            # settings; consider a higher resolution if small text is
            # recognised poorly.
            png_bytes = page.get_pixmap().tobytes("png")
            page_doc = DocumentFile.from_images(png_bytes)
            page_texts.append(model(page_doc).render())

    # A blank line separates consecutive pages.
    return "\n\n".join(page_texts).strip()
# Build the web UI: a single file-upload input wired to perform_ocr, with the
# extracted text shown in a plain text output.
iface = gr.Interface(
    perform_ocr,
    gr.File(label="Upload PDF or Image"),
    "text",
    title="OCR with doctr (PDF and Images)",
    description="Upload a PDF file or an image to extract text using OCR.",
)

# Start serving the interface.
iface.launch()