Spaces:

GAS17
/

pdfextract

Runtime error

pdfextract / app.py

Update app.py

41311bb verified 6 months ago

1.46 kB

	import gradio as gr
	from doctr.io import DocumentFile
	from doctr.models import ocr_predictor
	import fitz # PyMuPDF
	import io
	from PIL import Image

	# Build the doctr OCR predictor with pretrained weights once at import time;
	# perform_ocr() reuses this single module-level instance for every request.
	model = ocr_predictor(pretrained=True)

	def perform_ocr(file):
	    """Run OCR on an uploaded PDF or image and return the recognized text.

	    Args:
	        file: The upload handed over by Gradio. Either a filesystem path
	            string or an object exposing that path as ``.name``; ``None``
	            when nothing was uploaded.

	    Returns:
	        The text rendered by the OCR model. For PDFs, the text of
	        consecutive pages is separated by a blank line and surrounding
	        whitespace is stripped. An empty string when ``file`` is ``None``.
	    """
	    if file is None:
	        # Nothing uploaded: return empty text instead of failing on `.name`.
	        return ""

	    # Accept both a plain path string and a file wrapper carrying `.name`.
	    path = file if isinstance(file, str) else file.name

	    if not path.lower().endswith('.pdf'):
	        # Any non-PDF upload is handed to doctr as an image file.
	        doc = DocumentFile.from_images(path)
	        return model(doc).render()

	    page_texts = []
	    # The context manager closes the PDF even if rendering or OCR raises.
	    with fitz.open(path) as pdf_document:
	        for page in pdf_document:
	            # Rasterize the page and encode it straight to PNG bytes,
	            # which is the in-memory form doctr accepts for images.
	            png_bytes = page.get_pixmap().tobytes("png")
	            doc = DocumentFile.from_images(png_bytes)
	            page_texts.append(model(doc).render())

	    # Blank line between pages; strip leading/trailing whitespace overall.
	    return "\n\n".join(page_texts).strip()

	# Gradio UI: a single file-upload input wired to perform_ocr, with the
	# extracted text shown as plain text output.
	iface = gr.Interface(
	    title="OCR with doctr (PDF and Images)",
	    description="Upload a PDF file or an image to extract text using OCR.",
	    fn=perform_ocr,
	    inputs=gr.File(label="Upload PDF or Image"),
	    outputs="text",
	)

	# Start serving the app.
	iface.launch()