Spaces:

CultriX
/

Easy-OCR

Running on Zero

App Files Files Community

Easy-OCR / app.py

CultriX

First Commit

1ea570a verified 3 months ago

raw

history blame

2.58 kB


	"""
	ZeroGPU‑ready OCR PDF extractor for HuggingFace Spaces
	-----------------------------------------------------
	• Uses @spaces.GPU to request a GPU only while needed (ZeroGPU compatible)
	• Extracts native text with `pdfplumber`
	• Runs GPU‑accelerated OCR on page images with `EasyOCR`
	"""

	import gradio as gr
	import fitz # PyMuPDF
	import pdfplumber
	import easyocr
	import torch
	import tempfile
	import os
	import spaces # <-- ZeroGPU decorator

	# Shared EasyOCR reader. It starts as None and is created on the first
	# extract_text() call (i.e. inside the @spaces.GPU-decorated function), then
	# reused by later calls.
	READER = None
	# Language codes handed to easyocr.Reader; add more codes as desired.
	LANGS = ['en']

	@spaces.GPU(duration=600)  # request a GPU for up to 10 min per call
	def extract_text(pdf_file):
	    """Extract text (native + OCR) from an uploaded PDF.

	    The PDF is processed in two passes:

	    1. ``pdfplumber`` pulls the embedded (native) text of each page.
	    2. Every page is rendered to an image with PyMuPDF and run through
	       EasyOCR.

	    Args:
	        pdf_file: The upload handed over by Gradio. Either an object whose
	            ``.name`` attribute is the path of the PDF on disk (what the
	            original code relied on) or a plain ``str`` / ``os.PathLike``
	            path. ``None`` (nothing uploaded) is tolerated.

	    Returns:
	        str: All non-empty native chunks followed by all non-empty OCR
	        chunks, each introduced by a ``--- Page N (...) ---`` header, or a
	        warning message when no file was supplied or no text was found.
	    """
	    global READER

	    # Submitting the form without a file passes None; answer with a message
	    # instead of failing with AttributeError.
	    if pdf_file is None:
	        return "⚠️ Please upload a PDF file first."

	    # Check for real paths first: pathlib.Path also has a ``.name``, but there
	    # it is only the base name, not the full path.
	    if isinstance(pdf_file, (str, os.PathLike)):
	        pdf_path = os.fspath(pdf_file)
	    else:
	        pdf_path = pdf_file.name

	    # Initialise the EasyOCR reader lazily, once we are inside the GPU call.
	    if READER is None:
	        READER = easyocr.Reader(LANGS, gpu=torch.cuda.is_available())

	    # Pass 1 — native text via pdfplumber
	    native_chunks = []
	    with pdfplumber.open(pdf_path) as pdf:
	        for idx, page in enumerate(pdf.pages, start=1):
	            txt = page.extract_text() or ""
	            if txt.strip():
	                native_chunks.append(f"--- Page {idx} (native) ---\n{txt}\n")

	    # Pass 2 — OCR each rendered page image with PyMuPDF + EasyOCR
	    ocr_chunks = []
	    # 2x zoom over PyMuPDF's 72 dpi base resolution, i.e. ~144 dpi.
	    zoom = fitz.Matrix(2, 2)
	    # A private temporary directory keeps concurrent requests from clobbering
	    # each other's page images and is removed even if OCR raises; the
	    # document is closed on exit as well.
	    with tempfile.TemporaryDirectory() as tmp_dir, fitz.open(pdf_path) as doc:
	        tmp_path = os.path.join(tmp_dir, "page.png")
	        for idx, page in enumerate(doc, start=1):
	            # The same file is overwritten for every page to bound disk use.
	            page.get_pixmap(matrix=zoom).save(tmp_path)
	            ocr_result = READER.readtext(tmp_path, detail=0)

	            if any(line.strip() for line in ocr_result):
	                ocr_text = "\n".join(ocr_result)
	                ocr_chunks.append(f"--- Page {idx} (OCR) ---\n{ocr_text}\n")

	    combined = "\n".join(native_chunks + ocr_chunks)
	    return combined or "⚠️ No text detected in the document."

	# User-facing blurb, passed to the Gradio interface as its description.
	DESCRIPTION = " ".join(
	    (
	        "Drop a PDF to extract all text.",
	        "Native PDF text is captured first; any remaining text in images is",
	        "recognized using EasyOCR. On ZeroGPU hardware, the app requests a",
	        "GPU only while OCR is running.",
	    )
	)

	# UI components: one PDF upload as input, one copyable text box as output.
	_pdf_input = gr.File(label="Upload PDF")
	_text_output = gr.Textbox(label="Extracted Text", show_copy_button=True)

	# Wire the components to extract_text; flagging is turned off and no
	# examples are provided.
	iface = gr.Interface(
	    fn=extract_text,
	    inputs=_pdf_input,
	    outputs=_text_output,
	    title="ZeroGPU OCR PDF Extractor",
	    description=DESCRIPTION,
	    examples=None,
	    theme="default",
	    allow_flagging="never",
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()