|
|
|
""" |
|
📚 ZeroGPU Multilingual PDF Text Extractor |
|
========================================= |
|
|
|
Features |
|
-------- |
|
• **Native / OCR / Hybrid** modes |
|
• **Language chooser** (multiselect) with EasyOCR model caching |
|
• **ZeroGPU** pay‑as‑you‑go: GPU allocated *only* while OCR runs |
|
• **Streamed output** page‑by‑page + real‑time progress bar |
|
• **Download‑as‑TXT** button |
|
• Basic **error handling** (oversize PDF, CUDA OOM, unsupported language) |
|
|
|
Maintained as a single file (`app.py`) for simplicity. |
|
""" |
|
|
|
import os, tempfile, concurrent.futures, itertools, functools, uuid |
|
from typing import List, Tuple |
|
|
|
import fitz |
|
import pdfplumber |
|
import torch |
|
import gradio as gr |
|
import spaces |
|
import easyocr |
|
|
|
|
|
|
|
|
|
# Cache of constructed EasyOCR readers, keyed by sorted language tuple.
_READERS = {}


def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
    """Return a cached EasyOCR reader for the given language set.

    Readers are keyed by the *sorted* language tuple, so e.g. ("en", "nl")
    and ("nl", "en") share one model instance. Construction errors from
    EasyOCR (such as an unsupported language combination) are surfaced to
    the UI as a ``gr.Error``.
    """
    cache_key = tuple(sorted(lang_codes))
    reader = _READERS.get(cache_key)
    if reader is None:
        try:
            reader = easyocr.Reader(list(cache_key), gpu=torch.cuda.is_available())
        except ValueError as err:
            # EasyOCR raises ValueError for unknown/incompatible language codes.
            raise gr.Error(str(err))
        _READERS[cache_key] = reader
    return reader
|
|
|
|
|
|
|
|
|
|
|
@spaces.GPU(duration=600)
def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
    """OCR designated pages and return list[(page_num, text)], sorted by page.

    Each requested page is rendered to a temporary PNG with PyMuPDF and fed
    to EasyOCR. Up to four pages are processed concurrently. The ZeroGPU
    decorator holds the GPU only for the duration of this call.

    Parameters
    ----------
    pdf_path   : path of the PDF on disk.
    page_ids   : 1-based page numbers to OCR.
    lang_codes : EasyOCR language codes (see ``get_reader``).
    """
    reader = get_reader(lang_codes)
    doc = fitz.open(pdf_path)
    results: List[Tuple[int, str]] = []

    def ocr_single(idx: int) -> Tuple[int, str]:
        # page_ids are 1-based; fitz pages are 0-based.
        pg = doc[idx - 1]

        # Upscale small pages more aggressively to help OCR accuracy.
        max_side = max(pg.rect.width, pg.rect.height)
        scale = 2 if max_side <= 600 else 1.5
        try:
            pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale))
        except RuntimeError:
            # Scaled render failed (e.g. very large page); fall back to 1x.
            pix = pg.get_pixmap()

        img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
        pix.save(img_path)

        try:
            if len(lang_codes) == 1:
                # Single language: keep only reasonably confident detections.
                detections = reader.readtext(img_path, detail=1)
                txt_lines = [text for _, text, conf in detections if conf > 0.2]
            else:
                # Multi-language: detail=0 returns plain strings (no confidence
                # values to filter on).
                txt_lines = reader.readtext(img_path, detail=0)
        finally:
            # Remove the temp image even if OCR raises (was leaked before).
            os.remove(img_path)
        return idx, "\n".join(txt_lines)

    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
            futures = {ex.submit(ocr_single, i): i for i in page_ids}
            for fut in concurrent.futures.as_completed(futures):
                results.append(fut.result())
    finally:
        # Close the PyMuPDF document handle (was leaked before).
        doc.close()

    # Deterministic page order regardless of thread completion order.
    results.sort(key=lambda item: item[0])
    return results
|
|
|
|
|
|
|
|
|
|
|
def extract_native(pdf_path: str, x_tol: float = 1) -> List[Tuple[int, str]]:
    """Extract each page's embedded text layer via pdfplumber.

    Returns one ``(page_number, text)`` pair per page (1-based numbering);
    pages with no extractable text yield an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        return [
            (page_no, page.extract_text(x_tolerance=x_tol) or "")
            for page_no, page in enumerate(pdf.pages, start=1)
        ]
|
|
|
|
|
|
|
|
|
|
|
def pipeline(pdf_file, langs, mode):
    """Stream extracted text for the UI, yielding ``(text_so_far, txt_path)``.

    Intermediate yields carry ``None`` as the download path; the final yield
    supplies the path of the accumulated ``.txt`` file for the download button.

    Parameters
    ----------
    pdf_file : Gradio file object with a ``.name`` path attribute.
    langs    : language code or list of codes for OCR.
    mode     : "native" (text layer only), "ocr" (OCR every page) or
               "auto" (native first, OCR fallback per page).

    Raises
    ------
    gr.Error : missing upload or PDF larger than 200 MB.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF.")

    # Reject huge uploads before doing any expensive work.
    max_size = 200 * 1024 * 1024
    if os.path.getsize(pdf_file.name) > max_size:
        raise gr.Error("PDF larger than 200 MB. Please split the document.")

    langs = langs if isinstance(langs, list) else [langs]
    lang_tuple = tuple(langs)

    native_chunks, ocr_chunks = [], []
    combined_text = ""

    # Accumulate text in a temp file that backs the download button.
    tmp_txt = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
    tmp_txt_path = tmp_txt.name

    try:
        # NOTE(review): gr.Progress is used as a context manager with manual
        # tqdm/update calls — confirm this matches the installed Gradio
        # version's Progress API.
        with gr.Progress(track_tqdm=False) as prog:
            native_pages = extract_native(pdf_file.name) if mode in ("native", "auto") else []
            if native_pages:
                total_pages = len(native_pages)
            else:
                # Close the document promptly instead of leaking the handle.
                with fitz.open(pdf_file.name) as doc:
                    total_pages = doc.page_count
            prog.tqdm(total=total_pages)

            pending_ocr = []  # pages with no usable text layer, OCR'd in one batch

            for idx in range(1, total_pages + 1):
                native_txt = ""
                if mode in ("native", "auto"):
                    native_txt = native_pages[idx - 1][1]

                if native_txt.strip():
                    chunk = f"--- Page {idx} (native) ---\n{native_txt}\n"
                    native_chunks.append(chunk)
                    combined_text += chunk
                    tmp_txt.write(chunk.encode("utf-8"))
                    yield combined_text, None
                elif mode in ("auto", "ocr"):
                    # Both modes send text-less pages to the OCR batch;
                    # plain "native" mode simply skips them.
                    pending_ocr.append(idx)
                prog.update(advance=1)

            if pending_ocr:
                try:
                    ocr_results = run_ocr(pdf_file.name, pending_ocr, lang_tuple)
                except RuntimeError:
                    # Likely a CUDA OOM: free cached GPU memory, then retry
                    # once (previously the retry repeated the identical call
                    # with no remediation).
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    ocr_results = run_ocr(pdf_file.name, pending_ocr, lang_tuple)

                for idx, text in sorted(ocr_results, key=lambda x: x[0]):
                    if text.strip():
                        chunk = f"--- Page {idx} (OCR) ---\n{text}\n"
                        ocr_chunks.append(chunk)
                        combined_text += chunk
                        tmp_txt.write(chunk.encode("utf-8"))
                        yield combined_text, None
    finally:
        # Close the handle even if extraction fails or the stream is abandoned.
        tmp_txt.close()

    yield combined_text or "⚠️ No text detected in the document.", tmp_txt_path
|
|
|
|
|
|
|
|
|
|
|
# Shared Gradio theme applied to the whole Blocks app below.
THEME = gr.themes.Base(
    primary_hue="purple",
    radius_size=gr.themes.sizes.radius_xl,
    spacing_size=gr.themes.sizes.spacing_md,
)

# Sample PDFs offered in the gr.Examples panel of the UI.
EXAMPLE_URLS = [
    "https://arxiv.org/pdf/2106.14834.pdf",
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
]
|
|
|
# --- Gradio UI wiring ---------------------------------------------------
with gr.Blocks(theme=THEME, title="ZeroGPU PDF OCR") as demo:
    gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")

    with gr.Row():
        # Left column: upload + OCR controls.
        with gr.Column(scale=1, min_width=250):
            file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
            # Language codes are passed straight through to easyocr.Reader.
            lang_in = gr.Dropdown(
                ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "zh_cn", "ja", "ar"],
                multiselect=True,
                value=["en"],
                label="OCR language(s)"
            )
            mode_in = gr.Radio(
                ["native", "ocr", "auto"],
                value="auto",
                label="Document type",
                info="native = text only · ocr = images only · auto = mixed",
            )
            run_btn = gr.Button("Extract", variant="primary")

        # Right column: streamed text output + downloadable .txt file.
        with gr.Column(scale=2):
            txt_out = gr.Textbox(
                label="Extracted Text (streaming)",
                lines=18,
                show_copy_button=True,
            )
            download_out = gr.File(label="Download .txt")

    # pipeline is a generator: the textbox updates page-by-page and the
    # download file arrives with the final yield.
    run_btn.click(
        fn=pipeline,
        inputs=[file_in, lang_in, mode_in],
        outputs=[txt_out, download_out],
    )

    # NOTE(review): examples are bare URLs fed to a gr.File input — confirm
    # the installed Gradio version accepts remote URLs as File examples.
    gr.Examples(
        EXAMPLE_URLS,
        inputs=file_in,
        label="Quick‑test PDFs",
        fn=None,
    )

if __name__ == "__main__":
    demo.launch()
|
|