File size: 2,579 Bytes
1ea570a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
"""
ZeroGPU‑ready OCR PDF extractor for HuggingFace Spaces
-----------------------------------------------------
• Uses @spaces.GPU to request a GPU only while needed (ZeroGPU compatible)
• Extracts native text with `pdfplumber`
• Runs GPU‑accelerated OCR on page images with `EasyOCR`
"""
import gradio as gr
import fitz # PyMuPDF
import pdfplumber
import easyocr
import torch
import tempfile
import os
import spaces # <-- ZeroGPU decorator
# OCR language codes handed to EasyOCR; extend the list to recognise more.
LANGS = ["en"]
# Shared EasyOCR reader. Stays unset at import time and is created on the
# first extraction call, i.e. only once a GPU has been allocated.
READER = None
@spaces.GPU(duration=600)  # request a GPU for up to 10 min per call
def extract_text(pdf_file):
    """Extract text (native + OCR) from an uploaded PDF.

    Args:
        pdf_file: The upload handed over by the Gradio ``File`` input. Either
            an object exposing the file path as ``.name`` or a plain path
            string; ``None`` when nothing was uploaded.

    Returns:
        One string containing a ``--- Page N (native) ---`` section for every
        page with embedded text, followed by a ``--- Page N (OCR) ---``
        section for every page where OCR found text. A warning message is
        returned instead when no file was supplied or no text was detected.
    """
    global READER

    # Submitting the form without a file passes None; fail gracefully.
    if pdf_file is None:
        return "⚠️ Please upload a PDF file."
    # Accept both file-like wrappers (``.name``) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Initialise the EasyOCR reader lazily, after the GPU becomes available.
    if READER is None:
        READER = easyocr.Reader(LANGS, gpu=torch.cuda.is_available())

    native_chunks = []
    ocr_chunks = []

    # Pass 1 — native text via pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        for idx, page in enumerate(pdf.pages, start=1):
            txt = page.extract_text() or ""
            if txt.strip():
                native_chunks.append(f"--- Page {idx} (native) ---\n{txt}\n")

    # Pass 2 — OCR each rendered page image with PyMuPDF + EasyOCR.
    # The document is closed on exit, and page images live in a private
    # per-call directory so concurrent requests cannot overwrite each other's
    # files and nothing is left behind if rendering or OCR raises.
    with fitz.open(pdf_path) as doc, tempfile.TemporaryDirectory() as tmp_dir:
        # 2x zoom over PyMuPDF's base resolution (72 dpi -> ~144 dpi).
        zoom = fitz.Matrix(2, 2)
        for idx, page in enumerate(doc, start=1):
            pix = page.get_pixmap(matrix=zoom)
            tmp_path = os.path.join(tmp_dir, f"page_{idx}.png")
            pix.save(tmp_path)
            ocr_result = READER.readtext(tmp_path, detail=0)
            # Drop the image right away to keep disk usage to one page.
            os.remove(tmp_path)
            if any(line.strip() for line in ocr_result):
                ocr_text = "\n".join(ocr_result)
                ocr_chunks.append(f"--- Page {idx} (OCR) ---\n{ocr_text}\n")

    combined = "\n".join(native_chunks + ocr_chunks)
    return combined or "⚠️ No text detected in the document."
# Markdown blurb shown under the app title; one entry per sentence.
DESCRIPTION = " ".join(
    (
        "Drop a PDF to extract **all** text.",
        "Native PDF text is captured first; any remaining text in images is"
        " recognized using EasyOCR.",
        "On ZeroGPU hardware, the app requests a GPU *only* while OCR is"
        " running.",
    )
)
# Single-function UI: one PDF upload in, the extracted text out.
iface = gr.Interface(
    fn=extract_text,
    # Limit the file picker to PDFs, since the extractor expects PDF input.
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Extracted Text", show_copy_button=True),
    title="ZeroGPU OCR PDF Extractor",
    description=DESCRIPTION,
    allow_flagging="never",
    examples=None,
    theme="default",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
|