|
|
|
"""
ZeroGPU-ready OCR PDF extractor for HuggingFace Spaces
------------------------------------------------------
• Uses @spaces.GPU to request a GPU only while needed (ZeroGPU compatible)
• Extracts native text with `pdfplumber`
• Runs GPU-accelerated OCR on page images with `EasyOCR`
"""
|
|
|
import gradio as gr |
|
import fitz |
|
import pdfplumber |
|
import easyocr |
|
import torch |
|
import tempfile |
|
import os |
|
import spaces |
|
|
|
|
|
# Process-wide cache for the EasyOCR reader. It starts empty and is filled in
# by the first call to extract_text(), so the OCR model is built only once.
READER = None

# Language codes handed to easyocr.Reader when the reader is created.
LANGS = ['en']
|
|
|
@spaces.GPU(duration=600)
def extract_text(pdf_file):
    """Extract text (native + OCR) from an uploaded PDF.

    Native text is read page by page with pdfplumber. Every page is then
    rendered to an image with PyMuPDF and passed through EasyOCR. Native
    chunks come first in the result, followed by the OCR chunks.

    Args:
        pdf_file: The uploaded file as delivered by the Gradio ``File``
            input. Accepts either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means no file was uploaded.

    Returns:
        str: The concatenated page chunks, each prefixed with a
        ``--- Page N (native|OCR) ---`` header, or a warning message when
        no file was given or no text was found.
    """
    global READER

    # Bail out before loading the OCR model when there is nothing to process.
    if pdf_file is None:
        return "⚠️ Please upload a PDF file first."

    # Depending on the Gradio version/configuration the upload arrives either
    # as a plain path or as a temp-file wrapper with a ``.name`` attribute.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # Build the reader lazily so the model is created once and then reused.
    if READER is None:
        READER = easyocr.Reader(LANGS, gpu=torch.cuda.is_available())

    # Pass 1: text embedded in the PDF itself.
    native_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for idx, page in enumerate(pdf.pages, start=1):
            txt = page.extract_text() or ""
            if txt.strip():
                native_chunks.append(f"--- Page {idx} (native) ---\n{txt}\n")

    # Pass 2: OCR on rendered page images. A private temporary directory
    # keeps concurrent requests from clobbering each other's page files, and
    # the context managers guarantee the document is closed and the images
    # are removed even if rendering or OCR raises.
    ocr_chunks = []
    with tempfile.TemporaryDirectory() as tmp_dir, fitz.open(pdf_path) as doc:
        for idx, page in enumerate(doc, start=1):
            # Matrix(2, 2) scales both axes by 2 for a larger OCR input.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            tmp_path = os.path.join(tmp_dir, f"page_{idx}.png")
            pix.save(tmp_path)

            try:
                # detail=0 yields a plain list of recognised strings.
                ocr_result = READER.readtext(tmp_path, detail=0)
            finally:
                # Drop each image right away to bound disk use on long PDFs.
                os.remove(tmp_path)

            if any(line.strip() for line in ocr_result):
                ocr_text = "\n".join(ocr_result)
                ocr_chunks.append(f"--- Page {idx} (OCR) ---\n{ocr_text}\n")

    combined = "\n".join(native_chunks + ocr_chunks)
    return combined or "⚠️ No text detected in the document."
|
|
|
# Markdown blurb shown under the title of the Gradio interface.
DESCRIPTION = (
    "Drop a PDF to extract **all** text. "
    "Native PDF text is captured first; any remaining text in images is "
    "recognized using EasyOCR. On ZeroGPU hardware, the app requests a "
    "GPU *only* while OCR is running."
)
|
|
|
# Single-function UI: one PDF upload in, one copyable text box out.
iface = gr.Interface(
    fn=extract_text,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Extracted Text", show_copy_button=True),
    title="ZeroGPU OCR PDF Extractor",
    description=DESCRIPTION,
    allow_flagging="never",
    examples=None,
    theme="default",
)
|
|
|
# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|
|