File size: 2,579 Bytes
1ea570a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

"""
ZeroGPU‑ready OCR PDF extractor for HuggingFace Spaces
-----------------------------------------------------
• Uses @spaces.GPU to request a GPU only while needed (ZeroGPU compatible)
• Extracts native text with `pdfplumber`
• Runs GPU‑accelerated OCR on page images with `EasyOCR`
"""

import gradio as gr
import fitz  # PyMuPDF
import pdfplumber
import easyocr
import torch
import tempfile
import os
import spaces  # <-- ZeroGPU decorator

# OCR language codes handed to EasyOCR; extend the list to recognise more.
LANGS = ["en"]
# Shared EasyOCR reader, created lazily on the first extract_text call.
READER = None

@spaces.GPU(duration=600)  # request a GPU for up to 10 min per call
def extract_text(pdf_file):
    """Extract native and OCR text from an uploaded PDF.

    Args:
        pdf_file: The upload handed over by the Gradio ``File`` input. May be
            an object exposing the file path as ``.name``, a plain path
            string / ``os.PathLike``, or ``None`` when nothing was uploaded.

    Returns:
        A string made of one ``--- Page N (native) ---`` section per page
        that has embedded text, followed by one ``--- Page N (OCR) ---``
        section per page where OCR found text. A short warning message is
        returned instead when no file was given or no text was found.
    """
    global READER

    # Gradio calls the handler with None when the user submits without a file;
    # answer with a message instead of failing on `None.name`.
    if pdf_file is None:
        return "⚠️ Please upload a PDF file first."

    # Accept both upload wrappers (path stored in `.name`) and bare paths.
    # PathLike is checked first because `Path.name` is only the basename.
    if isinstance(pdf_file, os.PathLike):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = getattr(pdf_file, "name", pdf_file)

    # Initialise EasyOCR reader after GPU becomes available
    if READER is None:
        READER = easyocr.Reader(LANGS, gpu=torch.cuda.is_available())

    native_chunks = []
    ocr_chunks = []

    # Pass 1 — native text via pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        for idx, page in enumerate(pdf.pages, start=1):
            txt = page.extract_text() or ""
            if txt.strip():
                native_chunks.append(f"--- Page {idx} (native) ---\n{txt}\n")

    # Pass 2 — OCR each rendered page image with PyMuPDF + EasyOCR.
    # The document is closed on exit, and page images live in a private
    # per-call directory so concurrent requests cannot clobber each other.
    with fitz.open(pdf_path) as doc, tempfile.TemporaryDirectory() as tmp_dir:
        for idx, page in enumerate(doc, start=1):
            # Render at 2x zoom: PDF user space is 72 dpi, so this is ~144 dpi.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            tmp_path = os.path.join(tmp_dir, f"page_{idx}.png")
            pix.save(tmp_path)

            try:
                ocr_result = READER.readtext(tmp_path, detail=0)
            finally:
                # Drop the image right away, even if OCR failed, to keep
                # disk usage at one page at a time.
                os.remove(tmp_path)

            if any(line.strip() for line in ocr_result):
                ocr_text = "\n".join(ocr_result)
                ocr_chunks.append(f"--- Page {idx} (OCR) ---\n{ocr_text}\n")

    combined = "\n".join(native_chunks + ocr_chunks)
    return combined or "⚠️ No text detected in the document."

# Markdown blurb rendered under the app title in the Gradio interface.
DESCRIPTION = " ".join(
    [
        "Drop a PDF to extract **all** text.",
        "Native PDF text is captured first;",
        "any remaining text in images is recognized using EasyOCR.",
        "On ZeroGPU hardware, the app requests a GPU *only* while OCR is running.",
    ]
)

# Gradio UI: one PDF upload feeding extract_text, shown in a copyable textbox.
_pdf_upload = gr.File(label="Upload PDF")
_text_output = gr.Textbox(label="Extracted Text", show_copy_button=True)

iface = gr.Interface(
    fn=extract_text,
    inputs=_pdf_upload,
    outputs=_text_output,
    title="ZeroGPU OCR PDF Extractor",
    description=DESCRIPTION,
    allow_flagging="never",
    examples=None,
    theme="default",
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()