Spaces:

CultriX
/

Easy-OCR

Running on Zero

File size: 7,991 Bytes


"""
📚 ZeroGPU Multilingual PDF Text Extractor
=========================================

Features
--------
• **Native / OCR / Hybrid** modes  
• **Language chooser** (multiselect) with EasyOCR model caching  
• **ZeroGPU** pay‑as‑you‑go: GPU allocated *only* while OCR runs  
• **Streamed output** page‑by‑page + real‑time progress bar  
• **Download‑as‑TXT** button  
• Basic **error handling** (oversize PDF, CUDA OOM, unsupported language)

Maintained as a single file (`app.py`) for simplicity.
"""

import os, tempfile, concurrent.futures, itertools, functools, uuid
from typing import List, Tuple

import fitz  # PyMuPDF
import pdfplumber
import torch
import gradio as gr
import spaces  # HF Spaces helper (for ZeroGPU)
import easyocr

# ----------------------------------------------------------------------
# Caching for EasyOCR readers (models are heavy; reuse them)
# ----------------------------------------------------------------------
# Maps a sorted tuple of language codes to the Reader built for that set.
_READERS = {}


def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
    """Return the EasyOCR reader for *lang_codes*, building it on first use.

    The cache key is the sorted tuple of codes, so the same set of languages
    requested in a different order reuses one reader.

    Args:
        lang_codes: EasyOCR language codes to load.

    Returns:
        The cached (or newly created) ``easyocr.Reader``.

    Raises:
        gr.Error: If ``easyocr.Reader`` rejects the request with a
            ``ValueError``; the original exception is kept as ``__cause__``.
    """
    key = tuple(sorted(lang_codes))
    if key not in _READERS:
        try:
            _READERS[key] = easyocr.Reader(list(key), gpu=torch.cuda.is_available())
        except ValueError as e:
            # Chain the cause so the server log still shows EasyOCR's traceback.
            raise gr.Error(str(e)) from e
    return _READERS[key]


# ----------------------------------------------------------------------
# GPU‑decorated OCR worker (runs ONLY when called)
# ----------------------------------------------------------------------
@spaces.GPU(duration=600)
def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
    """OCR the designated pages and return ``[(page_num, text), ...]``.

    Args:
        pdf_path: Path of the PDF to rasterise.
        page_ids: 1-based page numbers to OCR.
        lang_codes: EasyOCR language codes, forwarded to ``get_reader``.

    Returns:
        One ``(page_num, text)`` pair per requested page, in completion
        order (not page order); callers that need page order must sort.
    """
    import threading  # function-scope import: the module header does not import it

    reader = get_reader(lang_codes)
    doc = fitz.open(pdf_path)
    # PyMuPDF does not support concurrent use from several threads, so every
    # access to `doc` is serialised through this lock; only OCR overlaps.
    render_lock = threading.Lock()

    def ocr_single(idx: int) -> Tuple[int, str]:
        img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
        try:
            with render_lock:
                pg = doc[idx - 1]
                # 2x zoom when the longer side is at most 600 pt, 1.5x otherwise
                # (A4 is ~595 x 842 pt, so it is rendered at 1.5x).
                max_side = max(pg.rect.width, pg.rect.height)
                scale = 2 if max_side <= 600 else 1.5
                try:
                    pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale))
                except RuntimeError:
                    # Fall back to the default resolution if the scaled render fails.
                    pix = pg.get_pixmap()
                pix.save(img_path)

            # Single language: detail=1 exposes confidences, so weak lines can be dropped.
            if len(lang_codes) == 1:
                detections = reader.readtext(img_path, detail=1)
                txt_lines = [text for _, text, conf in detections if conf > 0.2]
            else:
                txt_lines = reader.readtext(img_path, detail=0)
            return idx, "\n".join(txt_lines)
        finally:
            # Remove the page image even when rendering or OCR raised.
            try:
                os.remove(img_path)
            except FileNotFoundError:
                pass

    try:
        # Light parallelism; the executor's exit waits for every worker, so the
        # document is only closed once no thread can still touch it.
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
            futures = [ex.submit(ocr_single, i) for i in page_ids]
            return [fut.result() for fut in concurrent.futures.as_completed(futures)]
    finally:
        doc.close()


# ----------------------------------------------------------------------
# Native text extractor helper
# ----------------------------------------------------------------------
def extract_native(pdf_path: str, x_tol: float = 1) -> List[Tuple[int, str]]:
    """Read the embedded text layer of every page via pdfplumber.

    Args:
        pdf_path: Path of the PDF to open.
        x_tol: Passed to ``extract_text`` as ``x_tolerance``.

    Returns:
        ``(page_number, text)`` pairs numbered from 1; a page whose
        extraction yields a falsy value is reported as ``""``.
    """
    with pdfplumber.open(pdf_path) as document:
        return [
            (page_no, sheet.extract_text(x_tolerance=x_tol) or "")
            for page_no, sheet in enumerate(document.pages, start=1)
        ]


# ----------------------------------------------------------------------
# Main pipeline (Gradio generator)
# ----------------------------------------------------------------------
def pipeline(pdf_file, langs, mode, progress=gr.Progress()):
    """Extract text from an uploaded PDF, streaming results to the UI.

    Args:
        pdf_file: Uploaded file; either an object with a ``.name`` path
            attribute or a plain path string.
        langs: One OCR language code or a list of them.
        mode: ``"native"`` (text layer only), ``"ocr"`` (OCR every page) or
            ``"auto"`` (text layer, with OCR for pages that have none).
        progress: Progress tracker. Gradio injects it because the default is
            a ``gr.Progress`` instance; callers do not pass it.

    Yields:
        ``(text_so_far, None)`` after each page that produced text, then a
        final ``(text_or_warning, txt_path)`` whose second item is the
        downloadable UTF-8 file. Text is always assembled in page order.

    Raises:
        gr.Error: No file uploaded, file over 200 MB, OCR needed but no
            language selected, or OCR failed twice with ``RuntimeError``.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF.")

    # Accept both a file wrapper exposing `.name` and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Guard: size limit 200 MB
    max_size = 200 * 1024 * 1024
    if os.path.getsize(pdf_path) > max_size:
        raise gr.Error("PDF larger than 200 MB. Please split the document.")

    if langs is None:
        langs = []
    elif not isinstance(langs, (list, tuple)):
        langs = [langs]
    lang_tuple = tuple(langs)

    use_native = mode in ("native", "auto")
    if use_native:
        native_pages = extract_native(pdf_path)
        total_pages = len(native_pages)
    else:
        native_pages = []
        # Only the page count is needed here; close the document right away.
        doc = fitz.open(pdf_path)
        try:
            total_pages = doc.page_count
        finally:
            doc.close()

    chunks = {}  # page number -> formatted chunk for that page

    def assembled() -> str:
        # Join in page order so OCR pages land between their native neighbours.
        return "".join(chunks[n] for n in sorted(chunks))

    pending_ocr = []
    done = 0  # pages fully handled so far (drives the progress bar)

    for idx in range(1, total_pages + 1):
        native_txt = native_pages[idx - 1][1] if use_native else ""

        if native_txt.strip():
            chunks[idx] = f"--- Page {idx} (native) ---\n{native_txt}\n"
            done += 1
            progress((done, total_pages), desc="Extracting text")
            yield assembled(), None
        elif mode in ("auto", "ocr"):
            # No usable text layer: defer to the batched OCR pass below.
            pending_ocr.append(idx)
        else:
            done += 1
            progress((done, total_pages), desc="Extracting text")

    # OCR if needed
    if pending_ocr:
        if not lang_tuple:
            raise gr.Error("Please select at least one OCR language.")
        progress((done, total_pages), desc=f"Running OCR on {len(pending_ocr)} page(s)")
        try:
            ocr_results = run_ocr(pdf_path, pending_ocr, lang_tuple)
        except RuntimeError:
            # run_ocr exposes no resolution setting, so this is a plain second
            # attempt for transient failures; a repeat failure is reported.
            try:
                ocr_results = run_ocr(pdf_path, pending_ocr, lang_tuple)
            except RuntimeError as err:
                raise gr.Error(f"OCR failed: {err}") from err

        for idx, text in sorted(ocr_results, key=lambda item: item[0]):
            done += 1
            progress((done, total_pages), desc="Collecting OCR results")
            if text.strip():
                chunks[idx] = f"--- Page {idx} (OCR) ---\n{text}\n"
                yield assembled(), None

    combined_text = assembled()
    # Write the download file only once everything succeeded, so no handle or
    # half-written file is left behind when an error aborts the run.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp_txt:
        tmp_txt.write(combined_text.encode("utf-8"))
        tmp_txt_path = tmp_txt.name

    # Final yield includes the download file
    yield combined_text or "⚠️ No text detected in the document.", tmp_txt_path


# ----------------------------------------------------------------------
# Gradio Blocks UI
# ----------------------------------------------------------------------
# Visual theme handed to gr.Blocks below: Base theme with a purple primary
# hue, extra-large corner radius and medium spacing.
THEME = gr.themes.Base(
    primary_hue="purple",
    radius_size=gr.themes.sizes.radius_xl,
    spacing_size=gr.themes.sizes.spacing_md,
)

# Remote PDFs passed to gr.Examples as quick-test inputs for the upload widget.
EXAMPLE_URLS = [
    "https://arxiv.org/pdf/2106.14834.pdf",
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
]

# Two-column layout: inputs (file, languages, mode, button) on the left,
# streamed text and the download link on the right.
with gr.Blocks(theme=THEME, title="ZeroGPU PDF OCR") as demo:
    gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")

    with gr.Row():
        with gr.Column(scale=1, min_width=250):
            file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
            # Choices must be EasyOCR language codes because they are passed
            # straight to easyocr.Reader; Simplified Chinese is "ch_sim".
            lang_in = gr.Dropdown(
                ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "ch_sim", "ja", "ar"],
                multiselect=True,
                value=["en"],
                label="OCR language(s)"
            )
            mode_in = gr.Radio(
                ["native", "ocr", "auto"],
                value="auto",
                label="Document type",
                info="native = text only · ocr = images only · auto = mixed",
            )
            run_btn = gr.Button("Extract", variant="primary")

        with gr.Column(scale=2):
            txt_out = gr.Textbox(
                label="Extracted Text (streaming)",
                lines=18,
                show_copy_button=True,
            )
            download_out = gr.File(label="Download .txt")

    # pipeline is a generator: each yield updates the textbox, and the last
    # one also fills in the download file.
    run_btn.click(
        fn=pipeline,
        inputs=[file_in, lang_in, mode_in],
        outputs=[txt_out, download_out],
    )

    # Examples only populate the upload widget (fn=None: nothing is run).
    gr.Examples(
        EXAMPLE_URLS,
        inputs=file_in,
        label="Quick‑test PDFs",
        fn=None,
    )

# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()