Spaces:

CultriX
/

Easy-OCR

Running on Zero

App Files Files Community

CultriX committed on Jun 18

Commit

6583fc2

verified ·

1 Parent(s): e10399f

Upload 3 files

Browse files

Files changed (3) hide show

README.md +31 -29
app.py +206 -62
requirements.txt +1 -1

README.md CHANGED Viewed

@@ -1,29 +1,31 @@
----
-license: apache-2.0
-title: Easy-OCR
-sdk: gradio
-emoji: 📚
-colorFrom: blue
-colorTo: purple
-thumbnail: >-
-  https://cdn-uploads.huggingface.co/production/uploads/6495d5a915d8ef6f01bc75eb/TSmoqoWGoatq_GLsau_La.png
-short_description: GPU-Accelerated OCR
----
-# ZeroGPU OCR PDF Extractor
-**Key features**
-* ⚡️ *On‑demand GPU* — the `@spaces.GPU` decorator grabs a GPU only while OCR is running. Perfect for HuggingFace **ZeroGPU** Spaces.
-* 📝 Combines native PDF text (via **pdfplumber**) with OCR from images (via **EasyOCR**).
-* 🌍 Multilingual: add language codes to the `LANGS` list in `app.py`.
-## Deploy
-1. Create a *Gradio* Space and pick **ZeroGPU** in the **Hardware** dropdown (requires a PRO subscription).
-2. Upload these files or the ZIP bundle.
-3. Commit — the Space will build automatically. The first call downloads EasyOCR model weights (~200 MB).
-## Usage Tips
-* Large PDFs can take several minutes; the decorator is set to `duration=600` s. Adjust if needed.
-* For faster queues, lower the duration if your documents are small.

+# ZeroGPU Multilingual PDF Text Extractor
+This Space marries **speed**, **accuracy**, and a polished **UX**:
+| Capability | How |
+|------------|-----|
+| On‑demand GPU | `@spaces.GPU` wraps only the OCR phase – 0 credits burnt when you choose **native** mode. |
+| Streaming output | Results appear page‑by‑page; no more guessing “is it stuck?”. |
+| Progress bar | Slick Gradio 4 `Progress` widget with pages processed / total. |
+| Language picker | Loads exactly the EasyOCR models you need for sharper accuracy & faster warm‑up. |
+| Modes | **native** (embedded text only), **ocr** (images only), **auto** (mixed). |
+| Download button | Get a `.txt` file of the final output. |
+| UX polish | Two‑column responsive layout, soft purple theme, sample PDFs for instant demo. |
+| Robustness | File‑size guard (200 MB), one automatic OCR retry after a runtime error such as CUDA OOM, unsupported‑language error message. |
+## Running locally
+```bash
+pip install -r requirements.txt
+python app.py
+```
+## Deploy on HuggingFace
+1. Create a **Gradio** Space and pick **ZeroGPU** hardware.
+2. Upload these files or the ZIP bundle.
+3. Commit – the first OCR call downloads the model weights (~200 MB per language family).
+## Maintainers
+*Run `black app.py && ruff check app.py` before committing to stay stylish.*

app.py CHANGED Viewed

@@ -1,79 +1,223 @@
 """
-ZeroGPU‑ready OCR PDF extractor for HuggingFace Spaces
------------------------------------------------------
-• Uses @spaces.GPU to request a GPU only while needed (ZeroGPU compatible)
-• Extracts native text with `pdfplumber`
-• Runs GPU‑accelerated OCR on page images with `EasyOCR`
 """
-import gradio as gr
 import fitz  # PyMuPDF
 import pdfplumber
-import easyocr
 import torch
-import tempfile
-import os
-import spaces  # <-- ZeroGPU decorator
-# Global reader object (lazy‑loaded after GPU is allocated)
-READER = None
-LANGS = ['en']  # add more language codes as desired
-@spaces.GPU(duration=600)  # request a GPU for up to 10 min per call
-def extract_text(pdf_file):
-    """Extract text (native + OCR) from an uploaded PDF"""
-    global READER
-    # Initialise EasyOCR reader after GPU becomes available
-    if READER is None:
-        READER = easyocr.Reader(LANGS, gpu=torch.cuda.is_available())
-    native_chunks = []
-    ocr_chunks = []
-    # Pass 1 — native text via pdfplumber
-    with pdfplumber.open(pdf_file.name) as pdf:
         for idx, page in enumerate(pdf.pages, start=1):
-            txt = page.extract_text() or ""
-            if txt.strip():
-                native_chunks.append(f"--- Page {idx} (native) ---\n{txt}\n")
-    # Pass 2 — OCR each rendered page image with PyMuPDF + EasyOCR
-    doc = fitz.open(pdf_file.name)
-    for idx, page in enumerate(doc, start=1):
-        # Render page image at ~300 dpi
-        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
-        tmp_path = os.path.join(tempfile.gettempdir(), f"page_{idx}.png")
-        pix.save(tmp_path)
-        ocr_result = READER.readtext(tmp_path, detail=0)
-        os.remove(tmp_path)
-        if any(line.strip() for line in ocr_result):
-            ocr_text = "\n".join(ocr_result)
-            ocr_chunks.append(f"--- Page {idx} (OCR) ---\n{ocr_text}\n")
-    combined = "\n".join(native_chunks + ocr_chunks)
-    return combined or "⚠️ No text detected in the document."
-DESCRIPTION = (
-    "Drop a PDF to extract **all** text. "
-    "Native PDF text is captured first; any remaining text in images is "
-    "recognized using EasyOCR. On ZeroGPU hardware, the app requests a "
-    "GPU *only* while OCR is running."
-)
-iface = gr.Interface(
-    fn=extract_text,
-    inputs=gr.File(label="Upload PDF"),
-    outputs=gr.Textbox(label="Extracted Text", show_copy_button=True),
-    title="ZeroGPU OCR PDF Extractor",
-    description=DESCRIPTION,
-    allow_flagging="never",
-    examples=None,
-    theme="default",
 )
 if __name__ == "__main__":
-    iface.launch()

 """
+📚 ZeroGPU Multilingual PDF Text Extractor
+=========================================
+Features
+--------
+• **Native / OCR / Hybrid** modes
+• **Language chooser** (multiselect) with EasyOCR model caching
+• **ZeroGPU** pay‑as‑you‑go: GPU allocated *only* while OCR runs
+• **Streamed output** page‑by‑page + real‑time progress bar
+• **Download‑as‑TXT** button
+• Basic **error handling** (oversize PDF, CUDA OOM, unsupported language)
+Maintained as a single file (`app.py`) for simplicity.
 """
+import os, tempfile, concurrent.futures, itertools, functools, uuid
+from typing import List, Tuple
 import fitz  # PyMuPDF
 import pdfplumber
 import torch
+import gradio as gr
+import spaces  # HF Spaces helper (for ZeroGPU)
+import easyocr
+# ----------------------------------------------------------------------
+# Caching for EasyOCR readers (models are heavy; reuse them)
+# ----------------------------------------------------------------------
+_READERS = {}
+def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
+    key = tuple(sorted(lang_codes))
+    if key not in _READERS:
+        try:
+            _READERS[key] = easyocr.Reader(list(key), gpu=torch.cuda.is_available())
+        except ValueError as e:
+            raise gr.Error(str(e))
+    return _READERS[key]
+# ----------------------------------------------------------------------
+# GPU‑decorated OCR worker (runs ONLY when called)
+# ----------------------------------------------------------------------
+@spaces.GPU(duration=600)
+def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
+    """OCR designated pages and return list[(page_num, text)]."""
+    reader = get_reader(lang_codes)
+    doc = fitz.open(pdf_path)
+    results = []
+    def ocr_single(idx: int):
+        pg = doc[idx - 1]
+        # Adaptive rasterisation scale (A4 ~= 595 × 842 pt)
+        max_side = max(pg.rect.width, pg.rect.height)
+        scale = 2 if max_side <= 600 else 1.5
+        try:
+            pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale))
+        except RuntimeError:
+            # Fallback lower dpi if page too huge
+            pix = pg.get_pixmap()
+        img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
+        pix.save(img_path)
+        # Single-language ⇒ use detail=1 to filter low‑confidence lines
+        if len(lang_codes) == 1:
+            tmp = reader.readtext(img_path, detail=1)
+            txt_lines = [text for _, text, conf in tmp if conf > 0.2]
+        else:
+            txt_lines = reader.readtext(img_path, detail=0)
+        os.remove(img_path)
+        return idx, "\n".join(txt_lines)
+    # Light parallelism (GPU friendly)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
+        futures = {ex.submit(ocr_single, i): i for i in page_ids}
+        for fut in concurrent.futures.as_completed(futures):
+            results.append(fut.result())
+    return results
+# ----------------------------------------------------------------------
+# Native text extractor helper
+# ----------------------------------------------------------------------
+def extract_native(pdf_path: str, x_tol: float = 1) -> List[Tuple[int, str]]:
+    with pdfplumber.open(pdf_path) as pdf:
+        out = []
         for idx, page in enumerate(pdf.pages, start=1):
+            txt = page.extract_text(x_tolerance=x_tol) or ""
+            out.append((idx, txt))
+        return out
+# ----------------------------------------------------------------------
+# Main pipeline (Gradio generator)
+# ----------------------------------------------------------------------
+def pipeline(pdf_file, langs, mode):
+    if pdf_file is None:
+        raise gr.Error("Please upload a PDF.")
+    # Guard: size limit 200 MB
+    max_size = 200 * 1024 * 1024
+    if os.path.getsize(pdf_file.name) > max_size:
+        raise gr.Error("PDF larger than 200 MB. Please split the document.")
+    langs = langs if isinstance(langs, list) else [langs]
+    lang_tuple = tuple(langs)
+    native_chunks, ocr_chunks = [], []
+    combined_text = ""
+    # Create a temporary TXT file for incremental writing (download button)
+    tmp_txt = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
+    tmp_txt_path = tmp_txt.name
+    # Progress bar context
+    with gr.Progress(track_tqdm=False) as prog:
+        native_pages = extract_native(pdf_file.name) if mode in ("native", "auto") else []
+        total_pages = len(native_pages) if native_pages else fitz.open(pdf_file.name).page_count
+        prog.tqdm(total=total_pages)
+        # Process pages one by one (stream output)
+        pending_ocr = []
+        for idx in range(1, total_pages + 1):
+            native_txt = ""
+            if mode in ("native", "auto"):
+                native_txt = native_pages[idx - 1][1]
+            if native_txt.strip():
+                chunk = f"--- Page {idx} (native) ---\n{native_txt}\n"
+                native_chunks.append(chunk)
+                combined_text += chunk
+                tmp_txt.write(chunk.encode("utf-8"))
+                yield combined_text, None
+            else:
+                if mode == "auto":
+                    pending_ocr.append(idx)
+                elif mode == "ocr":
+                    pending_ocr.append(idx)
+            prog.update(advance=1)
+        # OCR if needed
+        if pending_ocr:
+            try:
+                ocr_results = run_ocr(pdf_file.name, pending_ocr, lang_tuple)
+            except RuntimeError as e:
+                # Likely CUDA OOM → retry at lower dpi
+                ocr_results = run_ocr(pdf_file.name, pending_ocr, lang_tuple)
+            for idx, text in sorted(ocr_results, key=lambda x: x[0]):
+                if text.strip():
+                    chunk = f"--- Page {idx} (OCR) ---\n{text}\n"
+                    ocr_chunks.append(chunk)
+                    combined_text += chunk
+                    tmp_txt.write(chunk.encode("utf-8"))
+                    yield combined_text, None
+    tmp_txt.close()
+    # Final yield includes download‑file
+    yield combined_text or "⚠️ No text detected in the document.", tmp_txt_path
+# ----------------------------------------------------------------------
+# Gradio Blocks UI
+# ----------------------------------------------------------------------
+THEME = gr.themes.Base(
+    primary_hue="purple",
+    radius_size=gr.themes.sizes.radius_xl,
+    spacing_size=gr.themes.sizes.spacing_md,
 )
+EXAMPLE_URLS = [
+    "https://arxiv.org/pdf/2106.14834.pdf",
+    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
+]
+with gr.Blocks(theme=THEME, title="ZeroGPU PDF OCR") as demo:
+    gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")
+    with gr.Row():
+        with gr.Column(scale=1, min_width=250):
+            file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
+            lang_in = gr.Dropdown(
+                ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "zh_cn", "ja", "ar"],
+                multiselect=True,
+                value=["en"],
+                label="OCR language(s)"
+            )
+            mode_in = gr.Radio(
+                ["native", "ocr", "auto"],
+                value="auto",
+                label="Document type",
+                info="native = text only · ocr = images only · auto = mixed",
+            )
+            run_btn = gr.Button("Extract", variant="primary")
+        with gr.Column(scale=2):
+            txt_out = gr.Textbox(
+                label="Extracted Text (streaming)",
+                lines=18,
+                show_copy_button=True,
+            )
+            download_out = gr.File(label="Download .txt")
+    run_btn.click(
+        fn=pipeline,
+        inputs=[file_in, lang_in, mode_in],
+        outputs=[txt_out, download_out],
+    )
+    gr.Examples(
+        EXAMPLE_URLS,
+        inputs=file_in,
+        label="Quick‑test PDFs",
+        fn=None,
+    )
 if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-gradio>=5.34.1
 easyocr>=1.7.1
 torch>=2.0
 pdfplumber>=0.10.3

+gradio>=4.1
 easyocr>=1.7.1
 torch>=2.0
 pdfplumber>=0.10.3