Spaces:

CultriX
/

Easy-OCR

Running on Zero

App Files Files Community

CultriX committed on Jun 18

Commit

3d8470e

verified ·

1 Parent(s): 59f3059

Upload 2 files

Browse files

Files changed (1) hide show

app.py +73 -172

app.py CHANGED Viewed

@@ -1,223 +1,124 @@
 """
 📚 ZeroGPU Multilingual PDF Text Extractor
-=========================================
-Features
---------
-• **Native / OCR / Hybrid** modes
-• **Language chooser** (multiselect) with EasyOCR model caching
-• **ZeroGPU** pay‑as‑you‑go: GPU allocated *only* while OCR runs
-• **Streamed output** page‑by‑page + real‑time progress bar
-• **Download‑as‑TXT** button
-• Basic **error handling** (oversize PDF, CUDA OOM, unsupported language)
-Maintained as a single file (`app.py`) for simplicity.
 """
-import os, tempfile, concurrent.futures, itertools, functools, uuid
 from typing import List, Tuple
 import fitz  # PyMuPDF
 import pdfplumber
 import torch
 import gradio as gr
-import spaces  # HF Spaces helper (for ZeroGPU)
 import easyocr
-# ----------------------------------------------------------------------
-# Caching for EasyOCR readers (models are heavy; reuse them)
-# ----------------------------------------------------------------------
 _READERS = {}
 def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
     key = tuple(sorted(lang_codes))
     if key not in _READERS:
-        try:
-            _READERS[key] = easyocr.Reader(list(key), gpu=torch.cuda.is_available())
-        except ValueError as e:
-            raise gr.Error(str(e))
     return _READERS[key]
-# ----------------------------------------------------------------------
-# GPU‑decorated OCR worker (runs ONLY when called)
-# ----------------------------------------------------------------------
-@spaces.GPU(duration=60)
-def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
-    """OCR designated pages and return list[(page_num, text)]."""
     reader = get_reader(lang_codes)
     doc = fitz.open(pdf_path)
-    results = []
-    def ocr_single(idx: int):
         pg = doc[idx - 1]
-        # Adaptive rasterisation scale (A4 ~= 595 × 842 pt)
-        max_side = max(pg.rect.width, pg.rect.height)
-        scale = 2 if max_side <= 600 else 1.5
-        try:
-            pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale))
-        except RuntimeError:
-            # Fallback lower dpi if page too huge
-            pix = pg.get_pixmap()
         img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
         pix.save(img_path)
-        # Single-language ⇒ use detail=1 to filter low‑confidence lines
         if len(lang_codes) == 1:
-            tmp = reader.readtext(img_path, detail=1)
-            txt_lines = [text for _, text, conf in tmp if conf > 0.2]
         else:
-            txt_lines = reader.readtext(img_path, detail=0)
         os.remove(img_path)
-        return idx, "\n".join(txt_lines)
-    # Light parallelism (GPU friendly)
     with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
-        futures = {ex.submit(ocr_single, i): i for i in page_ids}
-        for fut in concurrent.futures.as_completed(futures):
-            results.append(fut.result())
-    return results
-# ----------------------------------------------------------------------
-# Native text extractor helper
-# ----------------------------------------------------------------------
-def extract_native(pdf_path: str, x_tol: float = 1) -> List[Tuple[int, str]]:
     with pdfplumber.open(pdf_path) as pdf:
-        out = []
-        for idx, page in enumerate(pdf.pages, start=1):
-            txt = page.extract_text(x_tolerance=x_tol) or ""
-            out.append((idx, txt))
-        return out
-# ----------------------------------------------------------------------
-# Main pipeline (Gradio generator)
-# ----------------------------------------------------------------------
 def pipeline(pdf_file, langs, mode):
     if pdf_file is None:
         raise gr.Error("Please upload a PDF.")
-    # Guard: size limit 200 MB
-    max_size = 200 * 1024 * 1024
-    if os.path.getsize(pdf_file.name) > max_size:
-        raise gr.Error("PDF larger than 200 MB. Please split the document.")
     langs = langs if isinstance(langs, list) else [langs]
     lang_tuple = tuple(langs)
-    native_chunks, ocr_chunks = [], []
-    combined_text = ""
-    # Create a temporary TXT file for incremental writing (download button)
-    tmp_txt = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
-    tmp_txt_path = tmp_txt.name
-    # Progress bar context
-    with gr.Progress(track_tqdm=False) as prog:
-        native_pages = extract_native(pdf_file.name) if mode in ("native", "auto") else []
-        total_pages = len(native_pages) if native_pages else fitz.open(pdf_file.name).page_count
-        prog.tqdm(total=total_pages)
-        # Process pages one by one (stream output)
-        pending_ocr = []
-        for idx in range(1, total_pages + 1):
-            native_txt = ""
-            if mode in ("native", "auto"):
-                native_txt = native_pages[idx - 1][1]
-            if native_txt.strip():
-                chunk = f"--- Page {idx} (native) ---\n{native_txt}\n"
-                native_chunks.append(chunk)
-                combined_text += chunk
-                tmp_txt.write(chunk.encode("utf-8"))
-                yield combined_text, None
-            else:
-                if mode == "auto":
-                    pending_ocr.append(idx)
-                elif mode == "ocr":
-                    pending_ocr.append(idx)
-            prog.update(advance=1)
-        # OCR if needed
-        if pending_ocr:
-            try:
-                ocr_results = run_ocr(pdf_file.name, pending_ocr, lang_tuple)
-            except RuntimeError as e:
-                # Likely CUDA OOM → retry at lower dpi
-                ocr_results = run_ocr(pdf_file.name, pending_ocr, lang_tuple)
-            for idx, text in sorted(ocr_results, key=lambda x: x[0]):
-                if text.strip():
-                    chunk = f"--- Page {idx} (OCR) ---\n{text}\n"
-                    ocr_chunks.append(chunk)
-                    combined_text += chunk
-                    tmp_txt.write(chunk.encode("utf-8"))
-                    yield combined_text, None
-    tmp_txt.close()
-    # Final yield includes download‑file
-    yield combined_text or "⚠️ No text detected in the document.", tmp_txt_path
-# ----------------------------------------------------------------------
-# Gradio Blocks UI
-# ----------------------------------------------------------------------
-THEME = gr.themes.Base(
-    primary_hue="purple",
-    radius_size=gr.themes.sizes.radius_xxl,
-    spacing_size=gr.themes.sizes.spacing_md,
-)
-EXAMPLE_URLS = [
-    "https://arxiv.org/pdf/2106.14834.pdf",
-    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
-]
-with gr.Blocks(theme=THEME, title="ZeroGPU PDF OCR") as demo:
-    gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")
     with gr.Row():
-        with gr.Column(scale=1, min_width=250):
             file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
             lang_in = gr.Dropdown(
-                ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "zh_cn", "ja", "ar"],
-                multiselect=True,
-                value=["en"],
-                label="OCR language(s)"
             )
             mode_in = gr.Radio(
-                ["native", "ocr", "auto"],
-                value="auto",
                 label="Document type",
-                info="native = text only · ocr = images only · auto = mixed",
             )
-            run_btn = gr.Button("Extract", variant="primary")
         with gr.Column(scale=2):
-            txt_out = gr.Textbox(
-                label="Extracted Text (streaming)",
-                lines=18,
-                show_copy_button=True,
-            )
-            download_out = gr.File(label="Download .txt")
-    run_btn.click(
-        fn=pipeline,
-        inputs=[file_in, lang_in, mode_in],
-        outputs=[txt_out, download_out],
-    )
-    gr.Examples(
-        EXAMPLE_URLS,
-        inputs=file_in,
-        label="Quick‑test PDFs",
-        fn=None,
-    )
 if __name__ == "__main__":
     demo.launch()

 """
 📚 ZeroGPU Multilingual PDF Text Extractor
+(Gradio >= 4.1 compatible – Progress call‑style)
 """
+import os, tempfile, concurrent.futures, uuid
+import threading
 from typing import List, Tuple
 import fitz  # PyMuPDF
 import pdfplumber
 import torch
 import gradio as gr
+import spaces
 import easyocr
+# ----------------- EasyOCR Reader Cache -----------------
 _READERS = {}
 def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
+    """Return a cached EasyOCR reader for the given language codes.
+
+    Readers are cached in ``_READERS`` under the sorted language tuple, so
+    the order in which languages were selected never creates a second model.
+
+    Raises:
+        gr.Error: if ``easyocr.Reader`` rejects the language list with a
+            ``ValueError``; the original message is shown to the user.
+    """
     key = tuple(sorted(lang_codes))
     if key not in _READERS:
+        try:
+            _READERS[key] = easyocr.Reader(list(key), gpu=torch.cuda.is_available())
+        except ValueError as exc:
+            # Report the problem in the UI instead of as a raw traceback.
+            raise gr.Error(str(exc)) from exc
     return _READERS[key]
+# --------------- OCR Worker (GPU) -----------------------
+@spaces.GPU(duration=600)
+def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]):
+    """OCR selected pages of a PDF.
+
+    Args:
+        pdf_path: path of the PDF to rasterise.
+        page_ids: 1-based page numbers to process.
+        lang_codes: language codes used to obtain the EasyOCR reader.
+
+    Returns:
+        A list of ``(page_number, text)`` tuples in the order of ``page_ids``.
+    """
     reader = get_reader(lang_codes)
     doc = fitz.open(pdf_path)
+    # PyMuPDF is not safe for concurrent use, so every access to ``doc`` is
+    # serialised; only the EasyOCR call runs in parallel across workers.
+    render_lock = threading.Lock()
+    def ocr_page(idx: int):
         img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
+        try:
+            with render_lock:
+                pg = doc[idx - 1]
+                # Pages whose longest side is <= 600 pt get the larger zoom.
+                scale = 2 if max(pg.rect.width, pg.rect.height) <= 600 else 1.5
+                pg.get_pixmap(matrix=fitz.Matrix(scale, scale)).save(img_path)
+            if len(lang_codes) == 1:
+                # detail=1 returns confidences so weak lines can be dropped.
+                details = reader.readtext(img_path, detail=1)
+                lines = [t for _, t, conf in details if conf > 0.2]
+            else:
+                lines = reader.readtext(img_path, detail=0)
+        finally:
+            # Remove the temporary image even when rendering or OCR fails.
+            if os.path.exists(img_path):
+                os.remove(img_path)
+        return idx, "\n".join(lines)
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
+            return list(ex.map(ocr_page, page_ids))
+    finally:
+        # Release the document handle whether or not OCR succeeded.
+        doc.close()
+# --------------- Native text extraction ----------------
+def extract_native(pdf_path: str):
+    """Read the embedded text of every page with pdfplumber.
+
+    Returns a list of ``(page_number, text)`` tuples with 1-based page
+    numbers; a page with no extractable text contributes an empty string.
+    """
+    pages_text = []
     with pdfplumber.open(pdf_path) as pdf:
+        for number, page in enumerate(pdf.pages, start=1):
+            text = page.extract_text()
+            pages_text.append((number, text if text else ""))
+    return pages_text
+# --------------- Pipeline (generator) -------------------
 def pipeline(pdf_file, langs, mode):
     if pdf_file is None:
         raise gr.Error("Please upload a PDF.")
+    if os.path.getsize(pdf_file.name) > 200 * 1024 * 1024:
+        raise gr.Error("PDF larger than 200 MB; split it first.")
     langs = langs if isinstance(langs, list) else [langs]
     lang_tuple = tuple(langs)
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
+    combined = ""
+    progress = gr.Progress(track_tqdm=False)
+    doc = fitz.open(pdf_file.name)
+    page_total = doc.page_count
+    native = extract_native(pdf_file.name) if mode in ("native", "auto") else [(None,"")]*page_total
+    ocr_needed = []
+    for idx in range(1, page_total+1):
+        native_txt = native[idx-1][1] if mode in ("native","auto") else ""
+        if native_txt.strip():
+            chunk = f"--- Page {idx} (native) ---\n{native_txt}\n"
+            combined += chunk
+            tmp.write(chunk.encode())
+            yield combined, None
+        else:
+            if mode != "native":
+                ocr_needed.append(idx)
+        progress(idx/page_total)
+    if ocr_needed:
+        try:
+            ocr_results = run_ocr(pdf_file.name, ocr_needed, lang_tuple)
+        except RuntimeError:
+            ocr_results = run_ocr(pdf_file.name, ocr_needed, lang_tuple)
+        for idx, txt in sorted(ocr_results):
+            if txt.strip():
+                chunk = f"--- Page {idx} (OCR) ---\n{txt}\n"
+                combined += chunk
+                tmp.write(chunk.encode())
+                yield combined, None
+    tmp.close()
+    yield combined or "⚠️ No text detected.", tmp.name
+# ------------------ Interface --------------------------
+# Layout: upload and options on the left, streamed text and download on the right.
+theme = gr.themes.Base(primary_hue="purple")
+with gr.Blocks(title="ZeroGPU OCR", theme=theme) as demo:
+    gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")
     with gr.Row():
+        with gr.Column(scale=1):
             file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
             lang_in = gr.Dropdown(
+                # Values must be EasyOCR language codes; Simplified Chinese is "ch_sim".
+                ["en","nl","de","fr","es","it","pt","ru","ch_sim","ja","ar"],
+                multiselect=True, value=["en"], label="OCR language(s)"
             )
             mode_in = gr.Radio(
+                ["native","ocr","auto"], value="auto",
                 label="Document type",
+                info="native=text · ocr=image · auto=mix"
             )
+            run_btn = gr.Button("Extract")
         with gr.Column(scale=2):
+            out_box = gr.Textbox(lines=18, label="Extracted Text", show_copy_button=True)
+            dl = gr.File(label="Download .txt")
+    # pipeline is a generator, so the textbox updates as pages are processed.
+    run_btn.click(pipeline, [file_in, lang_in, mode_in], [out_box, dl])
 if __name__ == "__main__":
     demo.launch()