Spaces:

CultriX
/

Easy-OCR

Running on Zero

App Files Files Community

CultriX committed on Jun 18

Commit

f57cf41

verified ·

1 Parent(s): eda1863

Update app.py

Browse files

Files changed (1) hide show

app.py +183 -94

app.py CHANGED Viewed

@@ -1,124 +1,213 @@
-"""
-📚 ZeroGPU Multilingual PDF Text Extractor
-(Gradio >= 4.1 compatible – Progress call‑style)
-"""
-import os, tempfile, concurrent.futures, uuid
 from typing import List, Tuple
-import fitz  # PyMuPDF
-import pdfplumber
 import torch
 import gradio as gr
-import spaces
 import easyocr
-# ----------------- EasyOCR Reader Cache -----------------
 _READERS = {}
-def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
     key = tuple(sorted(lang_codes))
     if key not in _READERS:
-        _READERS[key] = easyocr.Reader(list(key), gpu=torch.cuda.is_available())
     return _READERS[key]
-# --------------- OCR Worker (GPU) -----------------------
-@spaces.GPU(duration=60)
-def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]):
     reader = get_reader(lang_codes)
-    doc = fitz.open(pdf_path)
-    def ocr_page(idx: int):
-        pg = doc[idx - 1]
-        scale = 2 if max(pg.rect.width, pg.rect.height) <= 600 else 1.5
-        pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale))
-        img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
-        pix.save(img_path)
         if len(lang_codes) == 1:
-            details = reader.readtext(img_path, detail=1)
-            lines = [t for _, t, conf in details if conf > 0.2]
         else:
-            lines = reader.readtext(img_path, detail=0)
-        os.remove(img_path)
-        return idx, "\n".join(lines)
-    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
-        return list(ex.map(ocr_page, page_ids))
-# --------------- Native text extraction ----------------
-def extract_native(pdf_path: str):
-    with pdfplumber.open(pdf_path) as pdf:
-        return [(i+1, p.extract_text() or "") for i, p in enumerate(pdf.pages)]
-# --------------- Pipeline (generator) -------------------
-def pipeline(pdf_file, langs, mode):
-    if pdf_file is None:
-        raise gr.Error("Please upload a PDF.")
-    if os.path.getsize(pdf_file.name) > 200 * 1024 * 1024:
-        raise gr.Error("PDF larger than 200 MB; split it first.")
     langs = langs if isinstance(langs, list) else [langs]
     lang_tuple = tuple(langs)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
     combined = ""
-    progress = gr.Progress(track_tqdm=False)
-    doc = fitz.open(pdf_file.name)
-    page_total = doc.page_count
-    native = extract_native(pdf_file.name) if mode in ("native", "auto") else [(None,"")]*page_total
-    ocr_needed = []
-    for idx in range(1, page_total+1):
-        native_txt = native[idx-1][1] if mode in ("native","auto") else ""
-        if native_txt.strip():
-            chunk = f"--- Page {idx} (native) ---\n{native_txt}\n"
-            combined += chunk
-            tmp.write(chunk.encode())
-            yield combined, None
-        else:
-            if mode != "native":
-                ocr_needed.append(idx)
-        progress(idx/page_total)
-    if ocr_needed:
-        try:
-            ocr_results = run_ocr(pdf_file.name, ocr_needed, lang_tuple)
-        except RuntimeError:
-            ocr_results = run_ocr(pdf_file.name, ocr_needed, lang_tuple)
-        for idx, txt in sorted(ocr_results):
-            if txt.strip():
                 chunk = f"--- Page {idx} (OCR) ---\n{txt}\n"
-                combined += chunk
-                tmp.write(chunk.encode())
                 yield combined, None
     tmp.close()
-    yield combined or "⚠️ No text detected.", tmp.name
-# ------------------ Interface --------------------------
 theme = gr.themes.Base(primary_hue="purple")
-with gr.Blocks(title="ZeroGPU OCR", theme=theme) as demo:
-    gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")
     with gr.Row():
         with gr.Column(scale=1):
-            file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
-            lang_in = gr.Dropdown(
-                ["en","nl","de","fr","es","it","pt","ru","zh_cn","ja","ar"],
-                multiselect=True, value=["en"], label="OCR language(s)"
-            )
-            mode_in = gr.Radio(
-                ["native","ocr","auto"], value="auto",
-                label="Document type",
-                info="native=text · ocr=image · auto=mix"
-            )
-            run_btn = gr.Button("Extract")
         with gr.Column(scale=2):
-            out_box = gr.Textbox(lines=18, label="Extracted Text", show_copy_button=True)
             dl = gr.File(label="Download .txt")
-    run_btn.click(pipeline, [file_in, lang_in, mode_in], [out_box, dl])
 if __name__ == "__main__":
     demo.launch()

+import os
+import tempfile
+import uuid
+import concurrent.futures
 from typing import List, Tuple
+import fitz  # PyMuPDF for PDF operations
 import torch
 import gradio as gr
+import spaces                # HuggingFace Spaces helper (ZeroGPU)
 import easyocr
+import warnings
+# Suppress benign CuDNN LSTM warning
+warnings.filterwarnings("ignore", "RNN module weights are not part")
+# ----------------------------------------------------------------------
+# Configuration constants
+# ----------------------------------------------------------------------
# File extensions accepted by the upload widget: PDF plus common raster images.
SUPPORTED_FILE_TYPES = [
    ".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"
]
# EasyOCR language codes offered in the UI.
# Simplified Chinese is "ch_sim" in EasyOCR; "zh_cn" is not a recognised code
# and makes easyocr.Reader raise when selected.
LANGUAGES = ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "ch_sim", "ja", "ar"]
# Hard upper bound on parallel OCR threads, to avoid GPU OOM.
MAX_OCR_THREADS = 2


def clamp_ocr_threads(raw, ceiling=MAX_OCR_THREADS):
    """
    Turn a raw OCR_THREADS setting into a usable worker count.

    raw:     value read from the environment (string or None).
    ceiling: largest worker count allowed.

    Returns an int in the range 1..ceiling. A missing or non-integer value
    yields `ceiling`; values below 1 are raised to 1 so the result is always
    a valid ThreadPoolExecutor size.
    """
    try:
        requested = int(raw)
    except (TypeError, ValueError):
        return ceiling
    return max(1, min(requested, ceiling))


# Cap parallel OCR threads to avoid GPU OOM
OCR_THREADS = clamp_ocr_threads(os.getenv("OCR_THREADS"))
+# ----------------------------------------------------------------------
+# EasyOCR reader cache
+# ----------------------------------------------------------------------
 _READERS = {}
def get_reader(lang_codes: Tuple[str, ...]):
    """
    Lazily initialize or retrieve an EasyOCR Reader for the given languages.

    Readers are cached in the module-level _READERS dict, keyed by the sorted
    tuple of language codes, so the order of `lang_codes` does not matter.

    The GPU/CPU decision is made once, when a reader is first built:
    spaces.is_gpu_enabled() is used if the installed `spaces` package
    provides it, otherwise torch.cuda.is_available() decides.
    """
    key = tuple(sorted(lang_codes))
    if key not in _READERS:
        # NOTE(review): `is_gpu_enabled` could not be confirmed as part of the
        # `spaces` API, so probe for it instead of assuming it exists.
        gpu_probe = getattr(spaces, "is_gpu_enabled", None)
        if callable(gpu_probe):
            gpu_flag = bool(gpu_probe())
        else:
            gpu_flag = torch.cuda.is_available()
        _READERS[key] = easyocr.Reader(list(key), gpu=gpu_flag)
        print(f"[Init] EasyOCR reader for {key} (GPU={'yes' if gpu_flag else 'no'})")
    return _READERS[key]
+# ----------------------------------------------------------------------
+# OCR helpers
+# ----------------------------------------------------------------------
@spaces.GPU(duration=600)
def run_ocr_pages(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
    """
    OCR the specified pages of a PDF.

    pdf_path:   path of the PDF on disk.
    page_ids:   1-based page numbers to OCR.
    lang_codes: EasyOCR language codes passed to get_reader().

    Returns a list of (page_number, text) tuples in completion order, not
    page order. A page that fails yields (page_number, warning message)
    rather than aborting the batch. An empty `page_ids` returns [].
    """
    import threading  # function-scope import: only needed for the render lock

    if not page_ids:
        return []

    reader = get_reader(lang_codes)
    results = []
    # PyMuPDF does not support concurrent use from several threads, so all
    # access to `doc` and its pixmaps is serialised. Only EasyOCR inference
    # runs in parallel.
    render_lock = threading.Lock()
    with fitz.open(pdf_path) as doc:
        def ocr_page(idx: int) -> Tuple[int, str]:
            img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
            try:
                with render_lock:
                    page = doc[idx - 1]
                    # Zoom 2x when the longer page side is <= 600, else 1.5x
                    scale = 2 if max(page.rect.width, page.rect.height) <= 600 else 1.5
                    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
                    pix.save(img_path)
                # Single-language => detail mode with confidence filtering
                if len(lang_codes) == 1:
                    items = reader.readtext(img_path, detail=1)
                    lines = [t for _, t, conf in items if conf > 0.2]
                else:
                    lines = reader.readtext(img_path, detail=0)
                return idx, "\n".join(lines)
            except Exception as e:
                # Emit a warning instead of halting the entire batch
                msg = f"⚠️ OCR error on page {idx}: {e}"
                print(msg)
                return idx, msg
            finally:
                # Always drop the rendered image, including on OCR failure
                try:
                    os.remove(img_path)
                except OSError:
                    pass  # image was never written, or is already gone

        # Cap the pool size to avoid overloading the GPU; never below 1,
        # because ThreadPoolExecutor rejects max_workers <= 0.
        workers = max(1, min(OCR_THREADS, len(page_ids)))
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
            futures = {pool.submit(ocr_page, pid): pid for pid in page_ids}
            for fut in concurrent.futures.as_completed(futures):
                results.append(fut.result())
    return results
@spaces.GPU(duration=60)
def run_ocr_image(image_path: str, lang_codes: Tuple[str, ...]) -> str:
    """
    OCR a single image file.

    image_path: path of the image on disk.
    lang_codes: EasyOCR language codes passed to get_reader().

    Returns the recognised lines joined with newlines. If OCR fails, the
    warning message is returned instead of raising.

    Decorated with spaces.GPU, like run_ocr_pages, so that image uploads
    also run inside a ZeroGPU allocation.
    """
    reader = get_reader(lang_codes)
    try:
        # Single-language => detail mode with confidence filtering
        if len(lang_codes) == 1:
            items = reader.readtext(image_path, detail=1)
            lines = [t for _, t, conf in items if conf > 0.2]
        else:
            lines = reader.readtext(image_path, detail=0)
        return "\n".join(lines)
    except Exception as e:
        # Best-effort: report the failure as text rather than crashing the UI
        msg = f"⚠️ OCR error on image: {e}"
        print(msg)
        return msg
+# ----------------------------------------------------------------------
+# Streamed output helper
+# ----------------------------------------------------------------------
def emit_chunk(chunk: str, combined: str, tmp_file) -> Tuple[str, None]:
    """
    Record one piece of extracted text.

    The chunk is written to `tmp_file` as UTF-8 bytes and appended to the
    text accumulated so far. Returns (accumulated_text, None); the None
    stands in for the download slot, which has no file yet.
    """
    encoded = chunk.encode("utf-8")
    tmp_file.write(encoded)
    accumulated = f"{combined}{chunk}"
    return accumulated, None
+# ----------------------------------------------------------------------
+# Main extraction pipeline
+# ----------------------------------------------------------------------
def pipeline(upload, langs, mode, progress=gr.Progress(track_tqdm=False)):
    """
    Handles PDF or image uploads, emits native and OCR text incrementally,
    and provides a downloadable .txt at the end.

    upload:   the uploaded file from gr.File (a path string or an object
              exposing the path as `.name`).
    langs:    one EasyOCR language code or a list of them.
    mode:     "native" (embedded text only), "ocr" (OCR every page) or
              "auto" (embedded text where present, OCR elsewhere).
              Only PDFs honour the mode; images are always OCR'd.
    progress: Gradio progress tracker. Gradio injects it only when it is
              declared as a default argument, which is why it is a parameter.

    Yields (text_so_far, None) while working and finally
    (text_or_placeholder, path_to_txt).

    Raises gr.Error for a missing upload, a file over 200 MB, or an empty
    language selection when OCR may be needed.
    """
    if upload is None:
        raise gr.Error("Please upload a file.")
    # Accept both a plain path string and a file object carrying `.name`
    path = getattr(upload, "name", None) or str(upload)
    # File-size guard (200MB)
    if os.path.getsize(path) > 200 * 1024 * 1024:
        raise gr.Error("File larger than 200 MB; please split it.")
    ext = os.path.splitext(path)[1].lower()
    is_pdf = ext == ".pdf"
    # Prepare languages
    langs = langs if isinstance(langs, list) else [langs]
    lang_tuple = tuple(code for code in langs if code)
    # OCR cannot run without a language; native-only PDF extraction can
    may_need_ocr = (not is_pdf) or mode in ("ocr", "auto")
    if may_need_ocr and not lang_tuple:
        raise gr.Error("Please select at least one OCR language.")

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
    combined = ""
    try:
        if is_pdf:
            # Phase 1: Native-text extraction & OCR scheduling
            ocr_pages = []
            with fitz.open(path) as doc:
                total_pages = doc.page_count
                for i, page in enumerate(doc, start=1):
                    text = page.get_text("text") if mode in ("native", "auto") else ""
                    if text.strip():
                        chunk = f"--- Page {i} (native) ---\n{text}\n"
                        combined, _ = emit_chunk(chunk, combined, tmp)
                        yield combined, None
                    elif mode in ("ocr", "auto"):
                        ocr_pages.append(i)
                    progress(i / total_pages)
            # Phase 2: OCR pass on scheduled pages
            if ocr_pages:
                ocr_results = run_ocr_pages(path, ocr_pages, lang_tuple)
                # Results arrive in completion order; emit them in page order
                for idx, txt in sorted(ocr_results, key=lambda x: x[0]):
                    chunk = f"--- Page {idx} (OCR) ---\n{txt}\n"
                    combined, _ = emit_chunk(chunk, combined, tmp)
                    yield combined, None
        else:
            # Image flow
            txt = run_ocr_image(path, lang_tuple)
            chunk = f"--- Image OCR ---\n{txt}\n"
            combined, _ = emit_chunk(chunk, combined, tmp)
            yield combined, None
        # Flush to disk before handing the path to the download widget
        tmp.close()
        # Final step: offer download link
        yield combined or "⚠️ No text detected.", tmp.name
    finally:
        # Also reached on errors or early generator close; close() is
        # harmless if the file is already closed.
        tmp.close()
+# ----------------------------------------------------------------------
+# Gradio UI (Blocks + streaming)
+# ----------------------------------------------------------------------
 theme = gr.themes.Base(primary_hue="purple")
# Layout: inputs (file, languages, mode, button) on the left, results
# (streamed text and downloadable .txt) on the right.
with gr.Blocks(theme=theme, title="ZeroGPU OCR PDF & Image Extractor") as demo:
    gr.Markdown("## 📚 ZeroGPU Multilingual OCR Extractor")
    with gr.Row():
        with gr.Column(scale=1):
            file_in = gr.File(label="Upload PDF or image",
                              file_types=SUPPORTED_FILE_TYPES)
            lang_in = gr.Dropdown(LANGUAGES, multiselect=True, value=["en"],
                                  label="OCR language(s)")
            mode_in = gr.Radio(["native", "ocr", "auto"], value="auto",
                               label="Mode",
                               info="native=text · ocr=image · auto=mix")
            btn = gr.Button("Extract", variant="primary")
        with gr.Column(scale=2):
            out_txt = gr.Textbox(label="Extracted Text", lines=18,
                                 show_copy_button=True)
            dl = gr.File(label="Download .txt")
    # Outputs must be a list of components; pipeline yields
    # (text, file_path) tuples that are matched to them by position.
    btn.click(
        fn=pipeline,
        inputs=[file_in, lang_in, mode_in],
        outputs=[out_txt, dl]
    )
    # Explicitly enable the queue for the generator-based handler
    demo.queue()
if __name__ == "__main__":
    demo.launch()