CultriX committed on
Commit
6583fc2
·
verified ·
1 Parent(s): e10399f

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +31 -29
  2. app.py +206 -62
  3. requirements.txt +1 -1
README.md CHANGED
@@ -1,29 +1,31 @@
1
- ---
2
- license: apache-2.0
3
- title: Easy-OCR
4
- sdk: gradio
5
- emoji: 📚
6
- colorFrom: blue
7
- colorTo: purple
8
- thumbnail: >-
9
- https://cdn-uploads.huggingface.co/production/uploads/6495d5a915d8ef6f01bc75eb/TSmoqoWGoatq_GLsau_La.png
10
- short_description: GPU-Accelerated OCR
11
- ---
12
- # ZeroGPU OCR PDF Extractor
13
-
14
- **Key features**
15
-
16
- * ⚡️ *On‑demand GPU* — the `@spaces.GPU` decorator grabs a GPU only while OCR is running. Perfect for HuggingFace **ZeroGPU** Spaces.
17
- * 📝 Combines native PDF text (via **pdfplumber**) with OCR from images (via **EasyOCR**).
18
- * 🌍 Multilingual: add language codes to the `LANGS` list in `app.py`.
19
-
20
- ## Deploy
21
-
22
- 1. Create a *Gradio* Space and pick **ZeroGPU** in the **Hardware** dropdown (requires a PRO subscription).
23
- 2. Upload these files or the ZIP bundle.
24
- 3. Commit — the Space will build automatically. The first call downloads EasyOCR model weights (~200 MB).
25
-
26
- ## Usage Tips
27
-
28
- * Large PDFs can take several minutes; the decorator is set to `duration=600` s. Adjust if needed.
29
- * For faster queues, lower the duration if your documents are small.
 
 
 
1
+ # ZeroGPU Multilingual PDF Text Extractor
2
+
3
+ This Space marries **speed**, **accuracy**, and a polished **UX**:
4
+
5
+ | Capability | How |
6
+ |------------|-----|
7
+ | On‑demand GPU | `@spaces.GPU` wraps only the OCR phase – 0 credits burnt when you choose **native** mode. |
8
+ | Streaming output | Results appear page‑by‑page; no more guessing “is it stuck?”. |
9
+ | Progress bar | Slick Gradio 4 `Progress` widget with pages processed / total. |
10
+ | Language picker | Loads exactly the EasyOCR models you need for sharper accuracy & faster warm‑up. |
11
+ | Modes | **native** (embedded text only), **ocr** (images only), **auto** (mixed). |
12
+ | Download button | Get a `.txt` file of the final output. |
13
+ | UX polish | Two‑column responsive layout, soft purple theme, sample PDFs for instant demo. |
14
+ | Robustness | File‑size guard (200 MB), one automatic retry when OCR hits a CUDA OOM, clear error message for unsupported languages. |
15
+
16
+ ## Running locally
17
+
18
+ ```bash
19
+ pip install -r requirements.txt
20
+ python app.py
21
+ ```
22
+
23
+ ## Deploy on HuggingFace
24
+
25
+ 1. Create a **Gradio** Space and pick **ZeroGPU** hardware.
26
+ 2. Upload these files or the ZIP bundle.
27
+ 3. Commit – the first OCR call will download the model weights (~200 MB per language family).
28
+
29
+ ## Maintainers
30
+
31
+ *Run `black app.py && ruff check app.py` before committing to stay stylish.*
app.py CHANGED
@@ -1,79 +1,223 @@
1
 
2
  """
3
- ZeroGPU‑ready OCR PDF extractor for HuggingFace Spaces
4
- -----------------------------------------------------
5
- • Uses @spaces.GPU to request a GPU only while needed (ZeroGPU compatible)
6
- • Extracts native text with `pdfplumber`
7
- • Runs GPU‑accelerated OCR on page images with `EasyOCR`
 
 
 
 
 
 
 
 
8
  """
9
 
10
- import gradio as gr
 
 
11
  import fitz # PyMuPDF
12
  import pdfplumber
13
- import easyocr
14
  import torch
15
- import tempfile
16
- import os
17
- import spaces # <-- ZeroGPU decorator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- # Global reader object (lazy‑loaded after GPU is allocated)
20
- READER = None
21
- LANGS = ['en'] # add more language codes as desired
22
 
23
- @spaces.GPU(duration=600) # request a GPU for up to 10 min per call
24
- def extract_text(pdf_file):
25
- """Extract text (native + OCR) from an uploaded PDF"""
26
- global READER
 
 
 
 
 
27
 
28
- # Initialise EasyOCR reader after GPU becomes available
29
- if READER is None:
30
- READER = easyocr.Reader(LANGS, gpu=torch.cuda.is_available())
 
 
 
 
 
 
 
 
 
31
 
32
- native_chunks = []
33
- ocr_chunks = []
 
 
 
 
34
 
35
- # Pass 1 — native text via pdfplumber
36
- with pdfplumber.open(pdf_file.name) as pdf:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  for idx, page in enumerate(pdf.pages, start=1):
38
- txt = page.extract_text() or ""
39
- if txt.strip():
40
- native_chunks.append(f"--- Page {idx} (native) ---\n{txt}\n")
41
-
42
- # Pass 2 — OCR each rendered page image with PyMuPDF + EasyOCR
43
- doc = fitz.open(pdf_file.name)
44
- for idx, page in enumerate(doc, start=1):
45
- # Render page image at ~300 dpi
46
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
47
- tmp_path = os.path.join(tempfile.gettempdir(), f"page_{idx}.png")
48
- pix.save(tmp_path)
49
-
50
- ocr_result = READER.readtext(tmp_path, detail=0)
51
- os.remove(tmp_path)
52
-
53
- if any(line.strip() for line in ocr_result):
54
- ocr_text = "\n".join(ocr_result)
55
- ocr_chunks.append(f"--- Page {idx} (OCR) ---\n{ocr_text}\n")
56
-
57
- combined = "\n".join(native_chunks + ocr_chunks)
58
- return combined or "⚠️ No text detected in the document."
59
-
60
- DESCRIPTION = (
61
- "Drop a PDF to extract **all** text. "
62
- "Native PDF text is captured first; any remaining text in images is "
63
- "recognized using EasyOCR. On ZeroGPU hardware, the app requests a "
64
- "GPU *only* while OCR is running."
65
- )
66
 
67
- iface = gr.Interface(
68
- fn=extract_text,
69
- inputs=gr.File(label="Upload PDF"),
70
- outputs=gr.Textbox(label="Extracted Text", show_copy_button=True),
71
- title="ZeroGPU OCR PDF Extractor",
72
- description=DESCRIPTION,
73
- allow_flagging="never",
74
- examples=None,
75
- theme="default",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  )
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  if __name__ == "__main__":
79
- iface.launch()
 
1
 
2
  """
3
+ 📚 ZeroGPU Multilingual PDF Text Extractor
4
+ =========================================
5
+
6
+ Features
7
+ --------
8
+ • **Native / OCR / Hybrid** modes
9
+ • **Language chooser** (multiselect) with EasyOCR model caching
10
+ • **ZeroGPU** pay‑as‑you‑go: GPU allocated *only* while OCR runs
11
+ • **Streamed output** page‑by‑page + real‑time progress bar
12
+ • **Download‑as‑TXT** button
13
+ • Basic **error handling** (oversize PDF, CUDA OOM, unsupported language)
14
+
15
+ Maintained as a single file (`app.py`) for simplicity.
16
  """
17
 
18
+ import os, tempfile, concurrent.futures, itertools, functools, uuid
19
+ from typing import List, Tuple
20
+
21
  import fitz # PyMuPDF
22
  import pdfplumber
 
23
  import torch
24
+ import gradio as gr
25
+ import spaces # HF Spaces helper (for ZeroGPU)
26
+ import easyocr
27
+
28
+ # ----------------------------------------------------------------------
29
+ # Caching for EasyOCR readers (models are heavy; reuse them)
30
+ # ----------------------------------------------------------------------
31
# Cache of EasyOCR readers keyed by a sorted, de-duplicated tuple of language codes.
_READERS = {}


def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
    """Return a cached EasyOCR reader for the requested language codes.

    The cache key is the sorted, de-duplicated set of codes, so
    ``("en", "de")``, ``("de", "en")`` and ``("en", "de", "en")`` all share
    one reader instead of loading the same models several times.

    Args:
        lang_codes: EasyOCR language codes, in any order.

    Returns:
        The (possibly freshly created) reader for that language set.

    Raises:
        gr.Error: If no language was given, or if EasyOCR rejects the
            selection with a ``ValueError``.
    """
    key = tuple(sorted(set(lang_codes)))

    cached = _READERS.get(key)
    if cached is not None:
        return cached

    if not key:
        raise gr.Error("Please select at least one OCR language.")

    try:
        reader = easyocr.Reader(list(key), gpu=torch.cuda.is_available())
    except ValueError as err:
        # Surface EasyOCR's message in the UI while keeping the original traceback.
        raise gr.Error(str(err)) from err

    _READERS[key] = reader
    return reader
41
 
 
 
 
42
 
43
+ # ----------------------------------------------------------------------
44
+ # GPU‑decorated OCR worker (runs ONLY when called)
45
+ # ----------------------------------------------------------------------
46
@spaces.GPU(duration=600)
def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
    """OCR the designated pages and return ``[(page_num, text), ...]``.

    Pages are rasterised one after another in the calling thread, because
    PyMuPDF must not be driven from several threads at once. Only the
    EasyOCR recognition step is handed to a small thread pool.

    Args:
        pdf_path: Path of the PDF on disk.
        page_ids: 1-based page numbers to OCR.
        lang_codes: EasyOCR language codes used to pick the reader.

    Returns:
        One ``(page_num, text)`` tuple per requested page, in the order of
        ``page_ids``. ``text`` is the recognised lines joined by newlines
        and may be empty.
    """
    if not page_ids:
        # Nothing to do: avoid loading OCR models for an empty request.
        return []

    reader = get_reader(lang_codes)
    single_language = len(lang_codes) == 1

    def render_page(doc, page_num: int) -> str:
        """Rasterise one page to a uniquely named temporary PNG and return its path."""
        page = doc[page_num - 1]
        # Pages whose longer side is at most 600 pt are rendered at 2x;
        # anything larger (A4 is about 595 x 842 pt) is rendered at 1.5x.
        longest_side = max(page.rect.width, page.rect.height)
        scale = 2 if longest_side <= 600 else 1.5
        try:
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
        except RuntimeError:
            # Fall back to the default resolution if the scaled render fails.
            pix = page.get_pixmap()
        img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
        pix.save(img_path)
        return img_path

    def ocr_image(page_num: int, img_path: str) -> Tuple[int, str]:
        """Recognise text in a rendered page image; always delete the image afterwards."""
        try:
            if single_language:
                # Single language: request confidences and drop low-confidence lines.
                detections = reader.readtext(img_path, detail=1)
                lines = [text for _, text, conf in detections if conf > 0.2]
            else:
                lines = reader.readtext(img_path, detail=0)
        finally:
            os.remove(img_path)
        return page_num, "\n".join(lines)

    # The pool is exited (workers joined) before the document is closed.
    with fitz.open(pdf_path) as doc, concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        futures = [
            pool.submit(ocr_image, page_num, render_page(doc, page_num))
            for page_num in page_ids
        ]
        return [future.result() for future in futures]
83
+
84
+
85
+ # ----------------------------------------------------------------------
86
+ # Native text extractor helper
87
+ # ----------------------------------------------------------------------
88
def extract_native(pdf_path: str, x_tol: float = 1) -> List[Tuple[int, str]]:
    """Read the embedded (native) text of every page with pdfplumber.

    Args:
        pdf_path: Path of the PDF on disk.
        x_tol: Value forwarded to pdfplumber as ``x_tolerance``.

    Returns:
        One ``(page_number, text)`` tuple per page, numbered from 1. Pages
        without extractable text yield an empty string.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [
            (number, page.extract_text(x_tolerance=x_tol) or "")
            for number, page in enumerate(document.pages, start=1)
        ]
    return page_texts
95
+
96
+
97
+ # ----------------------------------------------------------------------
98
+ # Main pipeline (Gradio generator)
99
+ # ----------------------------------------------------------------------
100
def pipeline(pdf_file, langs, mode, progress=gr.Progress(track_tqdm=False)):
    """Extract text from an uploaded PDF, streaming results to the UI.

    This is a generator: every yield is ``(text_so_far, download_path)``.
    ``download_path`` stays ``None`` until the final yield, which carries
    the path of a ``.txt`` file containing the full output.

    Args:
        pdf_file: The uploaded file, either a path string or an object
            exposing the path as ``.name``. ``None`` means nothing was uploaded.
        langs: One EasyOCR language code or a list of them.
        mode: ``"native"`` (embedded text only), ``"ocr"`` (OCR every page)
            or ``"auto"`` (OCR only the pages without embedded text).
        progress: Gradio progress tracker. Gradio supplies it through this
            default argument; callers do not pass it.

    Raises:
        gr.Error: No file uploaded, file larger than 200 MB, OCR required
            but no language selected, or OCR failed twice.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF.")

    # Accept both a plain path string and a file object carrying `.name`.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Guard: size limit 200 MB
    max_size = 200 * 1024 * 1024
    if os.path.getsize(pdf_path) > max_size:
        raise gr.Error("PDF larger than 200 MB. Please split the document.")

    if langs is None:
        langs = []
    elif not isinstance(langs, (list, tuple)):
        langs = [langs]
    lang_tuple = tuple(code for code in langs if code)

    use_native = mode in ("native", "auto")
    use_ocr = mode in ("auto", "ocr")

    native_pages = extract_native(pdf_path) if use_native else []
    if native_pages:
        total_pages = len(native_pages)
    else:
        # Close the document again as soon as the page count is known.
        with fitz.open(pdf_path) as doc:
            total_pages = doc.page_count

    combined_text = ""
    pending_ocr = []

    # The TXT file is written incrementally and handed to the download
    # component at the end; `with` closes it even if an error interrupts us.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp_txt:
        tmp_txt_path = tmp_txt.name

        # Pass 1: stream native text page by page and queue the rest for OCR.
        for idx in range(1, total_pages + 1):
            native_txt = native_pages[idx - 1][1] if use_native else ""
            if native_txt.strip():
                chunk = f"--- Page {idx} (native) ---\n{native_txt}\n"
                combined_text += chunk
                tmp_txt.write(chunk.encode("utf-8"))
                yield combined_text, None
            elif use_ocr:
                pending_ocr.append(idx)
            progress((idx, total_pages), desc="Reading pages")

        # Pass 2: OCR the queued pages in a single GPU call.
        if pending_ocr:
            if not lang_tuple:
                raise gr.Error("Please select at least one OCR language.")
            try:
                ocr_results = run_ocr(pdf_path, pending_ocr, lang_tuple)
            except RuntimeError:
                # Likely a CUDA OOM or another transient failure: retry once
                # with the same settings, then report the error in the UI.
                try:
                    ocr_results = run_ocr(pdf_path, pending_ocr, lang_tuple)
                except RuntimeError as err:
                    raise gr.Error(f"OCR failed: {err}") from err

            for idx, text in sorted(ocr_results, key=lambda item: item[0]):
                if text.strip():
                    chunk = f"--- Page {idx} (OCR) ---\n{text}\n"
                    combined_text += chunk
                    tmp_txt.write(chunk.encode("utf-8"))
                    yield combined_text, None

    # The file is closed (and flushed) here, so the download is complete.
    yield combined_text or "⚠️ No text detected in the document.", tmp_txt_path
165
+
166
+
167
+ # ----------------------------------------------------------------------
168
+ # Gradio Blocks UI
169
+ # ----------------------------------------------------------------------
170
# Soft purple theme for the whole UI. `radius_lg` replaces `radius_xl`,
# which is not among the radius sizes listed in the Gradio theming guide
# (NOTE(review): confirm against the installed Gradio version).
THEME = gr.themes.Base(
    primary_hue="purple",
    radius_size=gr.themes.sizes.radius_lg,
    spacing_size=gr.themes.sizes.spacing_md,
)
175
 
176
# Sample PDFs offered as one-click examples below the form.
EXAMPLE_URLS = [
    "https://arxiv.org/pdf/2106.14834.pdf",
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
]

with gr.Blocks(theme=THEME, title="ZeroGPU PDF OCR") as demo:
    gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")

    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1, min_width=250):
            file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
            # Values are passed straight to EasyOCR, so they must be EasyOCR
            # language codes (Simplified Chinese is "ch_sim", not "zh_cn").
            lang_in = gr.Dropdown(
                ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "ch_sim", "ja", "ar"],
                multiselect=True,
                value=["en"],
                label="OCR language(s)",
            )
            mode_in = gr.Radio(
                ["native", "ocr", "auto"],
                value="auto",
                label="Document type",
                info="native = text only · ocr = images only · auto = mixed",
            )
            run_btn = gr.Button("Extract", variant="primary")

        # Right column: streamed text plus the final download.
        with gr.Column(scale=2):
            txt_out = gr.Textbox(
                label="Extracted Text (streaming)",
                lines=18,
                show_copy_button=True,
            )
            download_out = gr.File(label="Download .txt")

    run_btn.click(
        fn=pipeline,
        inputs=[file_in, lang_in, mode_in],
        outputs=[txt_out, download_out],
    )

    # Examples only fill the upload field; they do not trigger extraction.
    gr.Examples(
        EXAMPLE_URLS,
        inputs=file_in,
        label="Quick‑test PDFs",
        fn=None,
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- gradio>=5.34.1
2
  easyocr>=1.7.1
3
  torch>=2.0
4
  pdfplumber>=0.10.3
 
1
+ gradio>=4.1
2
  easyocr>=1.7.1
3
  torch>=2.0
4
  pdfplumber>=0.10.3