CultriX committed on
Commit
3d8470e
·
verified ·
1 Parent(s): 59f3059

Upload 2 files

Browse files
Files changed (1) hide show
  1. app.py +73 -172
app.py CHANGED
@@ -1,223 +1,124 @@
1
 
2
  """
3
  📚 ZeroGPU Multilingual PDF Text Extractor
4
- =========================================
5
-
6
- Features
7
- --------
8
- • **Native / OCR / Hybrid** modes
9
- • **Language chooser** (multiselect) with EasyOCR model caching
10
- • **ZeroGPU** pay‑as‑you‑go: GPU allocated *only* while OCR runs
11
- • **Streamed output** page‑by‑page + real‑time progress bar
12
- • **Download‑as‑TXT** button
13
- • Basic **error handling** (oversize PDF, CUDA OOM, unsupported language)
14
-
15
- Maintained as a single file (`app.py`) for simplicity.
16
  """
17
 
18
- import os, tempfile, concurrent.futures, itertools, functools, uuid
19
  from typing import List, Tuple
20
 
21
  import fitz # PyMuPDF
22
  import pdfplumber
23
  import torch
24
  import gradio as gr
25
- import spaces # HF Spaces helper (for ZeroGPU)
26
  import easyocr
27
 
28
- # ----------------------------------------------------------------------
29
- # Caching for EasyOCR readers (models are heavy; reuse them)
30
- # ----------------------------------------------------------------------
31
  _READERS = {}
32
-
33
  def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
34
  key = tuple(sorted(lang_codes))
35
  if key not in _READERS:
36
- try:
37
- _READERS[key] = easyocr.Reader(list(key), gpu=torch.cuda.is_available())
38
- except ValueError as e:
39
- raise gr.Error(str(e))
40
  return _READERS[key]
41
 
42
-
43
- # ----------------------------------------------------------------------
44
- # GPU‑decorated OCR worker (runs ONLY when called)
45
- # ----------------------------------------------------------------------
46
- @spaces.GPU(duration=60)
47
- def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
48
- """OCR designated pages and return list[(page_num, text)]."""
49
  reader = get_reader(lang_codes)
50
  doc = fitz.open(pdf_path)
51
- results = []
52
 
53
- def ocr_single(idx: int):
54
  pg = doc[idx - 1]
55
- # Adaptive rasterisation scale (A4 ~= 595 × 842 pt)
56
- max_side = max(pg.rect.width, pg.rect.height)
57
- scale = 2 if max_side <= 600 else 1.5
58
- try:
59
- pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale))
60
- except RuntimeError:
61
- # Fallback lower dpi if page too huge
62
- pix = pg.get_pixmap()
63
  img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
64
  pix.save(img_path)
65
-
66
- # Single-language ⇒ use detail=1 to filter low‑confidence lines
67
  if len(lang_codes) == 1:
68
- tmp = reader.readtext(img_path, detail=1)
69
- txt_lines = [text for _, text, conf in tmp if conf > 0.2]
70
  else:
71
- txt_lines = reader.readtext(img_path, detail=0)
72
-
73
  os.remove(img_path)
74
- return idx, "\n".join(txt_lines)
75
 
76
- # Light parallelism (GPU friendly)
77
  with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
78
- futures = {ex.submit(ocr_single, i): i for i in page_ids}
79
- for fut in concurrent.futures.as_completed(futures):
80
- results.append(fut.result())
81
 
82
- return results
83
-
84
-
85
- # ----------------------------------------------------------------------
86
- # Native text extractor helper
87
- # ----------------------------------------------------------------------
88
- def extract_native(pdf_path: str, x_tol: float = 1) -> List[Tuple[int, str]]:
89
  with pdfplumber.open(pdf_path) as pdf:
90
- out = []
91
- for idx, page in enumerate(pdf.pages, start=1):
92
- txt = page.extract_text(x_tolerance=x_tol) or ""
93
- out.append((idx, txt))
94
- return out
95
 
96
-
97
- # ----------------------------------------------------------------------
98
- # Main pipeline (Gradio generator)
99
- # ----------------------------------------------------------------------
100
  def pipeline(pdf_file, langs, mode):
101
  if pdf_file is None:
102
  raise gr.Error("Please upload a PDF.")
103
-
104
- # Guard: size limit 200 MB
105
- max_size = 200 * 1024 * 1024
106
- if os.path.getsize(pdf_file.name) > max_size:
107
- raise gr.Error("PDF larger than 200 MB. Please split the document.")
108
 
109
  langs = langs if isinstance(langs, list) else [langs]
110
  lang_tuple = tuple(langs)
111
 
112
- native_chunks, ocr_chunks = [], []
113
- combined_text = ""
114
-
115
- # Create a temporary TXT file for incremental writing (download button)
116
- tmp_txt = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
117
- tmp_txt_path = tmp_txt.name
118
-
119
- # Progress bar context
120
- with gr.Progress(track_tqdm=False) as prog:
121
- native_pages = extract_native(pdf_file.name) if mode in ("native", "auto") else []
122
- total_pages = len(native_pages) if native_pages else fitz.open(pdf_file.name).page_count
123
- prog.tqdm(total=total_pages)
124
-
125
- # Process pages one by one (stream output)
126
- pending_ocr = []
127
-
128
- for idx in range(1, total_pages + 1):
129
- native_txt = ""
130
- if mode in ("native", "auto"):
131
- native_txt = native_pages[idx - 1][1]
132
-
133
- if native_txt.strip():
134
- chunk = f"--- Page {idx} (native) ---\n{native_txt}\n"
135
- native_chunks.append(chunk)
136
- combined_text += chunk
137
- tmp_txt.write(chunk.encode("utf-8"))
138
- yield combined_text, None
139
- else:
140
- if mode == "auto":
141
- pending_ocr.append(idx)
142
- elif mode == "ocr":
143
- pending_ocr.append(idx)
144
- prog.update(advance=1)
145
-
146
- # OCR if needed
147
- if pending_ocr:
148
- try:
149
- ocr_results = run_ocr(pdf_file.name, pending_ocr, lang_tuple)
150
- except RuntimeError as e:
151
- # Likely CUDA OOM → retry at lower dpi
152
- ocr_results = run_ocr(pdf_file.name, pending_ocr, lang_tuple)
153
-
154
- for idx, text in sorted(ocr_results, key=lambda x: x[0]):
155
- if text.strip():
156
- chunk = f"--- Page {idx} (OCR) ---\n{text}\n"
157
- ocr_chunks.append(chunk)
158
- combined_text += chunk
159
- tmp_txt.write(chunk.encode("utf-8"))
160
- yield combined_text, None
161
-
162
- tmp_txt.close()
163
- # Final yield includes download‑file
164
- yield combined_text or "⚠️ No text detected in the document.", tmp_txt_path
165
-
166
-
167
- # ----------------------------------------------------------------------
168
- # Gradio Blocks UI
169
- # ----------------------------------------------------------------------
170
- THEME = gr.themes.Base(
171
- primary_hue="purple",
172
- radius_size=gr.themes.sizes.radius_xxl,
173
- spacing_size=gr.themes.sizes.spacing_md,
174
- )
175
-
176
- EXAMPLE_URLS = [
177
- "https://arxiv.org/pdf/2106.14834.pdf",
178
- "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
179
- ]
180
-
181
- with gr.Blocks(theme=THEME, title="ZeroGPU PDF OCR") as demo:
182
- gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  with gr.Row():
185
- with gr.Column(scale=1, min_width=250):
186
  file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
187
  lang_in = gr.Dropdown(
188
- ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "zh_cn", "ja", "ar"],
189
- multiselect=True,
190
- value=["en"],
191
- label="OCR language(s)"
192
  )
193
  mode_in = gr.Radio(
194
- ["native", "ocr", "auto"],
195
- value="auto",
196
  label="Document type",
197
- info="native = text only · ocr = images only · auto = mixed",
198
  )
199
- run_btn = gr.Button("Extract", variant="primary")
200
-
201
  with gr.Column(scale=2):
202
- txt_out = gr.Textbox(
203
- label="Extracted Text (streaming)",
204
- lines=18,
205
- show_copy_button=True,
206
- )
207
- download_out = gr.File(label="Download .txt")
208
-
209
- run_btn.click(
210
- fn=pipeline,
211
- inputs=[file_in, lang_in, mode_in],
212
- outputs=[txt_out, download_out],
213
- )
214
-
215
- gr.Examples(
216
- EXAMPLE_URLS,
217
- inputs=file_in,
218
- label="Quick‑test PDFs",
219
- fn=None,
220
- )
221
-
222
  if __name__ == "__main__":
223
  demo.launch()
 
1
 
2
  """
3
  📚 ZeroGPU Multilingual PDF Text Extractor
4
+ (Gradio >= 4.1 compatible – Progress call‑style)
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
+ import os, tempfile, concurrent.futures, uuid
8
  from typing import List, Tuple
9
 
10
  import fitz # PyMuPDF
11
  import pdfplumber
12
  import torch
13
  import gradio as gr
14
+ import spaces
15
  import easyocr
16
 
17
# ----------------- EasyOCR Reader Cache -----------------
# Maps a sorted tuple of language codes to the reader built for that
# combination, so each combination is constructed at most once per process.
_READERS = {}


def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
    """Return the EasyOCR reader for ``lang_codes``, building it on first use.

    The cache key is the sorted tuple of codes, so ``("en", "de")`` and
    ``("de", "en")`` share one reader.

    Args:
        lang_codes: Language codes to load into the reader.

    Returns:
        The cached (or newly created) ``easyocr.Reader``.

    Raises:
        gr.Error: If constructing the reader raises ``ValueError``
            (presumably an unsupported or incompatible language code --
            confirm against the EasyOCR docs). The original message is kept
            so the UI can show it instead of a raw traceback.
    """
    key = tuple(sorted(lang_codes))
    try:
        return _READERS[key]
    except KeyError:
        pass

    try:
        reader = easyocr.Reader(list(key), gpu=torch.cuda.is_available())
    except ValueError as err:
        # Surface configuration problems as a user-facing Gradio error.
        raise gr.Error(str(err)) from err

    # Only successful constructions are cached; a failed key can be retried.
    _READERS[key] = reader
    return reader
24
 
25
# --------------- OCR Worker (GPU) -----------------------
@spaces.GPU(duration=600)
def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]):
    """OCR the requested pages of a PDF.

    Each page is rasterised to a temporary PNG, passed to the EasyOCR reader
    for ``lang_codes`` and the PNG is deleted again, even when rendering or
    recognition fails.

    Args:
        pdf_path: Path of the PDF to read.
        page_ids: 1-based page numbers to process.
        lang_codes: Language codes forwarded to ``get_reader``.

    Returns:
        A list of ``(page_number, text)`` tuples in the same order as
        ``page_ids``; recognised lines of a page are joined with newlines.
    """
    import threading  # local import: only this worker needs it

    reader = get_reader(lang_codes)
    single_language = len(lang_codes) == 1
    # All access to the shared PyMuPDF document is serialised through this
    # lock; only the EasyOCR call runs concurrently.
    # NOTE(review): PyMuPDF documents itself as not thread-safe -- confirm
    # that lock-serialised access from worker threads is acceptable.
    render_lock = threading.Lock()
    doc = fitz.open(pdf_path)

    def ocr_page(idx: int):
        img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
        try:
            with render_lock:
                pg = doc[idx - 1]
                # Pages whose longer side is at most 600 units are rendered
                # at 2x, larger pages at 1.5x.
                scale = 2 if max(pg.rect.width, pg.rect.height) <= 600 else 1.5
                pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale))
                pix.save(img_path)

            if single_language:
                # detail=1 yields (box, text, confidence) triples, which lets
                # low-confidence lines be dropped.
                details = reader.readtext(img_path, detail=1)
                lines = [t for _, t, conf in details if conf > 0.2]
            else:
                lines = reader.readtext(img_path, detail=0)
        finally:
            # Never leave page images behind, whatever happened above.
            try:
                os.remove(img_path)
            except FileNotFoundError:
                pass
        return idx, "\n".join(lines)

    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
            return list(ex.map(ocr_page, page_ids))
    finally:
        doc.close()
 
 
47
 
48
# --------------- Native text extraction ----------------
def extract_native(pdf_path: str):
    """Collect the text pdfplumber extracts from every page of a PDF.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list of ``(page_number, text)`` tuples with 1-based page numbers;
        ``text`` is the empty string when pdfplumber returns nothing for a
        page.
    """
    pages_text = []
    with pdfplumber.open(pdf_path) as pdf:
        for number, page in enumerate(pdf.pages, start=1):
            extracted = page.extract_text()
            pages_text.append((number, extracted if extracted else ""))
    return pages_text
 
 
 
 
52
 
53
# --------------- Pipeline (generator) -------------------
def pipeline(pdf_file, langs, mode, progress=gr.Progress(track_tqdm=False)):
    """Extract text from an uploaded PDF, streaming partial results.

    Pages with native text (modes ``"native"`` and ``"auto"``) are emitted
    first, one yield per page. Pages without native text are then sent to
    ``run_ocr`` in one batch (modes ``"ocr"`` and ``"auto"``) and appended
    afterwards, so in ``"auto"`` mode OCR pages follow all native pages
    rather than appearing in strict page order.

    Args:
        pdf_file: The uploaded file; either an object exposing ``.name`` or
            a plain path (presumably what ``gr.File`` passes -- confirm for
            the installed Gradio version).
        langs: A single OCR language code or a list of codes.
        mode: ``"native"``, ``"ocr"`` or ``"auto"``.
        progress: Gradio progress tracker. It is declared as a default
            argument because that is how Gradio's documented pattern
            attaches a tracker to an event handler.

    Yields:
        ``(text_so_far, None)`` after each page that produced text, then a
        final ``(text_or_warning, txt_path)`` where ``txt_path`` is a
        temporary ``.txt`` file holding the full output.

    Raises:
        gr.Error: No file uploaded, file larger than 200 MB, no OCR language
            chosen while OCR may be needed, or OCR failing twice in a row
            with ``RuntimeError``.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF.")
    pdf_path = getattr(pdf_file, "name", pdf_file)
    if os.path.getsize(pdf_path) > 200 * 1024 * 1024:
        raise gr.Error("PDF larger than 200 MB; split it first.")

    langs = langs if isinstance(langs, list) else [langs]
    # Drop empty entries such as None from a cleared dropdown.
    lang_tuple = tuple(code for code in langs if code)
    if mode != "native" and not lang_tuple:
        raise gr.Error("Please select at least one OCR language.")

    use_native = mode in ("native", "auto")

    # The document is only needed for its page count; close it right away.
    doc = fitz.open(pdf_path)
    try:
        page_total = doc.page_count
    finally:
        doc.close()

    native = extract_native(pdf_path) if use_native else []

    combined = ""
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
    try:
        ocr_needed = []
        for idx in range(1, page_total + 1):
            native_txt = native[idx - 1][1] if use_native else ""
            if native_txt.strip():
                chunk = f"--- Page {idx} (native) ---\n{native_txt}\n"
                combined += chunk
                tmp.write(chunk.encode())
                yield combined, None
            elif mode != "native":
                ocr_needed.append(idx)
            progress(idx / page_total)

        if ocr_needed:
            # One retry with identical settings, as before; a second failure
            # is reported to the user instead of escaping as a raw exception.
            for attempt in range(2):
                try:
                    ocr_results = run_ocr(pdf_path, ocr_needed, lang_tuple)
                    break
                except RuntimeError as err:
                    if attempt:
                        raise gr.Error(f"OCR failed: {err}") from err

            for idx, txt in sorted(ocr_results):
                if txt.strip():
                    chunk = f"--- Page {idx} (OCR) ---\n{txt}\n"
                    combined += chunk
                    tmp.write(chunk.encode())
                    yield combined, None
    finally:
        # Also runs on errors and when the generator is closed early, so the
        # file handle is never leaked.
        tmp.close()

    yield combined or "⚠️ No text detected.", tmp.name
101
+
102
# ------------------ Interface --------------------------
# Layout: inputs (file, languages, mode, button) in the left column, the
# extracted text and the download link in the wider right column.
# NOTE(review): the nesting below is reconstructed from statement order;
# confirm it against the deployed app.py.
theme = gr.themes.Base(primary_hue="purple")
with gr.Blocks(title="ZeroGPU OCR", theme=theme) as demo:
    gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")
    with gr.Row():
        with gr.Column(scale=1):
            file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
            lang_in = gr.Dropdown(
                # Values are passed straight to easyocr.Reader, so they must
                # be EasyOCR language codes; Simplified Chinese is "ch_sim"
                # (the previous "zh_cn" is not an EasyOCR code).
                ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "ch_sim", "ja", "ar"],
                multiselect=True,
                value=["en"],
                label="OCR language(s)",
            )
            mode_in = gr.Radio(
                ["native", "ocr", "auto"],
                value="auto",
                label="Document type",
                info="native=text · ocr=image · auto=mix",
            )
            run_btn = gr.Button("Extract")
        with gr.Column(scale=2):
            out_box = gr.Textbox(lines=18, label="Extracted Text", show_copy_button=True)
            dl = gr.File(label="Download .txt")
    # pipeline is a generator: every yield updates the textbox, and the last
    # yield also supplies the downloadable file.
    run_btn.click(pipeline, [file_in, lang_in, mode_in], [out_box, dl])

if __name__ == "__main__":
    demo.launch()