CultriX committed on
Commit
f57cf41
·
verified ·
1 Parent(s): eda1863

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -94
app.py CHANGED
@@ -1,124 +1,213 @@
1
-
2
- """
3
- 📚 ZeroGPU Multilingual PDF Text Extractor
4
- (Gradio >= 4.1 compatible – Progress call‑style)
5
- """
6
-
7
- import os, tempfile, concurrent.futures, uuid
8
  from typing import List, Tuple
9
 
10
- import fitz # PyMuPDF
11
- import pdfplumber
12
  import torch
13
  import gradio as gr
14
- import spaces
15
  import easyocr
16
-
17
- # ----------------- EasyOCR Reader Cache -----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  _READERS = {}
19
- def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
 
 
 
 
 
20
  key = tuple(sorted(lang_codes))
21
  if key not in _READERS:
22
- _READERS[key] = easyocr.Reader(list(key), gpu=torch.cuda.is_available())
 
 
23
  return _READERS[key]
24
 
25
- # --------------- OCR Worker (GPU) -----------------------
26
- @spaces.GPU(duration=60)
27
- def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  reader = get_reader(lang_codes)
29
- doc = fitz.open(pdf_path)
30
-
31
- def ocr_page(idx: int):
32
- pg = doc[idx - 1]
33
- scale = 2 if max(pg.rect.width, pg.rect.height) <= 600 else 1.5
34
- pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale))
35
- img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
36
- pix.save(img_path)
37
  if len(lang_codes) == 1:
38
- details = reader.readtext(img_path, detail=1)
39
- lines = [t for _, t, conf in details if conf > 0.2]
40
  else:
41
- lines = reader.readtext(img_path, detail=0)
42
- os.remove(img_path)
43
- return idx, "\n".join(lines)
44
-
45
- with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
46
- return list(ex.map(ocr_page, page_ids))
47
-
48
- # --------------- Native text extraction ----------------
49
- def extract_native(pdf_path: str):
50
- with pdfplumber.open(pdf_path) as pdf:
51
- return [(i+1, p.extract_text() or "") for i, p in enumerate(pdf.pages)]
52
-
53
- # --------------- Pipeline (generator) -------------------
54
- def pipeline(pdf_file, langs, mode):
55
- if pdf_file is None:
56
- raise gr.Error("Please upload a PDF.")
57
- if os.path.getsize(pdf_file.name) > 200 * 1024 * 1024:
58
- raise gr.Error("PDF larger than 200 MB; split it first.")
59
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  langs = langs if isinstance(langs, list) else [langs]
61
  lang_tuple = tuple(langs)
62
-
63
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
64
  combined = ""
65
 
66
- progress = gr.Progress(track_tqdm=False)
67
-
68
- doc = fitz.open(pdf_file.name)
69
- page_total = doc.page_count
70
-
71
- native = extract_native(pdf_file.name) if mode in ("native", "auto") else [(None,"")]*page_total
72
-
73
- ocr_needed = []
74
- for idx in range(1, page_total+1):
75
- native_txt = native[idx-1][1] if mode in ("native","auto") else ""
76
- if native_txt.strip():
77
- chunk = f"--- Page {idx} (native) ---\n{native_txt}\n"
78
- combined += chunk
79
- tmp.write(chunk.encode())
80
- yield combined, None
81
- else:
82
- if mode != "native":
83
- ocr_needed.append(idx)
84
- progress(idx/page_total)
85
-
86
- if ocr_needed:
87
- try:
88
- ocr_results = run_ocr(pdf_file.name, ocr_needed, lang_tuple)
89
- except RuntimeError:
90
- ocr_results = run_ocr(pdf_file.name, ocr_needed, lang_tuple)
91
-
92
- for idx, txt in sorted(ocr_results):
93
- if txt.strip():
94
  chunk = f"--- Page {idx} (OCR) ---\n{txt}\n"
95
- combined += chunk
96
- tmp.write(chunk.encode())
97
  yield combined, None
98
 
 
 
 
 
 
 
 
99
  tmp.close()
100
- yield combined or "⚠️ No text detected.", tmp.name
 
101
 
102
- # ------------------ Interface --------------------------
 
 
103
  theme = gr.themes.Base(primary_hue="purple")
104
- with gr.Blocks(title="ZeroGPU OCR", theme=theme) as demo:
105
- gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")
106
  with gr.Row():
107
  with gr.Column(scale=1):
108
- file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
109
- lang_in = gr.Dropdown(
110
- ["en","nl","de","fr","es","it","pt","ru","zh_cn","ja","ar"],
111
- multiselect=True, value=["en"], label="OCR language(s)"
112
- )
113
- mode_in = gr.Radio(
114
- ["native","ocr","auto"], value="auto",
115
- label="Document type",
116
- info="native=text · ocr=image · auto=mix"
117
- )
118
- run_btn = gr.Button("Extract")
119
  with gr.Column(scale=2):
120
- out_box = gr.Textbox(lines=18, label="Extracted Text", show_copy_button=True)
 
121
  dl = gr.File(label="Download .txt")
122
- run_btn.click(pipeline, [file_in, lang_in, mode_in], [out_box, dl])
 
 
 
 
 
 
 
 
123
  if __name__ == "__main__":
124
  demo.launch()
 
1
+ import os
2
+ import tempfile
3
+ import uuid
4
+ import concurrent.futures
 
 
 
5
  from typing import List, Tuple
6
 
7
+ import fitz # PyMuPDF for PDF operations
 
8
  import torch
9
  import gradio as gr
10
+ import spaces # HuggingFace Spaces helper (ZeroGPU)
11
  import easyocr
12
+ import warnings
13
+
14
+ # Suppress benign CuDNN LSTM warning
15
+ warnings.filterwarnings("ignore", "RNN module weights are not part")
16
+
17
+ # ----------------------------------------------------------------------
18
+ # Configuration constants
19
+ # ----------------------------------------------------------------------
20
# Upload extensions accepted by the file picker: PDFs plus common raster images.
SUPPORTED_FILE_TYPES = [
    ".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif",
]

# Language codes offered in the UI; these are passed straight to EasyOCR.
# EasyOCR names Simplified Chinese "ch_sim" ("zh_cn" is not one of its codes).
LANGUAGES = ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "ch_sim", "ja", "ar"]

# Cap parallel OCR threads to avoid GPU OOM.
# The OCR_THREADS environment variable may lower the cap but never raise it
# above 2; malformed or non-positive values fall back to a safe setting so the
# thread pool is never created with zero workers.
try:
    _requested_ocr_threads = int(os.getenv("OCR_THREADS", "2"))
except ValueError:
    _requested_ocr_threads = 2
OCR_THREADS = max(1, min(_requested_ocr_threads, 2))
26
+
27
+ # ----------------------------------------------------------------------
28
+ # EasyOCR reader cache
29
+ # ----------------------------------------------------------------------
30
  _READERS = {}
31
+
32
def get_reader(lang_codes: Tuple[str, ...]):
    """
    Lazily initialize or retrieve an EasyOCR Reader for the given languages.

    Readers are cached in the module-level ``_READERS`` dict, keyed by the
    sorted tuple of language codes, so each language combination is only
    loaded once per process.

    Args:
        lang_codes: Language codes to load. The legacy ``"zh_cn"`` spelling is
            translated to EasyOCR's ``"ch_sim"``.

    Returns:
        The cached ``easyocr.Reader`` for that language combination.
    """
    # EasyOCR names Simplified Chinese "ch_sim"; keep accepting "zh_cn".
    aliases = {"zh_cn": "ch_sim"}
    key = tuple(sorted(aliases.get(code, code) for code in lang_codes))
    if key not in _READERS:
        # NOTE(review): `spaces.is_gpu_enabled` does not appear to be part of
        # every `spaces` release - use it when present, otherwise ask torch.
        gpu_probe = getattr(spaces, "is_gpu_enabled", None)
        if callable(gpu_probe):
            gpu_flag = bool(gpu_probe())
        else:
            gpu_flag = torch.cuda.is_available()
        _READERS[key] = easyocr.Reader(list(key), gpu=gpu_flag)
        print(f"[Init] EasyOCR reader for {key} (GPU={'yes' if gpu_flag else 'no'})")
    return _READERS[key]
43
 
44
+ # ----------------------------------------------------------------------
45
+ # OCR helpers
46
+ # ----------------------------------------------------------------------
47
@spaces.GPU(duration=600)
def run_ocr_pages(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
    """
    OCR the specified pages of a PDF.

    Each page is rendered to a temporary PNG and passed to the EasyOCR reader.
    Pages are processed by a small thread pool; a failure on one page is
    reported as that page's text instead of aborting the whole batch.

    Args:
        pdf_path: Path of the PDF on disk.
        page_ids: 1-based page numbers to OCR.
        lang_codes: Language codes forwarded to ``get_reader``.

    Returns:
        ``(page_number, text)`` tuples sorted by page number. For a page that
        failed, ``text`` is a warning message describing the error.
    """
    import threading  # local import: only needed for the render lock below

    # Nothing to do - also avoids creating a thread pool with zero workers.
    if not page_ids:
        return []

    reader = get_reader(lang_codes)
    # The document object is shared by all workers; PyMuPDF documents its
    # objects as not safe for concurrent use, so rendering is serialised and
    # only the OCR step itself overlaps.
    render_lock = threading.Lock()
    results = []

    with fitz.open(pdf_path) as doc:
        def ocr_page(idx: int) -> Tuple[int, str]:
            img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
            try:
                with render_lock:
                    page = doc[idx - 1]
                    # Adaptive zoom: 2x (~144 dpi at PDF's 72 dpi base) for
                    # small pages, 1.5x for larger ones.
                    scale = 2 if max(page.rect.width, page.rect.height) <= 600 else 1.5
                    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
                    pix.save(img_path)

                # Single-language => detail mode with confidence filtering
                if len(lang_codes) == 1:
                    items = reader.readtext(img_path, detail=1)
                    lines = [t for _, t, conf in items if conf > 0.2]
                else:
                    lines = reader.readtext(img_path, detail=0)

                return idx, "\n".join(lines)
            except Exception as e:
                # Emit a warning instead of halting the entire batch
                msg = f"⚠️ OCR error on page {idx}: {e}"
                print(msg)
                return idx, msg
            finally:
                # Always remove the rendered image, including when OCR failed.
                try:
                    os.remove(img_path)
                except OSError:
                    pass

        # Cap threadpool size to avoid overloading GPU (never below one worker)
        workers = max(1, min(OCR_THREADS, len(page_ids)))
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
            futures = [pool.submit(ocr_page, pid) for pid in page_ids]
            for fut in concurrent.futures.as_completed(futures):
                results.append(fut.result())

    # Completion order is arbitrary; hand back a deterministic page order.
    results.sort(key=lambda item: item[0])
    return results
90
+
91
@spaces.GPU(duration=120)
def run_ocr_image(image_path: str, lang_codes: Tuple[str, ...]) -> str:
    """
    OCR a single image file.

    Mirrors run_ocr_pages' logic but for one-shot image inputs, including the
    ``spaces.GPU`` decoration so the OCR call runs under the same GPU
    allocation mechanism as the PDF path.

    Args:
        image_path: Path of the image on disk.
        lang_codes: Language codes forwarded to ``get_reader``.

    Returns:
        The recognised text, one detected line per row, or a warning message
        if EasyOCR raised while reading the image.
    """
    reader = get_reader(lang_codes)
    try:
        # Single-language => detail mode with confidence filtering
        if len(lang_codes) == 1:
            items = reader.readtext(image_path, detail=1)
            lines = [t for _, t, conf in items if conf > 0.2]
        else:
            lines = reader.readtext(image_path, detail=0)
        return "\n".join(lines)
    except Exception as e:
        # Report the failure as the result text rather than raising.
        msg = f"⚠️ OCR error on image: {e}"
        print(msg)
        return msg
108
+
109
+ # ----------------------------------------------------------------------
110
+ # Streamed output helper
111
+ # ----------------------------------------------------------------------
112
def emit_chunk(chunk: str, combined: str, tmp_file) -> Tuple[str, None]:
    """
    Persist one piece of extracted text and extend the running transcript.

    The chunk is written to ``tmp_file`` as UTF-8 bytes, and the transcript
    with the chunk appended is returned together with ``None``.
    """
    tmp_file.write(chunk.encode("utf-8"))
    return combined + chunk, None
120
+
121
+ # ----------------------------------------------------------------------
122
+ # Main extraction pipeline
123
+ # ----------------------------------------------------------------------
124
def pipeline(upload, langs, mode):
    """
    Handles PDF or image uploads, emits native and OCR text incrementally,
    and provides a downloadable .txt at the end.

    This is a generator: every yield is ``(text_so_far, download_path)``.
    ``download_path`` stays ``None`` until the final yield, which carries the
    path of the temporary .txt file holding the full transcript.

    Args:
        upload: The uploaded file, either a path string or an object whose
            ``name`` attribute is the path.
        langs: One language code or a list of them.
        mode: ``"native"`` (embedded text only), ``"ocr"`` (OCR every page) or
            ``"auto"`` (embedded text where present, OCR elsewhere). Only
            consulted for PDFs; any other upload is OCR'd as an image.

    Raises:
        gr.Error: If nothing was uploaded, the file exceeds 200 MB, or OCR is
            required but no language is selected.
    """
    if upload is None:
        raise gr.Error("Please upload a file.")
    # Accept both a plain path and a file-like wrapper exposing `.name`.
    path = getattr(upload, "name", upload)
    # File-size guard (200MB)
    if os.path.getsize(path) > 200 * 1024 * 1024:
        raise gr.Error("File larger than 200 MB; please split it.")

    # Prepare languages and temp output
    langs = langs if isinstance(langs, list) else [langs]
    lang_tuple = tuple(code for code in langs if code)

    def require_languages():
        # OCR cannot run without a language; native extraction can.
        if not lang_tuple:
            raise gr.Error("Please select at least one OCR language.")

    ext = os.path.splitext(path)[1].lower()
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
    combined = ""

    try:
        # PDF flow
        if ext == ".pdf":
            # Streaming progress bar
            progress = gr.Progress(track_tqdm=False)

            # Phase 1: Native-text extraction & OCR scheduling
            ocr_pages = []
            with fitz.open(path) as doc:
                total_pages = doc.page_count
                for i, page in enumerate(doc, start=1):
                    text = page.get_text("text") if mode in ("native", "auto") else ""
                    if text.strip():
                        chunk = f"--- Page {i} (native) ---\n{text}\n"
                        combined, _ = emit_chunk(chunk, combined, tmp)
                        yield combined, None
                    elif mode in ("ocr", "auto"):
                        # No usable embedded text: queue the page for OCR.
                        ocr_pages.append(i)
                    progress(i / total_pages)

            # Phase 2: OCR pass on scheduled pages
            if ocr_pages:
                require_languages()
                ocr_results = run_ocr_pages(path, ocr_pages, lang_tuple)
                for idx, txt in sorted(ocr_results, key=lambda x: x[0]):
                    chunk = f"--- Page {idx} (OCR) ---\n{txt}\n"
                    combined, _ = emit_chunk(chunk, combined, tmp)
                    yield combined, None

        # Image flow
        else:
            require_languages()
            txt = run_ocr_image(path, lang_tuple)
            chunk = f"--- Image OCR ---\n{txt}\n"
            combined, _ = emit_chunk(chunk, combined, tmp)
            yield combined, None
    finally:
        # Close the transcript file even on errors or an abandoned generator.
        tmp.close()

    # Final step: offer download link
    yield combined or "⚠️ No text detected.", tmp.name
182
 
183
+ # ----------------------------------------------------------------------
184
+ # Gradio UI (Blocks + streaming)
185
+ # ----------------------------------------------------------------------
186
# Two-column layout: inputs (file, languages, mode, button) on the left,
# streamed text and the download link on the right.
theme = gr.themes.Base(primary_hue="purple")
with gr.Blocks(theme=theme, title="ZeroGPU OCR PDF & Image Extractor") as demo:
    gr.Markdown("## 📚 ZeroGPU Multilingual OCR Extractor")
    with gr.Row():
        with gr.Column(scale=1):
            file_in = gr.File(label="Upload PDF or image",
                              file_types=SUPPORTED_FILE_TYPES)
            lang_in = gr.Dropdown(LANGUAGES, multiselect=True, value=["en"],
                                  label="OCR language(s)")
            mode_in = gr.Radio(["native", "ocr", "auto"], value="auto",
                               label="Mode",
                               info="native=text · ocr=image · auto=mix")
            btn = gr.Button("Extract", variant="primary")
        with gr.Column(scale=2):
            out_txt = gr.Textbox(label="Extracted Text", lines=18,
                                 show_copy_button=True)
            dl = gr.File(label="Download .txt")

    # `outputs` must be a component or a list of components, in the same order
    # as the (text, download_path) tuples that `pipeline` yields.
    btn.click(
        fn=pipeline,
        inputs=[file_in, lang_in, mode_in],
        outputs=[out_txt, dl],
    )

# Enable the queue; `pipeline` is a generator that streams partial results.
demo.queue()

if __name__ == "__main__":
    demo.launch()