"""
📚 ZeroGPU Multilingual PDF Text Extractor
=========================================

Features
--------
• **Native / OCR / Auto** modes (auto mixes native extraction with OCR fallback)  
• **Language chooser** (multiselect) with EasyOCR model caching  
• **ZeroGPU** pay‑as‑you‑go: GPU allocated *only* while OCR runs  
• **Streamed output** page‑by‑page + real‑time progress bar  
• **Download‑as‑TXT** button  
• Basic **error handling** (oversize PDF, CUDA OOM, unsupported language)

Maintained as a single file (`app.py`) for simplicity.
"""

import os, tempfile, concurrent.futures, uuid
from typing import List, Tuple

import fitz  # PyMuPDF
import pdfplumber
import torch
import gradio as gr
import spaces  # HF Spaces helper (for ZeroGPU)
import easyocr
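
# Rough dependency sketch (inferred from the imports above; pin exact versions in
# the Space's requirements.txt as needed):
#   gradio, spaces, easyocr, torch, PyMuPDF (imported as `fitz`), pdfplumber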

# ----------------------------------------------------------------------
# Caching for EasyOCR readers (models are heavy; reuse them)
# ----------------------------------------------------------------------
_READERS = {}

def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
    key = tuple(sorted(lang_codes))
    if key not in _READERS:
        try:
            _READERS[key] = easyocr.Reader(list(key), gpu=torch.cuda.is_available())
        except ValueError as e:
            raise gr.Error(str(e))
    return _READERS[key]
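
# Cache behaviour: the key is the sorted language tuple, so e.g.
# get_reader(("nl", "en")) and get_reader(("en", "nl")) reuse the same
# EasyOCR reader instance instead of reloading the models.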


# ----------------------------------------------------------------------
# GPU‑decorated OCR worker (runs ONLY when called)
# ----------------------------------------------------------------------
@spaces.GPU(duration=600)
def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
    """OCR the designated pages and return list[(page_num, text)]."""
    reader = get_reader(lang_codes)
    doc = fitz.open(pdf_path)

    # Rasterise pages sequentially (PyMuPDF documents are not thread-safe);
    # only the OCR calls further down are parallelised.
    images = []  # list of (page_num, image_path)
    for idx in page_ids:
        pg = doc[idx - 1]
        # Adaptive rasterisation scale (A4 ~= 595 × 842 pt)
        max_side = max(pg.rect.width, pg.rect.height)
        scale = 2 if max_side <= 600 else 1.5
        try:
            pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale))
        except RuntimeError:
            # Fall back to the default resolution if the page is too large
            pix = pg.get_pixmap()
        img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
        pix.save(img_path)
        images.append((idx, img_path))
    doc.close()

    def ocr_single(idx: int, img_path: str) -> Tuple[int, str]:
        # Single language ⇒ use detail=1 so low-confidence lines can be dropped
        if len(lang_codes) == 1:
            raw = reader.readtext(img_path, detail=1)
            txt_lines = [text for _, text, conf in raw if conf > 0.2]
        else:
            txt_lines = reader.readtext(img_path, detail=0)
        os.remove(img_path)
        return idx, "\n".join(txt_lines)

    # Light parallelism over the OCR calls (GPU friendly)
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
        futures = [ex.submit(ocr_single, i, p) for i, p in images]
        for fut in concurrent.futures.as_completed(futures):
            results.append(fut.result())

    return results
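
# Illustrative local check (not called at import time; "sample.pdf" is a
# placeholder path, and the @spaces.GPU decorator is expected to be a no-op when
# not running on ZeroGPU hardware):
#   run_ocr("sample.pdf", [1], ("en",))  ->  e.g. [(1, "recognised text …")]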


# ----------------------------------------------------------------------
# Native text extractor helper
# ----------------------------------------------------------------------
def extract_native(pdf_path: str, x_tol: float = 1) -> List[Tuple[int, str]]:
    with pdfplumber.open(pdf_path) as pdf:
        out = []
        for idx, page in enumerate(pdf.pages, start=1):
            txt = page.extract_text(x_tolerance=x_tol) or ""
            out.append((idx, txt))
        return out
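
# Example of the return shape: for a two-page PDF with a text layer,
# extract_native("doc.pdf") gives [(1, "page 1 text"), (2, "page 2 text")];
# pages without a text layer come back as empty strings, which is what the
# pipeline below uses to decide whether a page needs OCR.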


# ----------------------------------------------------------------------
# Main pipeline (Gradio generator)
# ----------------------------------------------------------------------
def pipeline(pdf_file, langs, mode, progress=gr.Progress()):
    if pdf_file is None:
        raise gr.Error("Please upload a PDF.")

    # gr.File may pass a plain path string or a tempfile-like object with .name,
    # depending on the Gradio version ⇒ normalise to a path string
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Guard: size limit 200 MB
    max_size = 200 * 1024 * 1024
    if os.path.getsize(pdf_path) > max_size:
        raise gr.Error("PDF larger than 200 MB. Please split the document.")

    langs = langs if isinstance(langs, list) else [langs]
    lang_tuple = tuple(langs)

    combined_text = ""

    # Create a temporary TXT file for incremental writing (download button)
    tmp_txt = tempfile.NamedTemporaryFile(delete=False, suffix=".txt")
    tmp_txt_path = tmp_txt.name

    # Native extraction first (skipped in pure-OCR mode)
    native_pages = extract_native(pdf_path) if mode in ("native", "auto") else []
    if native_pages:
        total_pages = len(native_pages)
    else:
        with fitz.open(pdf_path) as doc:
            total_pages = doc.page_count

    # Process pages one by one (stream output)
    pending_ocr = []

    for idx in range(1, total_pages + 1):
        progress(idx / total_pages, desc=f"Page {idx}/{total_pages}")
        native_txt = native_pages[idx - 1][1] if native_pages else ""

        if native_txt.strip():
            chunk = f"--- Page {idx} (native) ---\n{native_txt}\n"
            combined_text += chunk
            tmp_txt.write(chunk.encode("utf-8"))
            yield combined_text, None
        elif mode in ("auto", "ocr"):
            pending_ocr.append(idx)

    # OCR if needed
    if pending_ocr:
        try:
            ocr_results = run_ocr(pdf_path, pending_ocr, lang_tuple)
        except RuntimeError:
            # Likely CUDA OOM → free cached GPU memory and retry once
            torch.cuda.empty_cache()
            ocr_results = run_ocr(pdf_path, pending_ocr, lang_tuple)

        for idx, text in sorted(ocr_results, key=lambda x: x[0]):
            if text.strip():
                chunk = f"--- Page {idx} (OCR) ---\n{text}\n"
                combined_text += chunk
                tmp_txt.write(chunk.encode("utf-8"))
                yield combined_text, None

    tmp_txt.close()
    # Final yield attaches the download file
    yield combined_text or "⚠️ No text detected in the document.", tmp_txt_path
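
# Note: `pipeline` is a generator. Each yield streams the text accumulated so far
# to the textbox (the download output stays None); only the final yield attaches
# the .txt path. Rough local sketch ("sample.pdf" is a placeholder; progress
# updates only surface inside a Gradio event):
#   import types
#   for text, path in pipeline(types.SimpleNamespace(name="sample.pdf"), ["en"], "auto"):
#       print(len(text), path)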


# ----------------------------------------------------------------------
# Gradio Blocks UI
# ----------------------------------------------------------------------
THEME = gr.themes.Base(
    primary_hue="purple",
    radius_size=gr.themes.sizes.radius_xl,
    spacing_size=gr.themes.sizes.spacing_md,
)

EXAMPLE_URLS = [
    "https://arxiv.org/pdf/2106.14834.pdf",
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
]

with gr.Blocks(theme=THEME, title="ZeroGPU PDF OCR") as demo:
    gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")

    with gr.Row():
        with gr.Column(scale=1, min_width=250):
            file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
            lang_in = gr.Dropdown(
                # EasyOCR language codes ("ch_sim" = Simplified Chinese)
                ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "ch_sim", "ja", "ar"],
                multiselect=True,
                value=["en"],
                label="OCR language(s)"
            )
            mode_in = gr.Radio(
                ["native", "ocr", "auto"],
                value="auto",
                label="Document type",
                info="native = text only · ocr = images only · auto = mixed",
            )
            run_btn = gr.Button("Extract", variant="primary")

        with gr.Column(scale=2):
            txt_out = gr.Textbox(
                label="Extracted Text (streaming)",
                lines=18,
                show_copy_button=True,
            )
            download_out = gr.File(label="Download .txt")

    run_btn.click(
        fn=pipeline,
        inputs=[file_in, lang_in, mode_in],
        outputs=[txt_out, download_out],
    )

    gr.Examples(
        EXAMPLE_URLS,
        inputs=file_in,
        label="Quick‑test PDFs",
        fn=None,
    )
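
# Note: streaming from the `pipeline` generator relies on Gradio's request queue,
# which is enabled by default in recent Gradio releases; on older 3.x versions
# call demo.queue() before launching.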

if __name__ == "__main__":
    demo.launch()