File size: 7,991 Bytes
1ea570a 6583fc2 1ea570a 6583fc2 1ea570a 6583fc2 1ea570a 6583fc2 1ea570a 6583fc2 1ea570a 6583fc2 1ea570a 6583fc2 1ea570a 6583fc2 1ea570a 6583fc2 1ea570a 6583fc2 1ea570a 6583fc2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
"""
📚 ZeroGPU Multilingual PDF Text Extractor
=========================================
Features
--------
• **Native / OCR / Auto (hybrid)** modes
• **Language chooser** (multiselect) with EasyOCR model caching
• **ZeroGPU** pay‑as‑you‑go: GPU allocated *only* while OCR runs
• **Streamed output** page‑by‑page + real‑time progress bar
• **Download‑as‑TXT** button
• Basic **error handling** (oversize PDF, CUDA OOM, unsupported language)
Maintained as a single file (`app.py`) for simplicity.
"""
import os, tempfile, concurrent.futures, itertools, functools, uuid
from typing import List, Tuple
import fitz # PyMuPDF
import pdfplumber
import torch
import gradio as gr
import spaces # HF Spaces helper (for ZeroGPU)
import easyocr
# ----------------------------------------------------------------------
# Caching for EasyOCR readers (models are heavy; reuse them)
# ----------------------------------------------------------------------
# Process-wide cache: sorted tuple of language codes -> EasyOCR reader.
_READERS = {}


def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader":
    """Return the EasyOCR reader for ``lang_codes``, building it on first use.

    Readers are cached under the *sorted* tuple of codes, so the order in
    which the languages are passed does not create duplicate readers.

    Args:
        lang_codes: Language codes handed to ``easyocr.Reader``.

    Returns:
        The cached (or freshly created) reader.

    Raises:
        gr.Error: When ``easyocr.Reader`` raises ``ValueError``; the original
            message is forwarded to the UI.
    """
    cache_key = tuple(sorted(lang_codes))

    # Fast path: a reader for this language set already exists.
    if cache_key in _READERS:
        return _READERS[cache_key]

    try:
        new_reader = easyocr.Reader(list(cache_key), gpu=torch.cuda.is_available())
    except ValueError as e:
        raise gr.Error(str(e))

    _READERS[cache_key] = new_reader
    return new_reader
# ----------------------------------------------------------------------
# GPU‑decorated OCR worker (runs ONLY when called)
# ----------------------------------------------------------------------
@spaces.GPU(duration=600)
def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]:
    """OCR the requested pages of a PDF.

    Each page is rasterised with PyMuPDF, written to a temporary PNG and fed
    to the EasyOCR reader for ``lang_codes``. Up to four pages are processed
    concurrently.

    Args:
        pdf_path: Path of the PDF file on disk.
        page_ids: 1-based page numbers to OCR.
        lang_codes: Language codes used to obtain the EasyOCR reader.

    Returns:
        ``(page_number, text)`` tuples sorted by page number, where ``text``
        is the recognised lines joined with newlines.

    Raises:
        gr.Error: Propagated from ``get_reader`` when the language selection
            is rejected.
    """
    import threading  # local import: only needed by this function

    reader = get_reader(lang_codes)

    # PyMuPDF documents must not be used from several threads at once, so
    # page access and rasterisation are serialised; only the OCR step itself
    # runs in parallel.
    render_lock = threading.Lock()
    results = []

    # The context manager guarantees the document is closed even if a page fails.
    with fitz.open(pdf_path) as doc:

        def ocr_single(idx: int) -> Tuple[int, str]:
            """Rasterise page ``idx`` (1-based) and return ``(idx, text)``."""
            img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png")
            try:
                with render_lock:
                    pg = doc[idx - 1]
                    # Pages whose longer side is at most 600 pt are rendered
                    # at 2x, larger pages at 1.5x to limit image size.
                    max_side = max(pg.rect.width, pg.rect.height)
                    scale = 2 if max_side <= 600 else 1.5
                    try:
                        pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale))
                    except RuntimeError:
                        # Fall back to the default resolution if scaling fails.
                        pix = pg.get_pixmap()
                    pix.save(img_path)

                # Single language: request confidences and drop weak lines.
                if len(lang_codes) == 1:
                    detections = reader.readtext(img_path, detail=1)
                    txt_lines = [text for _, text, conf in detections if conf > 0.2]
                else:
                    txt_lines = reader.readtext(img_path, detail=0)
            finally:
                # Always remove the temporary image, even when OCR raised.
                try:
                    os.remove(img_path)
                except FileNotFoundError:
                    pass
            return idx, "\n".join(txt_lines)

        # Light parallelism (GPU friendly)
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
            futures = [ex.submit(ocr_single, i) for i in page_ids]
            for fut in concurrent.futures.as_completed(futures):
                results.append(fut.result())

    # Futures finish in arbitrary order; hand pages back in document order.
    results.sort(key=lambda item: item[0])
    return results
# ----------------------------------------------------------------------
# Native text extractor helper
# ----------------------------------------------------------------------
def extract_native(pdf_path: str, x_tol: float = 1) -> List[Tuple[int, str]]:
    """Read the embedded text layer of every page with pdfplumber.

    Args:
        pdf_path: Path of the PDF file on disk.
        x_tol: Value forwarded to ``extract_text`` as ``x_tolerance``.

    Returns:
        One ``(page_number, text)`` tuple per page, numbered from 1. Pages
        for which ``extract_text`` returns nothing yield an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        return [
            (page_no, page.extract_text(x_tolerance=x_tol) or "")
            for page_no, page in enumerate(pdf.pages, start=1)
        ]
# ----------------------------------------------------------------------
# Main pipeline (Gradio generator)
# ----------------------------------------------------------------------
def pipeline(pdf_file, langs, mode, progress=gr.Progress()):
    """Extract text from an uploaded PDF and stream it to the UI.

    This is a generator used as a Gradio event handler. Every yield is a
    ``(text, file)`` pair: ``text`` is everything extracted so far, in page
    order, and ``file`` is ``None`` until the final yield, which carries the
    path of a UTF-8 ``.txt`` file holding the complete result.

    Args:
        pdf_file: The uploaded PDF, either a path string or an object whose
            ``.name`` attribute is the path.
        langs: One OCR language code or a list of them.
        mode: ``"native"`` (text layer only), ``"ocr"`` (OCR every page) or
            ``"auto"`` (text layer first, OCR for pages without text).
        progress: Gradio progress tracker; the default instance lets Gradio
            inject a live tracker when the function runs as an event handler.

    Yields:
        ``(text_so_far, None)`` after each page that produced text, then
        ``(final_text_or_warning, txt_path)`` once at the end.

    Raises:
        gr.Error: No file uploaded, file larger than 200 MB, OCR needed but
            no language selected, or OCR failing twice with ``RuntimeError``.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF.")

    # The upload may arrive as a plain path or as a file-like wrapper.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Guard: size limit 200 MB
    max_size = 200 * 1024 * 1024
    if os.path.getsize(pdf_path) > max_size:
        raise gr.Error("PDF larger than 200 MB. Please split the document.")

    if langs is None:
        langs = []
    elif not isinstance(langs, (list, tuple)):
        langs = [langs]
    lang_tuple = tuple(langs)

    native_pages = extract_native(pdf_path) if mode in ("native", "auto") else []
    if native_pages:
        total_pages = len(native_pages)
    else:
        # Close the document right after reading the page count.
        with fitz.open(pdf_path) as doc:
            total_pages = doc.page_count

    # page number -> formatted chunk; keyed so output can be assembled in
    # page order even though OCR pages arrive after the native ones.
    page_chunks = {}

    def assemble() -> str:
        return "".join(page_chunks[page_no] for page_no in sorted(page_chunks))

    # Pass 1: native text layer, streamed page by page.
    pending_ocr = []
    for idx in range(1, total_pages + 1):
        native_txt = native_pages[idx - 1][1] if idx <= len(native_pages) else ""
        if native_txt.strip():
            page_chunks[idx] = f"--- Page {idx} (native) ---\n{native_txt}\n"
            yield assemble(), None
        elif mode in ("ocr", "auto"):
            pending_ocr.append(idx)
        progress((idx, total_pages), desc="Scanning pages")

    # Pass 2: OCR for the pages that had no usable text layer.
    if pending_ocr:
        if not lang_tuple:
            raise gr.Error("Please select at least one OCR language.")
        progress(0.0, desc=f"Running OCR on {len(pending_ocr)} page(s)")
        try:
            ocr_results = run_ocr(pdf_path, pending_ocr, lang_tuple)
        except RuntimeError:
            # Possibly a transient GPU failure (e.g. CUDA OOM): retry once,
            # then report the problem in the UI instead of crashing.
            try:
                ocr_results = run_ocr(pdf_path, pending_ocr, lang_tuple)
            except RuntimeError as err:
                raise gr.Error(f"OCR failed: {err}") from err

        ocr_results = sorted(ocr_results, key=lambda item: item[0])
        for done, (idx, text) in enumerate(ocr_results, start=1):
            if text.strip():
                page_chunks[idx] = f"--- Page {idx} (OCR) ---\n{text}\n"
                yield assemble(), None
            progress((done, len(ocr_results)), desc="Collecting OCR text")

    combined_text = assemble()

    # Write the download file once, at the end, so no handle stays open if
    # extraction fails midway. Binary mode keeps the bytes platform-independent.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp_txt:
        tmp_txt.write(combined_text.encode("utf-8"))
        tmp_txt_path = tmp_txt.name

    # Final yield includes download-file
    yield combined_text or "⚠️ No text detected in the document.", tmp_txt_path
# ----------------------------------------------------------------------
# Gradio Blocks UI
# ----------------------------------------------------------------------
# App theme: purple accent, large corner radius, medium spacing.
# NOTE(review): `radius_xl` may not be defined in every Gradio release —
# confirm against the pinned version. Falling back to `radius_lg` keeps the
# module importable when it is missing and leaves the look unchanged when
# it exists.
THEME = gr.themes.Base(
    primary_hue="purple",
    radius_size=getattr(gr.themes.sizes, "radius_xl", gr.themes.sizes.radius_lg),
    spacing_size=gr.themes.sizes.spacing_md,
)

# Sample PDFs offered in the "Quick-test PDFs" examples panel.
EXAMPLE_URLS = [
    "https://arxiv.org/pdf/2106.14834.pdf",
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
]
with gr.Blocks(theme=THEME, title="ZeroGPU PDF OCR") as demo:
    gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor")

    with gr.Row():
        # Left column: upload and extraction settings.
        with gr.Column(scale=1, min_width=250):
            file_in = gr.File(label="Upload PDF", file_types=[".pdf"])
            lang_in = gr.Dropdown(
                # Values are EasyOCR language codes; Simplified Chinese is
                # "ch_sim" there ("zh_cn" is rejected by EasyOCR).
                ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "ch_sim", "ja", "ar"],
                multiselect=True,
                value=["en"],
                label="OCR language(s)",
            )
            mode_in = gr.Radio(
                ["native", "ocr", "auto"],
                value="auto",
                label="Document type",
                info="native = text only · ocr = images only · auto = mixed",
            )
            run_btn = gr.Button("Extract", variant="primary")

        # Right column: streamed text and the downloadable result.
        with gr.Column(scale=2):
            txt_out = gr.Textbox(
                label="Extracted Text (streaming)",
                lines=18,
                show_copy_button=True,
            )
            download_out = gr.File(label="Download .txt")

    # `pipeline` is a generator, so the outputs update on every yield.
    run_btn.click(
        fn=pipeline,
        inputs=[file_in, lang_in, mode_in],
        outputs=[txt_out, download_out],
    )

    # Clicking an example only fills the file input; nothing runs (fn=None).
    gr.Examples(
        EXAMPLE_URLS,
        inputs=file_in,
        label="Quick‑test PDFs",
        fn=None,
    )

if __name__ == "__main__":
    demo.launch()
|