""" 📚 ZeroGPU Multilingual PDF Text Extractor ========================================= Features -------- • **Native / OCR / Hybrid** modes • **Language chooser** (multiselect) with EasyOCR model caching • **ZeroGPU** pay‑as‑you‑go: GPU allocated *only* while OCR runs • **Streamed output** page‑by‑page + real‑time progress bar • **Download‑as‑TXT** button • Basic **error handling** (oversize PDF, CUDA OOM, unsupported language) Maintained as a single file (`app.py`) for simplicity. """ import os, tempfile, concurrent.futures, itertools, functools, uuid from typing import List, Tuple import fitz # PyMuPDF import pdfplumber import torch import gradio as gr import spaces # HF Spaces helper (for ZeroGPU) import easyocr # ---------------------------------------------------------------------- # Caching for EasyOCR readers (models are heavy; reuse them) # ---------------------------------------------------------------------- _READERS = {} def get_reader(lang_codes: Tuple[str, ...]) -> "easyocr.Reader": key = tuple(sorted(lang_codes)) if key not in _READERS: try: _READERS[key] = easyocr.Reader(list(key), gpu=torch.cuda.is_available()) except ValueError as e: raise gr.Error(str(e)) return _READERS[key] # ---------------------------------------------------------------------- # GPU‑decorated OCR worker (runs ONLY when called) # ---------------------------------------------------------------------- @spaces.GPU(duration=600) def run_ocr(pdf_path: str, page_ids: List[int], lang_codes: Tuple[str, ...]) -> List[Tuple[int, str]]: """OCR designated pages and return list[(page_num, text)].""" reader = get_reader(lang_codes) doc = fitz.open(pdf_path) results = [] def ocr_single(idx: int): pg = doc[idx - 1] # Adaptive rasterisation scale (A4 ~= 595 × 842 pt) max_side = max(pg.rect.width, pg.rect.height) scale = 2 if max_side <= 600 else 1.5 try: pix = pg.get_pixmap(matrix=fitz.Matrix(scale, scale)) except RuntimeError: # Fallback lower dpi if page too huge pix = pg.get_pixmap() img_path = os.path.join(tempfile.gettempdir(), f"ocr_{uuid.uuid4().hex}.png") pix.save(img_path) # Single-language ⇒ use detail=1 to filter low‑confidence lines if len(lang_codes) == 1: tmp = reader.readtext(img_path, detail=1) txt_lines = [text for _, text, conf in tmp if conf > 0.2] else: txt_lines = reader.readtext(img_path, detail=0) os.remove(img_path) return idx, "\n".join(txt_lines) # Light parallelism (GPU friendly) with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex: futures = {ex.submit(ocr_single, i): i for i in page_ids} for fut in concurrent.futures.as_completed(futures): results.append(fut.result()) return results # ---------------------------------------------------------------------- # Native text extractor helper # ---------------------------------------------------------------------- def extract_native(pdf_path: str, x_tol: float = 1) -> List[Tuple[int, str]]: with pdfplumber.open(pdf_path) as pdf: out = [] for idx, page in enumerate(pdf.pages, start=1): txt = page.extract_text(x_tolerance=x_tol) or "" out.append((idx, txt)) return out # ---------------------------------------------------------------------- # Main pipeline (Gradio generator) # ---------------------------------------------------------------------- def pipeline(pdf_file, langs, mode): if pdf_file is None: raise gr.Error("Please upload a PDF.") # Guard: size limit 200 MB max_size = 200 * 1024 * 1024 if os.path.getsize(pdf_file.name) > max_size: raise gr.Error("PDF larger than 200 MB. Please split the document.") langs = langs if isinstance(langs, list) else [langs] lang_tuple = tuple(langs) native_chunks, ocr_chunks = [], [] combined_text = "" # Create a temporary TXT file for incremental writing (download button) tmp_txt = tempfile.NamedTemporaryFile(delete=False, suffix=".txt") tmp_txt_path = tmp_txt.name # Progress bar context with gr.Progress(track_tqdm=False) as prog: native_pages = extract_native(pdf_file.name) if mode in ("native", "auto") else [] total_pages = len(native_pages) if native_pages else fitz.open(pdf_file.name).page_count prog.tqdm(total=total_pages) # Process pages one by one (stream output) pending_ocr = [] for idx in range(1, total_pages + 1): native_txt = "" if mode in ("native", "auto"): native_txt = native_pages[idx - 1][1] if native_txt.strip(): chunk = f"--- Page {idx} (native) ---\n{native_txt}\n" native_chunks.append(chunk) combined_text += chunk tmp_txt.write(chunk.encode("utf-8")) yield combined_text, None else: if mode == "auto": pending_ocr.append(idx) elif mode == "ocr": pending_ocr.append(idx) prog.update(advance=1) # OCR if needed if pending_ocr: try: ocr_results = run_ocr(pdf_file.name, pending_ocr, lang_tuple) except RuntimeError as e: # Likely CUDA OOM → retry at lower dpi ocr_results = run_ocr(pdf_file.name, pending_ocr, lang_tuple) for idx, text in sorted(ocr_results, key=lambda x: x[0]): if text.strip(): chunk = f"--- Page {idx} (OCR) ---\n{text}\n" ocr_chunks.append(chunk) combined_text += chunk tmp_txt.write(chunk.encode("utf-8")) yield combined_text, None tmp_txt.close() # Final yield includes download‑file yield combined_text or "⚠️ No text detected in the document.", tmp_txt_path # ---------------------------------------------------------------------- # Gradio Blocks UI # ---------------------------------------------------------------------- THEME = gr.themes.Base( primary_hue="purple", radius_size=gr.themes.sizes.radius_xl, spacing_size=gr.themes.sizes.spacing_md, ) EXAMPLE_URLS = [ "https://arxiv.org/pdf/2106.14834.pdf", "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" ] with gr.Blocks(theme=THEME, title="ZeroGPU PDF OCR") as demo: gr.Markdown("## 📚 ZeroGPU Multilingual PDF Text Extractor") with gr.Row(): with gr.Column(scale=1, min_width=250): file_in = gr.File(label="Upload PDF", file_types=[".pdf"]) lang_in = gr.Dropdown( ["en", "nl", "de", "fr", "es", "it", "pt", "ru", "zh_cn", "ja", "ar"], multiselect=True, value=["en"], label="OCR language(s)" ) mode_in = gr.Radio( ["native", "ocr", "auto"], value="auto", label="Document type", info="native = text only · ocr = images only · auto = mixed", ) run_btn = gr.Button("Extract", variant="primary") with gr.Column(scale=2): txt_out = gr.Textbox( label="Extracted Text (streaming)", lines=18, show_copy_button=True, ) download_out = gr.File(label="Download .txt") run_btn.click( fn=pipeline, inputs=[file_in, lang_in, mode_in], outputs=[txt_out, download_out], ) gr.Examples( EXAMPLE_URLS, inputs=file_in, label="Quick‑test PDFs", fn=None, ) if __name__ == "__main__": demo.launch()