"""Gradio app that converts an uploaded PDF into an EPUB using olmOCR.

The first PDF page is rendered to a JPEG and used as the EPUB cover; the
text produced by the OCR pipeline becomes a single EPUB chapter.
"""

import base64
import html
import logging
import os
import re
import tempfile
from io import BytesIO

import torch
import gradio as gr
from PIL import Image
from PyPDF2 import PdfReader
from ebooklib import epub
from pdf2image import convert_from_path
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Set cache and log paths
cache_dir = "/tmp/huggingface_cache"
os.environ["HF_HOME"] = cache_dir
os.environ["TORCH_HOME"] = cache_dir
os.environ["OLMOCR_LOG_PATH"] = "/tmp/olmocr-pipeline-debug.log"
os.makedirs(cache_dir, exist_ok=True)

# Patch logging path before olmocr import.
# A real subclass (rather than a plain function) is installed so that
# `logging.FileHandler` remains a class: isinstance checks and any later
# subclassing of `logging.FileHandler` keep working.
original_file_handler = logging.FileHandler


class _SafeFileHandler(original_file_handler):
    """FileHandler that redirects the relative olmocr debug log path."""

    def __init__(self, filename, *args, **kwargs):
        if filename == "olmocr-pipeline-debug.log":
            filename = os.environ.get(
                "OLMOCR_LOG_PATH", "/tmp/olmocr-pipeline-debug.log"
            )
        super().__init__(filename, *args, **kwargs)


# Keep the original module-level name available.
safe_file_handler = _SafeFileHandler
logging.FileHandler = safe_file_handler

# Import olmocr pipeline after setting log path
from olmocr.pipeline import PDFToTextOCR  # noqa: E402
from olmocr.data.renderpdf import render_pdf_to_base64png  # noqa: E402
from olmocr.prompts import build_finetuning_prompt  # noqa: E402
from olmocr.prompts.anchor import get_anchor_text  # noqa: E402

# Load model and processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
).eval().to(device)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Load OCR pipeline
olmocr = PDFToTextOCR(model=model, processor=processor)


def ocr_page(pdf_path, page_num):
    """Run the model on a single PDF page and return the decoded text.

    Args:
        pdf_path: Path of the PDF file on disk.
        page_num: Zero-based page index; the olmocr helpers are called with
            ``page_num + 1``.

    Returns:
        The decoded text of the newly generated tokens, or ``""`` when
        nothing was decoded.
    """
    image_b64 = render_pdf_to_base64png(
        pdf_path, page_num + 1, target_longest_image_dim=1024
    )
    anchor_text = get_anchor_text(
        pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000
    )
    prompt = build_finetuning_prompt(anchor_text)

    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{image_b64}"},
            },
        ],
    }]
    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    main_image = Image.open(BytesIO(base64.b64decode(image_b64)))

    inputs = processor(
        text=[prompt_text], images=[main_image], return_tensors="pt", padding=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=1024,
            do_sample=True,
        )

    # Drop the prompt tokens so only the generated continuation is decoded.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    return decoded[0] if decoded else ""


def _text_to_xhtml(text):
    """Convert plain text to escaped ``<p>`` paragraphs.

    Blank lines separate paragraphs; single newlines become ``<br/>``.
    The text is HTML-escaped so characters such as ``<`` and ``&`` cannot
    break the chapter markup.
    """
    blocks = (block.strip() for block in re.split(r"\n\s*\n", text or ""))
    paragraphs = [
        "<p>" + html.escape(block).replace("\n", "<br/>") + "</p>"
        for block in blocks
        if block
    ]
    # Always return some markup so the chapter body is never empty.
    return "\n".join(paragraphs) or "<p></p>"


def create_epub_from_text(text, output_path, title, author, language, cover_image):
    """Write a single-chapter EPUB containing ``text``.

    Args:
        text: Plain text for the chapter body.
        output_path: Destination path of the EPUB file.
        title: Book title.
        author: Book author.
        language: Language code stored in the metadata (e.g. ``"en"``).
        cover_image: Path of a JPEG used as the cover; a falsy value skips
            the cover.
    """
    book = epub.EpubBook()

    # Set metadata
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    # Add cover image, registered as the book cover via set_cover.
    if cover_image:
        with open(cover_image, "rb") as cover_file:
            cover_data = cover_file.read()
        book.set_cover("cover.jpg", cover_data, create_page=False)

    # Create a chapter for the content
    chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
    chapter.set_content(_text_to_xhtml(text))
    book.add_item(chapter)

    # Define Table of Contents (TOC)
    book.toc = (epub.Link("content.xhtml", "Content", "content"),)

    # Add NCX and Nav navigation files
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # The spine defines the reading order.
    book.spine = ["nav", chapter]

    # Write the EPUB file
    epub.write_epub(output_path, book)


def convert_pdf_to_epub(pdf_file, title, author, language):
    """Convert an uploaded PDF to an EPUB.

    Args:
        pdf_file: Either a filesystem path string or an object exposing the
            path as ``.name``.
        title: EPUB title; falls back to the PDF file name when empty.
        author: EPUB author; falls back to ``"Unknown"`` when empty.
        language: EPUB language code; falls back to ``"en"`` when empty.

    Returns:
        A ``(epub_path, cover_path)`` tuple.

    Raises:
        ValueError: If no file was supplied or the PDF has no pages.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    tmp_pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Read PDF to make sure there is a first page to use as cover
    reader = PdfReader(tmp_pdf_path)
    if len(reader.pages) == 0:
        raise ValueError("The PDF contains no pages.")

    # Per-request directory so concurrent conversions do not overwrite each
    # other's files. It is not removed here because the returned paths must
    # still exist after this function returns.
    work_dir = tempfile.mkdtemp(prefix="pdf2epub_")

    cover_path = os.path.join(work_dir, "cover.jpg")
    images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
    images[0].save(cover_path, "JPEG")

    # Run OCR
    ocr_text = olmocr.process(tmp_pdf_path)

    # Write EPUB
    epub_path = os.path.join(work_dir, "output.epub")
    default_title = os.path.splitext(os.path.basename(tmp_pdf_path))[0]
    create_epub_from_text(
        text=ocr_text,
        output_path=epub_path,
        title=(title or "").strip() or default_title,
        author=(author or "").strip() or "Unknown",
        language=(language or "").strip() or "en",
        cover_image=cover_path,
    )
    return epub_path, cover_path


def interface_fn(pdf, title, author, language):
    """Gradio callback: convert the uploaded PDF and return the EPUB path."""
    if pdf is None:
        # Surface a readable message in the UI instead of a traceback.
        raise gr.Error("Please upload a PDF file.")
    epub_path, _ = convert_pdf_to_epub(pdf, title, author, language)
    return epub_path


demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
        gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
        gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="PDF to EPUB Converter (olmOCR)",
    description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch(share=True)