olmocr-demo / app.py
leonarb's picture
Update app.py
af75cff verified
raw
history blame
4.24 kB
import os
import base64
import tempfile
from io import BytesIO
import torch
import gradio as gr
from PIL import Image
from PyPDF2 import PdfReader
from ebooklib import epub
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
# Set Hugging Face and Torch cache to a guaranteed-writable location
cache_dir = "/tmp/huggingface_cache"
os.environ["HF_HOME"] = cache_dir
os.environ["TORCH_HOME"] = cache_dir
os.makedirs(cache_dir, exist_ok=True)
# Load model and processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(
"allenai/olmOCR-7B-0225-preview",
torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
).eval().to(device)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
def ocr_page(pdf_path, page_num):
image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
prompt = build_finetuning_prompt(anchor_text)
messages = [{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
],
}]
prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
main_image = Image.open(BytesIO(base64.b64decode(image_b64)))
inputs = processor(text=[prompt_text], images=[main_image], return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.no_grad():
outputs = model.generate(
**inputs,
temperature=0.8,
max_new_tokens=1024,
do_sample=True,
)
prompt_len = inputs["input_ids"].shape[1]
new_tokens = outputs[:, prompt_len:]
decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
return decoded[0] if decoded else ""
def convert_pdf_to_epub(pdf_file, title, author, language):
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
tmp_pdf.write(pdf_file.read())
tmp_pdf_path = tmp_pdf.name
reader = PdfReader(tmp_pdf_path)
num_pages = len(reader.pages)
book = epub.EpubBook()
book.set_title(title)
book.add_author(author)
book.set_language(language)
# Set cover from page 1
cover_image_b64 = render_pdf_to_base64png(tmp_pdf_path, 1, target_longest_image_dim=1024)
cover_image_bytes = base64.b64decode(cover_image_b64)
book.set_cover("cover.jpg", cover_image_bytes)
# Add OCR'd pages as chapters
for i in range(num_pages):
text = ocr_page(tmp_pdf_path, i)
chapter = epub.EpubHtml(title=f"Page {i+1}", file_name=f"page_{i+1}.xhtml", lang=language)
chapter.content = f"<h1>Page {i+1}</h1><p>{text}</p>"
book.add_item(chapter)
book.spine.append(chapter)
# Finalize EPUB
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
epub_path = os.path.join(tempfile.gettempdir(), "output.epub")
epub.write_epub(epub_path, book, {})
with open(epub_path, "rb") as f:
return epub_path, f.read()
def interface_fn(pdf, title, author, language):
epub_path, _ = convert_pdf_to_epub(pdf, title, author, language)
return epub_path
demo = gr.Interface(
fn=interface_fn,
inputs=[
gr.File(label="Upload PDF", file_types=[".pdf"]),
gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
],
outputs=gr.File(label="Download EPUB"),
title="PDF to EPUB Converter (olmOCR)",
description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
allow_flagging="never",
)
if __name__ == "__main__":
demo.launch(share=True)