# olmocr-demo / app.py
import gradio as gr
import torch
from PyPDF2 import PdfReader
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from ebooklib import epub
import base64
import tempfile
import os
import html

# Load the olmOCR model and the Qwen2-VL processor it was fine-tuned from.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
).eval().to(device)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")


def ocr_page(pdf_path, page_num):
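    """Run olmOCR on a single (0-indexed) page of the PDF and return the extracted text."""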
    # Render the page to a base64 PNG and build the anchored olmOCR prompt.
    image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
    anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            ],
        }
    ]

    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_b64)))

    inputs = processor(text=[prompt_text], images=[main_image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=1024,
            do_sample=True,
        )

    # Decode only the newly generated tokens (everything after the prompt).
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    return decoded[0] if decoded else ""


def convert_pdf_to_epub(pdf_file, title, author, language):
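    """OCR every page of the PDF, assemble the pages into an EPUB, and return the EPUB file path."""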
    # gr.File may supply a filesystem path (str) or a file-like object depending on the
    # Gradio version; normalize to a PDF path on disk either way.
    if isinstance(pdf_file, str):
        tmp_pdf_path = pdf_file
    else:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
            tmp_pdf.write(pdf_file.read())
            tmp_pdf_path = tmp_pdf.name

    reader = PdfReader(tmp_pdf_path)
    num_pages = len(reader.pages)
    # Create the EPUB book and set its metadata.
    book = epub.EpubBook()
    book.set_title(title)
    book.add_author(author)
    book.set_language(language)

    # Use a render of the first page as the cover; the render is a PNG, so name it accordingly.
    cover_image_b64 = render_pdf_to_base64png(tmp_pdf_path, 1, target_longest_image_dim=1024)
    cover_image_bytes = base64.b64decode(cover_image_b64)
    book.set_cover("cover.png", cover_image_bytes)
    # OCR each page and add it as an XHTML chapter.
    for i in range(num_pages):
        text = ocr_page(tmp_pdf_path, i)
        chapter = epub.EpubHtml(title=f"Page {i+1}", file_name=f"page_{i+1}.xhtml", lang=language)
        # Escape the OCR output so stray < or & characters cannot break the XHTML.
        body = html.escape(text).replace("\n", "<br/>")
        chapter.content = f"<h1>Page {i+1}</h1><p>{body}</p>"
        book.add_item(chapter)
        book.spine.append(chapter)
    # Finalize the EPUB with navigation files and write it to a temporary location.
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    epub_path = os.path.join(tempfile.gettempdir(), "output.epub")
    epub.write_epub(epub_path, book, {})
    return epub_path


def interface_fn(pdf, title, author, language):
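    """Gradio callback: convert the uploaded PDF to an EPUB and return its path for download."""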
    return convert_pdf_to_epub(pdf, title, author, language)


demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
        gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
        gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="PDF to EPUB Converter (olmOCR)",
    description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()