olmocr-demo / app.py
leonarb's picture
Update app.py
36bb738 verified
raw
history blame
5.25 kB
import os
import base64
import tempfile
from io import BytesIO
import torch
import gradio as gr
from PIL import Image
from PyPDF2 import PdfReader
from ebooklib import epub
from pdf2image import convert_from_path
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
# Keep every cache and log artefact under /tmp.
# NOTE(review): torch and transformers are imported above, before these
# variables are set -- confirm the libraries still pick the values up.
cache_dir = "/tmp/huggingface_cache"
os.makedirs(cache_dir, exist_ok=True)
os.environ.update(
    {
        "HF_HOME": cache_dir,
        "TORCH_HOME": cache_dir,
        "OLMOCR_LOG_PATH": "/tmp/olmocr-pipeline-debug.log",
    }
)
# Patch logging path before olmocr import
import logging
# Keep a handle on the real class so the replacement can delegate to it.
original_file_handler = logging.FileHandler


class SafeFileHandler(original_file_handler):
    """File handler that redirects the relative olmocr debug log path.

    A handler requested for exactly ``"olmocr-pipeline-debug.log"`` is opened
    at the path given by the ``OLMOCR_LOG_PATH`` environment variable
    (default ``/tmp/olmocr-pipeline-debug.log``) instead. Every other
    filename is passed through unchanged.

    This is a real subclass rather than a wrapper function, so code that
    subclasses ``logging.FileHandler`` or runs ``isinstance`` checks against
    it keeps working after the patch is installed.
    """

    def __init__(self, filename, *args, **kwargs):
        if filename == "olmocr-pipeline-debug.log":
            filename = os.environ.get("OLMOCR_LOG_PATH", "/tmp/olmocr-pipeline-debug.log")
        super().__init__(filename, *args, **kwargs)


# Backward-compatible name: calling it still returns a configured handler.
safe_file_handler = SafeFileHandler

# Install the patch globally; it must be in place before olmocr is imported.
logging.FileHandler = SafeFileHandler
# Import olmocr pipeline after setting log path
from olmocr.pipeline import PDFToTextOCR
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
# Pick the compute device once; bfloat16 is only requested when CUDA is present.
_has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if _has_cuda else "cpu")

# Load the olmOCR checkpoint, switch it to eval mode and move it to the device.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if _has_cuda else torch.float32,
)
model = model.eval().to(device)

# The processor is loaded from the Qwen2-VL instruct repository.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Wrap model and processor in the OCR pipeline object used for whole documents.
olmocr = PDFToTextOCR(model=model, processor=processor)
def ocr_page(pdf_path, page_num):
    """Run the olmOCR model on one page of a PDF and return the generated text.

    Args:
        pdf_path: Path to the PDF file.
        page_num: Page index; 1 is added to it before it is handed to the
            rendering and anchor-text helpers.

    Returns:
        The decoded text of the newly generated tokens, or ``""`` when
        decoding yields nothing.
    """
    page_number = page_num + 1

    # Render the page and collect its anchor text; both feed the prompt.
    page_png_b64 = render_pdf_to_base64png(pdf_path, page_number, target_longest_image_dim=1024)
    anchor = get_anchor_text(pdf_path, page_number, pdf_engine="pdfreport", target_length=4000)
    text_part = {"type": "text", "text": build_finetuning_prompt(anchor)}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{page_png_b64}"},
    }
    chat = [{"role": "user", "content": [text_part, image_part]}]

    # Turn the chat into model inputs and move every tensor to the device.
    chat_text = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    page_image = Image.open(BytesIO(base64.b64decode(page_png_b64)))
    encoded = processor(text=[chat_text], images=[page_image], return_tensors="pt", padding=True)
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    generation_options = {"temperature": 0.8, "max_new_tokens": 1024, "do_sample": True}
    with torch.no_grad():
        generated = model.generate(**model_inputs, **generation_options)

    # Slice off the prompt tokens so only the continuation is decoded.
    continuation = generated[:, model_inputs["input_ids"].shape[1]:]
    texts = processor.tokenizer.batch_decode(continuation, skip_special_tokens=True)
    if not texts:
        return ""
    return texts[0]
def create_epub_from_text(text, output_path, title, author, language, cover_image):
    """Write *text* as a single-chapter EPUB file at *output_path*.

    Args:
        text: Plain text for the chapter body. Blank lines separate
            paragraphs; single line breaks inside a paragraph are kept as
            ``<br/>``. ``None`` is treated as empty text.
        output_path: Destination path of the EPUB file.
        title: Book title; also rendered as the chapter heading.
        author: Author name stored in the book metadata.
        language: Language code stored in the metadata and on the chapter.
        cover_image: Path to the cover image file. A falsy value skips the
            cover instead of failing.
    """
    from html import escape  # local import keeps this block self-contained

    book = epub.EpubBook()

    # Set metadata
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    # Register the image through set_cover so it is declared as the book's
    # cover instead of being an unreferenced manifest item.
    if cover_image:
        with open(cover_image, "rb") as cover_file:
            book.set_cover("cover.jpg", cover_file.read())

    # OCR output is plain text: escape it so characters such as "<" or "&"
    # cannot break the markup, and keep the paragraph structure.
    raw_text = "" if text is None else str(text)
    blocks = (block.strip() for block in raw_text.replace("\r\n", "\n").split("\n\n"))
    paragraphs = "".join(
        "<p>{}</p>".format(escape(block).replace("\n", "<br/>"))
        for block in blocks
        if block
    )

    # Create a chapter for the content
    chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
    chapter.set_content(
        f"<html><body><h1>{escape(str(title))}</h1>{paragraphs}</body></html>"
    )
    book.add_item(chapter)

    # Define Table of Contents (TOC)
    book.toc = (epub.Link("content.xhtml", "Content", "content"),)

    # Add the NCX and navigation documents
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Without a spine the book has no reading order, so the chapter would
    # not be part of the linear content.
    book.spine = ["nav", chapter]

    # Write the EPUB file
    epub.write_epub(output_path, book)
def convert_pdf_to_epub(pdf_file, title, author, language):
    """Convert an uploaded PDF into an EPUB and return the generated file paths.

    The first page is rendered to a JPEG and used as the cover. The OCR text
    comes from the module-level ``olmocr`` pipeline.

    Args:
        pdf_file: The uploaded PDF, either a filesystem path or an object
            whose ``name`` attribute holds the path.
        title: EPUB title.
        author: EPUB author.
        language: EPUB language code.

    Returns:
        A ``(epub_path, cover_path)`` tuple of paths inside a directory
        created for this call.

    Raises:
        ValueError: If no file was provided or no page could be rendered.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # Accept both a plain path string and a file wrapper exposing ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Render the first page to use as the cover.
    pages = convert_from_path(pdf_path, first_page=1, last_page=1)
    if not pages:
        raise ValueError("The PDF has no pages to render.")

    # Use a fresh directory per call so concurrent conversions cannot
    # overwrite each other's cover or EPUB. It is not removed here because
    # the returned paths must outlive this function.
    work_dir = tempfile.mkdtemp(prefix="pdf2epub_")
    cover_path = os.path.join(work_dir, "cover.jpg")
    epub_path = os.path.join(work_dir, "output.epub")
    pages[0].save(cover_path, "JPEG")

    # Run OCR
    ocr_text = olmocr.process(pdf_path)

    # Write EPUB
    create_epub_from_text(
        text=ocr_text,
        output_path=epub_path,
        title=title,
        author=author,
        language=language,
        cover_image=cover_path,
    )
    return epub_path, cover_path
def interface_fn(pdf, title, author, language):
    """Gradio callback: convert the uploaded PDF and return the EPUB path.

    Args:
        pdf: Uploaded PDF as delivered by the file input.
        title: EPUB title.
        author: EPUB author.
        language: EPUB language code.

    Returns:
        Path of the generated EPUB; the cover path is discarded.
    """
    generated_epub, _cover = convert_pdf_to_epub(pdf, title, author, language)
    return generated_epub
# Input widgets, listed in the positional order of interface_fn's parameters.
_demo_inputs = [
    gr.File(label="Upload PDF", file_types=[".pdf"]),
    gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
    gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
    gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
]

# Single output: the generated EPUB offered as a download.
_demo_output = gr.File(label="Download EPUB")

demo = gr.Interface(
    fn=interface_fn,
    inputs=_demo_inputs,
    outputs=_demo_output,
    title="PDF to EPUB Converter (olmOCR)",
    description=(
        "Upload a PDF to convert it into a structured EPUB. "
        "The first page is used as the cover. "
        "OCR is performed with the olmOCR model."
    ),
    allow_flagging="never",
)
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    # NOTE(review): share=True asks Gradio for a public share link -- confirm
    # this is wanted in the environment where the app is deployed.
    demo.launch(share=True)