Spaces:

leonarb
/

olmocr-demo

Running

App Files Files Community

olmocr-demo / app.py

leonarb

Update app.py

b3d319d verified 26 days ago

raw

history blame

4.97 kB

	import os
	import base64
	import tempfile
	from io import BytesIO

	import torch
	import gradio as gr
	from PIL import Image
	from PyPDF2 import PdfReader
	from ebooklib import epub
	from pdf2image import convert_from_path
	from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

	# Route the Hugging Face and Torch caches to a scratch directory under /tmp.
	# NOTE(review): this runs after the third-party imports at the top of the
	# file -- confirm those libraries still honour the variables when they are
	# exported this late.
	cache_dir = "/tmp/huggingface_cache"
	os.makedirs(cache_dir, exist_ok=True)
	os.environ.update(HF_HOME=cache_dir, TORCH_HOME=cache_dir)

	# Patch logging to avoid permission errors
	import logging
	from logging import FileHandler


	class SafeFileHandler(FileHandler):
	    """File handler that ignores the requested path and logs to a fixed one.

	    The destination is the ``OLMOCR_LOG_PATH`` environment variable when it
	    is set, otherwise ``/tmp/olmocr-pipeline-debug.log``. Every other
	    constructor argument is forwarded unchanged to ``logging.FileHandler``.
	    """

	    def __init__(self, filename, mode='a', encoding=None, delay=False, errors=None):
	        # ``filename`` is accepted for signature compatibility only; it is
	        # never used.
	        redirected = os.getenv("OLMOCR_LOG_PATH", "/tmp/olmocr-pipeline-debug.log")
	        super().__init__(
	            redirected,
	            mode=mode,
	            encoding=encoding,
	            delay=delay,
	            errors=errors,
	        )


	# Replace the stdlib attribute so later ``logging.FileHandler(...)`` lookups
	# resolve to the redirecting subclass.
	logging.FileHandler = SafeFileHandler

	# Now import olmocr
	from olmocr.run_ocr import ocr_pdf_to_text
	from olmocr.prompts import build_finetuning_prompt
	from olmocr.prompts.anchor import get_anchor_text
	from olmocr.data.renderpdf import render_pdf_to_base64png

	# Load model and processor
	# Use the GPU with bfloat16 weights when CUDA is available; otherwise fall
	# back to float32 on the CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	model = Qwen2VLForConditionalGeneration.from_pretrained(
	    "allenai/olmOCR-7B-0225-preview",
	    torch_dtype=torch.bfloat16 if device.type == "cuda" else torch.float32,
	    # HF_HOME is exported only after transformers was imported at the top of
	    # the file, and huggingface_hub resolves its cache location at import
	    # time, so pass the cache directory explicitly to make sure downloads
	    # land in the writable location.
	    cache_dir=cache_dir,
	).eval().to(device)
	processor = AutoProcessor.from_pretrained(
	    "Qwen/Qwen2-VL-7B-Instruct",
	    cache_dir=cache_dir,
	)

	def ocr_page(pdf_path, page_num):
	    """Run the OCR model on one PDF page and return the generated text.

	    Args:
	        pdf_path: Path of the PDF file to read.
	        page_num: Page index; the olmocr helpers are called with
	            ``page_num + 1`` (presumably they are 1-based -- confirm).

	    Returns:
	        The decoded text of the newly generated tokens, or ``""`` when
	        decoding yields nothing.
	    """
	    page_number = page_num + 1
	    page_png_b64 = render_pdf_to_base64png(pdf_path, page_number, target_longest_image_dim=1024)
	    anchor = get_anchor_text(pdf_path, page_number, pdf_engine="pdfreport", target_length=4000)

	    # Single user turn holding the text prompt followed by the page image.
	    user_content = [
	        {"type": "text", "text": build_finetuning_prompt(anchor)},
	        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{page_png_b64}"}},
	    ]
	    chat = [{"role": "user", "content": user_content}]

	    chat_text = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
	    page_image = Image.open(BytesIO(base64.b64decode(page_png_b64)))
	    batch = processor(text=[chat_text], images=[page_image], return_tensors="pt", padding=True)
	    model_inputs = {name: tensor.to(device) for name, tensor in batch.items()}

	    sampling = dict(temperature=0.8, max_new_tokens=1024, do_sample=True)
	    with torch.no_grad():
	        generated = model.generate(**model_inputs, **sampling)

	    # Slice off the first ``prompt_len`` positions so only the tokens after
	    # the prompt are decoded.
	    prompt_len = model_inputs["input_ids"].shape[1]
	    texts = processor.tokenizer.batch_decode(generated[:, prompt_len:], skip_special_tokens=True)
	    if not texts:
	        return ""
	    return texts[0]

	def create_epub_from_text(text, output_path, title, author, language, cover_image):
	    """Write *text* as a single-chapter EPUB file.

	    Args:
	        text: Plain text for the chapter body. Blank lines separate
	            paragraphs; ``None`` is treated as empty.
	        output_path: Where the ``.epub`` file is written.
	        title: Book title, also used as the chapter heading.
	        author: Author name stored in the book metadata.
	        language: Language code stored in the metadata and on the chapter.
	        cover_image: Path to a JPEG used as the cover. A falsy value skips
	            the cover.

	    Raises:
	        OSError: If ``cover_image`` is given but cannot be read.
	    """
	    from html import escape  # local import: only this function needs it

	    book = epub.EpubBook()
	    book.set_title(title)
	    book.set_language(language)
	    book.add_author(author)

	    if cover_image:
	        with open(cover_image, "rb") as cover_file:
	            # set_cover registers the image as the book's cover; adding it
	            # as a plain item only ships the file without marking it.
	            book.set_cover("cover.jpg", cover_file.read())

	    # The text and title are plain strings, not markup: escape them so that
	    # characters such as "<" or "&" cannot break the XHTML document.
	    paragraphs = (chunk.strip() for chunk in (text or "").split("\n\n"))
	    body = "".join(f"<p>{escape(p, quote=False)}</p>" for p in paragraphs if p)
	    heading = escape(title or "", quote=False)

	    chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
	    chapter.set_content(f"<html><body><h1>{heading}</h1>{body}</body></html>")
	    book.add_item(chapter)

	    book.toc = (epub.Link("content.xhtml", "Content", "content"),)
	    book.add_item(epub.EpubNcx())
	    book.add_item(epub.EpubNav())
	    # The spine defines the reading order; without it the chapter is in the
	    # package but is not part of what a reader displays.
	    book.spine = ["nav", chapter]

	    epub.write_epub(output_path, book)

	def convert_pdf_to_epub(pdf_file, title, author, language):
	    """OCR an uploaded PDF and package the recognised text as an EPUB.

	    Args:
	        pdf_file: The upload -- either an object whose ``name`` attribute
	            holds the file path, or the path itself.
	        title: EPUB title.
	        author: EPUB author.
	        language: EPUB language code.

	    Returns:
	        A ``(epub_path, cover_path)`` tuple of the files that were written.

	    Raises:
	        ValueError: If no file was supplied or the first page could not be
	            rendered for the cover.
	    """
	    if pdf_file is None:
	        raise ValueError("No PDF file was provided.")
	    pdf_path = getattr(pdf_file, "name", pdf_file)

	    # Give every call its own directory: fixed names under /tmp would let
	    # concurrent conversions overwrite each other's cover and EPUB.
	    # The directory is not removed here because the caller still needs the
	    # returned files.
	    work_dir = tempfile.mkdtemp(prefix="olmocr_epub_")
	    cover_path = os.path.join(work_dir, "cover.jpg")
	    epub_path = os.path.join(work_dir, "output.epub")

	    # Render only the first page; it becomes the cover image.
	    first_page = convert_from_path(pdf_path, first_page=1, last_page=1)
	    if not first_page:
	        raise ValueError("Could not render the first page of the PDF.")
	    first_page[0].save(cover_path, "JPEG")

	    # NOTE(review): assumes ocr_pdf_to_text returns the whole document as one
	    # string -- confirm against the olmocr API.
	    ocr_text = ocr_pdf_to_text(
	        pdf_path=pdf_path,
	        model=model,
	        processor=processor,
	    )

	    create_epub_from_text(
	        text=ocr_text,
	        output_path=epub_path,
	        title=title,
	        author=author,
	        language=language,
	        cover_image=cover_path,
	    )
	    return epub_path, cover_path

	def interface_fn(pdf, title, author, language):
	    """Gradio callback: convert the uploaded PDF and return the EPUB path."""
	    # convert_pdf_to_epub also returns the cover path; only the EPUB path is
	    # handed back to the interface.
	    return convert_pdf_to_epub(pdf, title, author, language)[0]

	# Gradio UI: a PDF upload plus three metadata text fields in, one EPUB file
	# out.
	demo = gr.Interface(
	    title="PDF to EPUB Converter (olmOCR)",
	    description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
	    fn=interface_fn,
	    inputs=[
	        gr.File(label="Upload PDF", file_types=[".pdf"]),
	        gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
	        gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
	        gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
	    ],
	    outputs=gr.File(label="Download EPUB"),
	    allow_flagging="never",
	)

	# Start the app only when the file is executed as a script.
	# NOTE(review): share=True asks Gradio for a public link -- confirm that is
	# wanted in the hosting environment.
	if __name__ == "__main__":
	    demo.launch(share=True)