# olmocr-demo / app.py
import gradio as gr
import torch
from PyPDF2 import PdfReader
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from ebooklib import epub
import base64
import tempfile
import os
import html

# Load the olmOCR model and the Qwen2-VL processor it was fine-tuned from.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
).eval().to(device)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")


def ocr_page(pdf_path, page_num):
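    """Run olmOCR on a single (0-indexed) page of the PDF and return the extracted text."""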
    # Render the page to a base64 PNG and build the anchored olmOCR prompt.
    image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
    anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            ],
        }
    ]

    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_b64)))

    inputs = processor(text=[prompt_text], images=[main_image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=1024,
            do_sample=True,
        )

    # Decode only the newly generated tokens (everything after the prompt).
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    return decoded[0] if decoded else ""


def convert_pdf_to_epub(pdf_file, title, author, language):
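    """OCR every page of the PDF, assemble the pages into an EPUB, and return the EPUB file path."""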
    # gr.File may supply a filesystem path (str) or a file-like object depending on the
    # Gradio version; normalize to a PDF path on disk either way.
    if isinstance(pdf_file, str):
        tmp_pdf_path = pdf_file
    else:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
            tmp_pdf.write(pdf_file.read())
            tmp_pdf_path = tmp_pdf.name

    reader = PdfReader(tmp_pdf_path)
    num_pages = len(reader.pages)
    # Create the EPUB book and set its metadata.
    book = epub.EpubBook()
    book.set_title(title)
    book.add_author(author)
    book.set_language(language)

    # Use a render of the first page as the cover; the render is a PNG, so name it accordingly.
    cover_image_b64 = render_pdf_to_base64png(tmp_pdf_path, 1, target_longest_image_dim=1024)
    cover_image_bytes = base64.b64decode(cover_image_b64)
    book.set_cover("cover.png", cover_image_bytes)
    # OCR each page and add it as an XHTML chapter.
    for i in range(num_pages):
        text = ocr_page(tmp_pdf_path, i)
        chapter = epub.EpubHtml(title=f"Page {i+1}", file_name=f"page_{i+1}.xhtml", lang=language)
        # Escape the OCR output so stray < or & characters cannot break the XHTML.
        body = html.escape(text).replace("\n", "<br/>")
        chapter.content = f"<h1>Page {i+1}</h1><p>{body}</p>"
        book.add_item(chapter)
        book.spine.append(chapter)
    # Finalize the EPUB with navigation files and write it to a temporary location.
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    epub_path = os.path.join(tempfile.gettempdir(), "output.epub")
    epub.write_epub(epub_path, book, {})
    return epub_path


def interface_fn(pdf, title, author, language):
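    """Gradio callback: convert the uploaded PDF to an EPUB and return its path for download."""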
    return convert_pdf_to_epub(pdf, title, author, language)


demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
        gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
        gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="PDF to EPUB Converter (olmOCR)",
    description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()