Spaces:

leonarb
/

olmocr-demo

Running

App Files Files Community

olmocr-demo / app.py

leonarb

Update app.py

f01e8a4 verified 26 days ago

raw

history blame

3.31 kB

	import base64
	import html
	import os
	import re
	from io import BytesIO

	import gradio as gr
	import torch
	from ebooklib import epub
	from PIL import Image
	from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

	from olmocr.data.renderpdf import render_pdf_to_base64png
	from olmocr.prompts import build_finetuning_prompt
	from olmocr.prompts.anchor import get_anchor_text

	# Point Hugging Face's cache at a directory under /tmp.
	# NOTE(review): huggingface_hub appears to read HF_HOME when it is first
	# imported; transformers is already imported above, so this assignment may
	# come too late to take effect -- confirm, and if so move it above the imports.
	os.environ["HF_HOME"] = "/tmp/.cache/huggingface"

	# The processor comes from the base Qwen2-VL checkpoint, while the weights
	# are the olmOCR checkpoint loaded in bfloat16.
	processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

	# Use the GPU when one is visible, otherwise fall back to the CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	model = Qwen2VLForConditionalGeneration.from_pretrained(
	    "allenai/olmOCR-7B-0225-preview",
	    torch_dtype=torch.bfloat16,
	)
	model.eval()  # inference mode; eval() returns the same module
	model.to(device)

	def _epub_file_stem(title):
	    """Return *title* with every run of file-name-unsafe characters replaced by ``_``."""
	    # Only word characters, dots and dashes survive, so the title can never
	    # introduce a path separator into the output path.
	    return re.sub(r"[^\w.-]+", "_", title) or "untitled"


	def _write_epub(text, page, title, author, language):
	    """Write a single-chapter EPUB containing *text* under /tmp and return its path."""
	    book = epub.EpubBook()
	    book.set_identifier("id123456")
	    book.set_title(title)
	    book.set_language(language)
	    book.add_author(author)

	    chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
	    # Escape both values: the chapter body is XHTML, so a raw "<" or "&" in
	    # the title or in the model output would produce broken markup.
	    safe_title = html.escape(title, quote=False)
	    safe_text = html.escape(text, quote=False)
	    chapter.content = f"<h1>{safe_title}</h1><p>{safe_text}</p>"
	    book.add_item(chapter)

	    book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
	    # ebooklib names these classes EpubNav / EpubNcx.
	    book.add_item(epub.EpubNav())
	    book.add_item(epub.EpubNcx())
	    book.spine = ['nav', chapter]

	    epub_path = f"/tmp/{_epub_file_stem(title)}_page_{page}.epub"
	    epub.write_epub(epub_path, book)
	    return epub_path


	def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language="en"):
	    """Run olmOCR on one page of an uploaded PDF and save the output as an EPUB.

	    The page is rendered to a PNG, combined with the anchor-text prompt, and
	    passed to the module-level ``model``; the decoded completion becomes the
	    body of a one-chapter EPUB.

	    Args:
	        file: The uploaded PDF, either a path string or an object whose
	            ``name`` attribute holds the path.
	        page: Page number forwarded to the olmocr helpers; coerced to ``int``.
	        title: EPUB title, also used for the chapter heading and file name.
	        author: Author recorded in the EPUB metadata.
	        language: Language code recorded in the EPUB metadata.

	    Returns:
	        Path of the EPUB file written under ``/tmp``.

	    Raises:
	        gr.Error: If no file was uploaded or ``page`` is not a whole number.
	    """
	    if file is None:
	        raise gr.Error("Please upload a PDF file.")
	    # Accept both a plain path and a file-like wrapper exposing ``.name``.
	    file_path = getattr(file, "name", file)

	    # A numeric input can arrive as a float (or None when the field is
	    # cleared); normalise it so the helpers and the output file name get an int.
	    try:
	        page = int(page)
	    except (TypeError, ValueError) as err:
	        raise gr.Error("Page number must be a whole number.") from err

	    image_base64 = render_pdf_to_base64png(file_path, page, target_longest_image_dim=1024)
	    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

	    anchor_text = get_anchor_text(file_path, page, pdf_engine="pdfreport", target_length=4000)
	    prompt = build_finetuning_prompt(anchor_text)

	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "text", "text": prompt},
	                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
	            ],
	        }
	    ]

	    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    inputs = processor(text=[text], images=[main_image], return_tensors="pt", padding=True)
	    inputs = {k: v.to(device) for k, v in inputs.items()}

	    # Sampling is enabled, so the output can differ between runs.
	    with torch.no_grad():
	        output = model.generate(
	            **inputs,
	            temperature=0.8,
	            max_new_tokens=256,
	            num_return_sequences=1,
	            do_sample=True,
	        )

	    # generate() returns prompt + completion; keep only the completion tokens.
	    prompt_len = inputs["input_ids"].shape[1]
	    new_tokens = output[:, prompt_len:]
	    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]

	    return _write_epub(decoded, page, title, author, language)

	# Gradio Interface
	# One form: a PDF upload plus page number and EPUB metadata in, the
	# generated EPUB file out. Inputs are passed positionally to process_pdf.
	iface = gr.Interface(
	    fn=process_pdf,
	    inputs=[
	        gr.File(label="Upload PDF"),
	        # precision=0 makes Gradio round the value and hand it over as an
	        # int, so the page number never reaches process_pdf as a float.
	        gr.Number(value=1, label="Page Number", precision=0),
	        gr.Textbox(value="Extracted Page", label="EPUB Title"),
	        gr.Textbox(value="olmOCR", label="Author"),
	        gr.Textbox(value="en", label="Language"),
	    ],
	    outputs=gr.File(label="Download EPUB"),
	    title="olmOCR PDF to EPUB",
	    description="Extract text from a selected page of a PDF and download it as an EPUB file.",
	    allow_flagging="never",
	)

	# Start the web server only when the file is run as a script.
	if __name__ == "__main__":
	    iface.launch()