import os
import json
import html
import base64
from io import BytesIO

import torch
import gradio as gr
from PIL import Image
from PyPDF2 import PdfReader
from ebooklib import epub
from pdf2image import convert_from_path
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
# Set cache and log paths
cache_dir = "/tmp/huggingface_cache"
os.environ["HF_HOME"] = cache_dir
os.environ["TORCH_HOME"] = cache_dir
os.environ["OLMOCR_LOG_PATH"] = "/tmp/olmocr-pipeline-debug.log"
os.makedirs(cache_dir, exist_ok=True)
# Patch logging path before olmocr import
import logging

original_file_handler = logging.FileHandler

def safe_file_handler(filename, *args, **kwargs):
    if filename == "olmocr-pipeline-debug.log":
        filename = os.environ.get("OLMOCR_LOG_PATH", "/tmp/olmocr-pipeline-debug.log")
    return original_file_handler(filename, *args, **kwargs)

logging.FileHandler = safe_file_handler
# Import olmocr helpers after setting the log path
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
# Load model and processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
).eval().to(device)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
def ocr_page(pdf_path, page_num):
    """Run olmOCR on a single (0-indexed) page and return the raw model output."""
    # Render the page to an image and pull anchor text to build the olmOCR prompt
    image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
    anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)
    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ],
    }]
    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_b64)))
    inputs = processor(text=[prompt_text], images=[main_image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=1024,
            do_sample=True,
        )
    # Strip the prompt tokens and decode only the newly generated text
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    return decoded[0] if decoded else ""
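
# The olmOCR checkpoint used above typically emits a small JSON object per page
# (fields such as "natural_text" alongside layout metadata). The helper below is a
# minimal, optional post-processing sketch under that assumption: it tries to pull
# out "natural_text" and falls back to the raw decoded string if the output is not
# valid JSON. The name extract_natural_text is our own, not part of olmocr.
def extract_natural_text(raw_output):
    try:
        parsed = json.loads(raw_output)
        if isinstance(parsed, dict) and parsed.get("natural_text"):
            return parsed["natural_text"]
    except (json.JSONDecodeError, TypeError):
        pass
    return raw_output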
def create_epub_from_text(text, output_path, title, author, language, cover_image):
    book = epub.EpubBook()
    # Set metadata
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)
    # Add cover image (registered as the EPUB cover, not just an embedded file)
    with open(cover_image, "rb") as cover_file:
        book.set_cover("cover.jpg", cover_file.read(), create_page=False)
    # Create a chapter for the content, escaping the OCR text and preserving line breaks
    body = "".join(f"<p>{html.escape(line)}</p>" for line in text.splitlines() if line.strip())
    chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
    chapter.set_content(f"<html><body><h1>{html.escape(title)}</h1>{body}</body></html>")
    book.add_item(chapter)
    # Define Table of Contents (TOC)
    book.toc = (epub.Link("content.xhtml", "Content", "content"),)
    # Add default NCX and Nav files, and give the book a reading order (spine)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav", chapter]
    # Write the EPUB file
    epub.write_epub(output_path, book)
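
# For reference, a standalone call would look roughly like the following
# (hypothetical paths, shown purely for illustration):
#
#   create_epub_from_text(
#       text="Page one text\nPage two text",
#       output_path="/tmp/demo.epub",
#       title="Demo Book",
#       author="Unknown",
#       language="en",
#       cover_image="/tmp/some_cover.jpg",
#   )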
def convert_pdf_to_epub(pdf_file, title, author, language):
    tmp_pdf_path = pdf_file.name
    # Read the PDF for the page count and render the first page as the cover
    reader = PdfReader(tmp_pdf_path)
    num_pages = len(reader.pages)
    cover_path = "/tmp/cover.jpg"
    images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
    images[0].save(cover_path, "JPEG")
    # Run OCR page by page with the olmOCR model
    page_texts = []
    for page_num in range(num_pages):
        raw = ocr_page(tmp_pdf_path, page_num)
        page_texts.append(extract_natural_text(raw))
    ocr_text = "\n\n".join(page_texts)
    # Write EPUB
    epub_path = "/tmp/output.epub"
    create_epub_from_text(
        text=ocr_text,
        output_path=epub_path,
        title=title,
        author=author,
        language=language,
        cover_image=cover_path,
    )
    return epub_path, cover_path
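
# Note: pdf2image's convert_from_path relies on the Poppler utilities being installed
# on the system. On a Hugging Face Space this usually means listing "poppler-utils"
# in a packages.txt file at the repo root (an assumption about this Space's setup;
# adjust to however your environment provides Poppler).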
def interface_fn(pdf, title, author, language):
    epub_path, _ = convert_pdf_to_epub(pdf, title, author, language)
    return epub_path
demo = gr.Interface(
    fn=interface_fn,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
        gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
        gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
    ],
    outputs=gr.File(label="Download EPUB"),
    title="PDF to EPUB Converter (olmOCR)",
    description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
    allow_flagging="never",
)
if __name__ == "__main__":
    # share=True is ignored when running inside a Hugging Face Space
    demo.launch(share=True)