Spaces:
Running
Running
File size: 4,372 Bytes
e7faf36 80deca1 e7faf36 afbaa03 19918ea afbaa03 8be5494 d45f3e7 8be5494 afbaa03 1e4eb1b 8be5494 afbaa03 8be5494 afbaa03 8be5494 afbaa03 8be5494 19918ea 8be5494 afbaa03 8be5494 19918ea afbaa03 8be5494 d45f3e7 8be5494 afbaa03 8be5494 afbaa03 8be5494 d45f3e7 8be5494 afbaa03 0225b8c afbaa03 f01e8a4 afbaa03 0225b8c afbaa03 f01e8a4 afbaa03 f01e8a4 afbaa03 0225b8c afbaa03 f01e8a4 afbaa03 f01e8a4 afbaa03 f01e8a4 afbaa03 f01e8a4 d45f3e7 afbaa03 8be5494 afbaa03 8be5494 f01e8a4 afbaa03 d45f3e7 683e2df |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import os
# Redirect the Hugging Face and Torch caches to a directory under /tmp.
# This runs before the ML libraries are imported further down, presumably so
# they see the cache location when they are first loaded.
cache_dir = "/tmp/huggingface_cache"
# Make sure the directory exists; harmless if it is already there.
os.makedirs(cache_dir, exist_ok=True)
os.environ.update({"HF_HOME": cache_dir, "TORCH_HOME": cache_dir})
import base64
import html
import json
import tempfile
from io import BytesIO

import gradio as gr
import torch
from ebooklib import epub
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from PIL import Image
from PyPDF2 import PdfReader
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
# Load the olmOCR model once at import time, on the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    # bfloat16 on CUDA, full float32 precision on CPU.
    torch_dtype=torch.float32 if device.type == "cpu" else torch.bfloat16,
)
# Switch to inference mode, then move the weights to the selected device.
model = model.eval().to(device)
# The processor (tokenizer + image preprocessing) comes from the Qwen2-VL
# instruct checkpoint rather than from the olmOCR repository.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
def _extract_natural_text(raw_output):
    """Return the ``natural_text`` field of an olmOCR reply, else the raw reply.

    olmOCR is prompted to answer with a JSON object whose ``natural_text``
    field carries the page text (see the model card). If the reply is not
    valid JSON (for example because generation was cut off by the token
    limit) or has an unexpected shape, the raw string is returned unchanged
    so no recognised text is lost.
    """
    try:
        payload = json.loads(raw_output)
    except ValueError:
        return raw_output
    if isinstance(payload, dict) and "natural_text" in payload:
        natural_text = payload["natural_text"]
        if natural_text is None:
            # The model reports a page without text as null.
            return ""
        if isinstance(natural_text, str):
            return natural_text
    return raw_output


def ocr_page(pdf_path, page_num):
    """Run olmOCR on a single PDF page and return the recognised text.

    Args:
        pdf_path: Filesystem path of the PDF to read.
        page_num: Zero-based page index; the olmocr helpers are called with
            ``page_num + 1``.

    Returns:
        The page text as a string. When the model reply is JSON, only its
        ``natural_text`` field is returned; otherwise the raw reply is
        returned. An empty string is returned when nothing was decoded.
    """
    page_number = page_num + 1

    # Render the page to a base64 PNG and collect the anchor text that is
    # embedded in the prompt.
    image_b64 = render_pdf_to_base64png(pdf_path, page_number, target_longest_image_dim=1024)
    anchor_text = get_anchor_text(pdf_path, page_number, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            ],
        }
    ]
    prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_b64)))

    inputs = processor(text=[prompt_text], images=[main_image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            temperature=0.8,
            max_new_tokens=1024,
            do_sample=True,
        )

    # Drop the prompt tokens so that only the newly generated tokens are decoded.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    if not decoded:
        return ""
    return _extract_natural_text(decoded[0])
def _resolve_pdf_path(pdf_file):
    """Return ``(path, is_temp_copy)`` for an uploaded PDF.

    Accepts a filesystem path, a readable file-like object (copied to a
    temporary ``.pdf`` file), or an object exposing the path as ``.name``.
    """
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file), False
    if hasattr(pdf_file, "read"):
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
            tmp_pdf.write(pdf_file.read())
        return tmp_pdf.name, True
    name = getattr(pdf_file, "name", None)
    if name:
        return os.fspath(name), False
    raise TypeError(f"Unsupported PDF input: {type(pdf_file).__name__}")


def convert_pdf_to_epub(pdf_file, title, author, language):
    """OCR every page of a PDF and package the result as an EPUB.

    Args:
        pdf_file: The uploaded PDF, given as a filesystem path, a readable
            file-like object, or an object exposing its path as ``.name``.
        title: EPUB title; falls back to "Untitled" when empty.
        author: EPUB author; falls back to "Unknown" when empty.
        language: EPUB language code; falls back to "en" when empty.

    Returns:
        A ``(epub_path, epub_bytes)`` tuple: the path of the written EPUB
        file and its full contents.

    Raises:
        TypeError: If ``pdf_file`` is none of the supported input kinds.
        ValueError: If the PDF contains no pages.
    """
    title = title or "Untitled"
    author = author or "Unknown"
    language = language or "en"

    pdf_path, is_temp_copy = _resolve_pdf_path(pdf_file)
    try:
        num_pages = len(PdfReader(pdf_path).pages)
        if num_pages == 0:
            raise ValueError("The PDF contains no pages to convert.")

        # Create EPUB book
        book = epub.EpubBook()
        book.set_title(title)
        book.add_author(author)
        book.set_language(language)

        # Use the first page as cover. The renderer yields PNG data, so the
        # cover file name carries a .png extension to match.
        cover_image_b64 = render_pdf_to_base64png(pdf_path, 1, target_longest_image_dim=1024)
        book.set_cover("cover.png", base64.b64decode(cover_image_b64))

        # OCR each page and add it as its own chapter.
        chapters = []
        for page_index in range(num_pages):
            page_label = f"Page {page_index + 1}"
            text = ocr_page(pdf_path, page_index)
            chapter = epub.EpubHtml(
                title=page_label,
                file_name=f"page_{page_index + 1}.xhtml",
                lang=language,
            )
            # Escape the OCR text: a stray "<" or "&" would otherwise produce
            # malformed XHTML inside the chapter.
            chapter.content = f"<h1>{page_label}</h1><p>{html.escape(text, quote=False)}</p>"
            book.add_item(chapter)
            chapters.append(chapter)

        # Populate the table of contents so the NCX/Nav documents list the pages.
        book.toc = tuple(chapters)
        book.spine.extend(chapters)

        # Finalize EPUB
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        # Write into a fresh directory per call so that concurrent
        # conversions cannot overwrite each other's output file.
        output_dir = tempfile.mkdtemp(prefix="pdf2epub_")
        epub_path = os.path.join(output_dir, "output.epub")
        epub.write_epub(epub_path, book, {})
    finally:
        if is_temp_copy:
            # Best-effort cleanup of the temporary PDF copy.
            try:
                os.remove(pdf_path)
            except OSError:
                pass

    with open(epub_path, "rb") as f:
        return epub_path, f.read()
def interface_fn(pdf, title, author, language):
    """Gradio callback: convert the uploaded PDF and return the EPUB path.

    Args:
        pdf: The value of the "Upload PDF" file input; ``None`` when the
            user submitted without uploading a file.
        title: Contents of the "EPUB Title" textbox.
        author: Contents of the "Author" textbox.
        language: Contents of the "Language" textbox.

    Returns:
        Path of the generated EPUB file, for the file download output.

    Raises:
        gr.Error: If no PDF was uploaded.
    """
    if pdf is None:
        # Show a readable message in the UI instead of failing deep inside
        # the conversion.
        raise gr.Error("Please upload a PDF file before converting.")
    # Only the path is needed here; the EPUB bytes are discarded.
    epub_path, _ = convert_pdf_to_epub(pdf, title, author, language)
    return epub_path
# Gradio UI: one PDF upload plus three metadata fields in, one EPUB file out.
demo = gr.Interface(
    title="PDF to EPUB Converter (olmOCR)",
    description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
    fn=interface_fn,
    # The order of the inputs matches the positional parameters of interface_fn.
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
        gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
        gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
    ],
    outputs=gr.File(label="Download EPUB"),
    allow_flagging="never",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(share=True)
|