# Source captured from a Hugging Face Space file viewer
# (Space status at capture time: Running; file size: 4,020 bytes).
# The viewer's blame-gutter commit hashes and its 1-123 line-number column were
# page chrome rather than program text and have been dropped from this header.
import base64
import html
import os
import re
import uuid
from io import BytesIO

import gradio as gr
import torch
from ebooklib import epub
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from PIL import Image
from PyPDF2 import PdfReader
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
# Set a writable cache directory for HF.
# NOTE: huggingface_hub turns HF_HOME into its cache-path constants when it is
# first imported, and `transformers` has already been imported above. Setting
# the variable at this point therefore does not relocate the default cache by
# itself, so the same directory is also passed explicitly as `cache_dir` to
# every `from_pretrained` call below.
os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
HF_CACHE_DIR = os.path.join(os.environ['HF_HOME'], 'hub')

# Load processor and model (weights are fetched into HF_CACHE_DIR on first run).
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    cache_dir=HF_CACHE_DIR,
)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16,
    cache_dir=HF_CACHE_DIR,
).eval()

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def extract_text_from_page(
    pdf_path,
    page_num,
    *,
    target_longest_image_dim=1024,
    anchor_target_length=4000,
    max_new_tokens=256,
    temperature=0.8,
):
    """Run the OCR model on one PDF page and return the generated text.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Page number handed to the olmocr helpers. The caller in this
            file counts pages from 1, and page 1 is treated as the cover.
        target_longest_image_dim: Forwarded to ``render_pdf_to_base64png`` as
            the size of the rendered page image.
        anchor_target_length: Forwarded to ``get_anchor_text`` as
            ``target_length``.
        max_new_tokens: Upper bound on the number of tokens generated for the
            page. Raise it when page text comes back cut off.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        A ``(text, cover)`` tuple. ``text`` is the decoded model output for
        the page. ``cover`` is the base64-encoded PNG rendering of the page
        when ``page_num == 1`` and ``None`` for every other page.
    """
    # Render the page once; the same base64 payload feeds both the chat
    # message and the PIL image handed to the processor.
    image_base64 = render_pdf_to_base64png(
        pdf_path, page_num, target_longest_image_dim=target_longest_image_dim
    )
    image = Image.open(BytesIO(base64.b64decode(image_base64)))

    # Build the prompt from the page's anchor text.
    anchor_text = get_anchor_text(
        pdf_path,
        page_num,
        pdf_engine="pdfreport",
        target_length=anchor_target_length,
    )
    prompt = build_finetuning_prompt(anchor_text)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            do_sample=True,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_len:]
    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]

    # Only the first page doubles as the cover image.
    cover = image_base64 if page_num == 1 else None
    return decoded, cover
def process_pdf(file, title="Extracted PDF", author="olmOCR", language="en"):
    """Convert an uploaded PDF into a single-chapter EPUB.

    Every page is passed through ``extract_text_from_page``; the results are
    concatenated into one XHTML chapter and the rendering of page 1 is used as
    the cover image.

    Args:
        file: The uploaded PDF. Either an object exposing the path as
            ``.name`` or a plain path string.
        title: EPUB title; also used to derive the output file name. A blank
            value falls back to ``"Extracted PDF"``.
        author: Author recorded in the EPUB metadata.
        language: Language code recorded in the EPUB metadata.

    Returns:
        Path of the EPUB file written under ``/tmp``.

    Raises:
        ValueError: If no file was supplied.
    """
    if file is None:
        raise ValueError("No PDF file was uploaded.")
    file_path = getattr(file, "name", file)

    if not title or not title.strip():
        title = "Extracted PDF"

    reader = PdfReader(file_path)
    num_pages = len(reader.pages)

    page_sections = []
    cover_image_data = None
    for page in range(1, num_pages + 1):
        text, cover_image = extract_text_from_page(file_path, page)
        # OCR output is arbitrary text: escape it so characters such as "<"
        # and "&" cannot corrupt the chapter's XHTML markup.
        page_sections.append(
            f"<h2>Page {page}</h2><p>{html.escape(text, quote=False)}</p>"
        )
        if cover_image and not cover_image_data:
            cover_image_data = cover_image  # base64

    # Build EPUB
    book = epub.EpubBook()
    # Each generated book gets its own identifier instead of a shared constant.
    book.set_identifier(f"urn:uuid:{uuid.uuid4()}")
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    # Add cover image. The rendered page is PNG data, so name it accordingly.
    if cover_image_data:
        book.set_cover("cover.png", base64.b64decode(cover_image_data))

    # Create chapter with all text
    chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
    chapter.content = (
        f"<h1>{html.escape(title, quote=False)}</h1>{''.join(page_sections)}"
    )
    book.add_item(chapter)

    book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
    # EbookLib's navigation classes are EpubNav and EpubNcx.
    book.add_item(epub.EpubNav())
    book.add_item(epub.EpubNcx())
    book.spine = ['nav', chapter]

    # The title is user input: keep only word characters and hyphens so it
    # cannot introduce path separators into the output location.
    safe_name = re.sub(r"[^\w\-]", "_", title)
    epub_path = f"/tmp/{safe_name}.epub"
    epub.write_epub(epub_path, book)
    return epub_path
# Gradio Interface
def _build_interface():
    """Assemble the Gradio UI that wraps ``process_pdf``.

    The input widgets are listed in the same order as the parameters of
    ``process_pdf`` (file, title, author, language); the single output is the
    generated EPUB offered as a download.
    """
    form_fields = [
        gr.File(label="Upload PDF"),
        gr.Textbox(label="EPUB Title", value="Extracted PDF"),
        gr.Textbox(label="Author", value="olmOCR"),
        gr.Textbox(label="Language", value="en"),
    ]
    return gr.Interface(
        fn=process_pdf,
        inputs=form_fields,
        outputs=gr.File(label="Download EPUB"),
        title="olmOCR PDF to EPUB (Full PDF + Cover Image)",
        description="Extract text from ALL pages of a PDF and generate an EPUB with the first page as cover.",
        allow_flagging="never",
    )


iface = _build_interface()

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
# End of file (the trailing "|" left by the web viewer was removed).