Spaces:
Running
Running
File size: 4,968 Bytes
e7faf36 af75cff e7faf36 19918ea af75cff d45f3e7 af75cff fff0f58 8be5494 af75cff 89a1632 af75cff 8be5494 b3d319d 36bb738 b3d319d 89a1632 b3d319d 89a1632 af75cff 8be5494 afbaa03 af75cff afbaa03 8be5494 afbaa03 8be5494 19918ea af75cff 19918ea afbaa03 8be5494 d45f3e7 8be5494 afbaa03 8be5494 afbaa03 8be5494 d45f3e7 8be5494 afbaa03 0225b8c fff0f58 afbaa03 10b8e9d afbaa03 a67d3a2 b3d319d a67d3a2 f01e8a4 afbaa03 af75cff f01e8a4 d45f3e7 afbaa03 8be5494 afbaa03 8be5494 f01e8a4 afbaa03 d45f3e7 af75cff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import os
import base64
import tempfile
from io import BytesIO
import torch
import gradio as gr
from PIL import Image
from PyPDF2 import PdfReader
from ebooklib import epub
from pdf2image import convert_from_path
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
# Point the Hugging Face and Torch caches at a writable directory under /tmp.
# NOTE(review): this runs after `torch`/`transformers` were imported above;
# libraries that read these variables at import time may not see the override
# -- confirm, and consider moving this ahead of those imports.
cache_dir = "/tmp/huggingface_cache"
os.makedirs(cache_dir, exist_ok=True)
os.environ.update({"HF_HOME": cache_dir, "TORCH_HOME": cache_dir})
# Patch logging to avoid permission errors
import logging
from logging import FileHandler
class SafeFileHandler(FileHandler):
    """File handler that ignores the requested path and logs to a fixed file.

    The destination is read from the ``OLMOCR_LOG_PATH`` environment variable
    and falls back to ``/tmp/olmocr-pipeline-debug.log``. The ``filename``
    argument is accepted only so the signature matches ``FileHandler``.
    """

    def __init__(self, filename, mode='a', encoding=None, delay=False, errors=None):
        # The caller-supplied filename is deliberately discarded.
        redirected = os.getenv("OLMOCR_LOG_PATH", "/tmp/olmocr-pipeline-debug.log")
        super().__init__(
            redirected,
            mode=mode,
            encoding=encoding,
            delay=delay,
            errors=errors,
        )


# Swap the stdlib class so code that later resolves ``logging.FileHandler``
# gets the redirecting handler instead.
logging.FileHandler = SafeFileHandler
# Now import olmocr
from olmocr.run_ocr import ocr_pdf_to_text
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from olmocr.data.renderpdf import render_pdf_to_base64png
# Load the olmOCR weights and the matching Qwen2-VL processor once at import.
# bfloat16 is used when CUDA is available; otherwise the model stays in float32.
_cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_available else "cpu")
_weights_dtype = torch.bfloat16 if _cuda_available else torch.float32

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=_weights_dtype,
)
# Switch to inference mode, then move the weights to the selected device.
model = model.eval().to(device)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
def ocr_page(pdf_path, page_num):
    """Run the olmOCR model on one page of a PDF and return the decoded text.

    Args:
        pdf_path: Path of the PDF to read.
        page_num: Page index; ``page_num + 1`` is what gets passed to the
            renderer and anchor-text helpers (presumably they are 1-based --
            confirm against olmocr).

    Returns:
        The decoded text of the first generated sequence, or ``""`` when the
        decoder returns nothing.
    """
    page_number = page_num + 1
    png_b64 = render_pdf_to_base64png(pdf_path, page_number, target_longest_image_dim=1024)
    anchor = get_anchor_text(pdf_path, page_number, pdf_engine="pdfreport", target_length=4000)

    # One user turn carrying the text prompt followed by the page image.
    text_part = {"type": "text", "text": build_finetuning_prompt(anchor)}
    image_part = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{png_b64}"}}
    conversation = [{"role": "user", "content": [text_part, image_part]}]

    chat_text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    page_image = Image.open(BytesIO(base64.b64decode(png_b64)))

    batch = processor(text=[chat_text], images=[page_image], return_tensors="pt", padding=True)
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        generated = model.generate(
            **batch,
            temperature=0.8,
            max_new_tokens=1024,
            do_sample=True,
        )

    # generate() returns prompt + completion; keep only the completion tokens.
    prompt_len = batch["input_ids"].shape[1]
    completions = processor.tokenizer.batch_decode(
        generated[:, prompt_len:], skip_special_tokens=True
    )
    if not completions:
        return ""
    return completions[0]
def _render_chapter_html(title, text):
    """Return the chapter markup with *title* and *text* HTML-escaped."""
    from html import escape

    heading = "" if title is None else str(title)
    raw = "" if text is None else str(text)
    raw = raw.replace("\r\n", "\n").replace("\r", "\n")

    # Blank lines separate paragraphs; single newlines become line breaks so
    # the OCR layout is not collapsed into one run of text.
    paragraphs = [chunk.strip() for chunk in raw.split("\n\n") if chunk.strip()]
    body = "".join(
        "<p>{}</p>".format(escape(chunk).replace("\n", "<br/>"))
        for chunk in paragraphs
    )
    return f"<html><body><h1>{escape(heading)}</h1>{body}</body></html>"


def create_epub_from_text(text, output_path, title, author, language, cover_image):
    """Write a single-chapter EPUB containing *text* to *output_path*.

    Args:
        text: Plain text for the chapter body. It is HTML-escaped, so
            characters such as ``<`` and ``&`` cannot break the XHTML.
        output_path: Destination path of the ``.epub`` file.
        title: Book title, also used as the chapter heading.
        author: Author name stored in the book metadata.
        language: Language code stored on the book and the chapter.
        cover_image: Path to an image file that is embedded as ``cover.jpg``.

    Raises:
        OSError: If *cover_image* cannot be opened or read.
    """
    book = epub.EpubBook()
    book.set_title(title)
    book.set_language(language)
    book.add_author(author)

    with open(cover_image, "rb") as cover_file:
        cover_data = cover_file.read()
    cover_item = epub.EpubItem(
        uid="cover",
        file_name="cover.jpg",
        media_type="image/jpeg",
        content=cover_data,
    )
    book.add_item(cover_item)

    chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
    chapter.set_content(_render_chapter_html(title, text))
    book.add_item(chapter)

    book.toc = (epub.Link("content.xhtml", "Content", "content"),)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    # The spine defines the reading order; left unset, the chapter is packaged
    # but never listed for display.
    book.spine = ["nav", chapter]

    epub.write_epub(output_path, book)
def convert_pdf_to_epub(pdf_file, title, author, language):
    """OCR an uploaded PDF and package the text as an EPUB.

    The first page is rendered to a JPEG and embedded as the cover image.
    Each call writes into its own temporary directory so that concurrent
    requests cannot overwrite each other's files.

    Args:
        pdf_file: The upload from the Gradio file input: either a path string
            or an object exposing the path as ``.name``.
        title: Book title.
        author: Author name.
        language: Language code for the book.

    Returns:
        A ``(epub_path, cover_path)`` tuple of file paths.

    Raises:
        gr.Error: If no file was uploaded or the PDF renders no pages.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")

    # Accept both a plain path and a tempfile-like wrapper with ``.name``.
    tmp_pdf_path = getattr(pdf_file, "name", pdf_file)

    # The directory is intentionally not removed here: the returned files must
    # still exist when Gradio serves the download.
    work_dir = tempfile.mkdtemp(prefix="pdf2epub_")
    cover_path = os.path.join(work_dir, "cover.jpg")
    epub_path = os.path.join(work_dir, "output.epub")

    images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
    if not images:
        raise gr.Error("The PDF has no pages to convert.")
    images[0].save(cover_path, "JPEG")

    # Use official AllenAI OCR function
    # NOTE(review): confirm that `olmocr.run_ocr.ocr_pdf_to_text` exists in the
    # installed olmocr version and accepts these keyword arguments.
    ocr_text = ocr_pdf_to_text(
        pdf_path=tmp_pdf_path,
        model=model,
        processor=processor
    )

    create_epub_from_text(
        text=ocr_text,
        output_path=epub_path,
        title=title,
        author=author,
        language=language,
        cover_image=cover_path
    )
    return epub_path, cover_path
def interface_fn(pdf, title, author, language):
    """Gradio callback: convert the uploaded PDF and return only the EPUB path.

    The cover path that ``convert_pdf_to_epub`` also returns is discarded.
    """
    generated_epub, _unused_cover = convert_pdf_to_epub(pdf, title, author, language)
    return generated_epub
# Input widgets, listed in the positional order interface_fn receives them.
_pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
_title_input = gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI")
_author_input = gr.Textbox(label="Author", placeholder="e.g. Allen AI")
_language_input = gr.Textbox(label="Language", placeholder="e.g. en", value="en")

# Single output: the generated EPUB offered as a download.
_epub_output = gr.File(label="Download EPUB")

demo = gr.Interface(
    fn=interface_fn,
    inputs=[_pdf_input, _title_input, _author_input, _language_input],
    outputs=_epub_output,
    title="PDF to EPUB Converter (olmOCR)",
    description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
    allow_flagging="never",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(share=True)
|