leonarb committed on
Commit
5827499
·
verified ·
1 Parent(s): b3d319d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -111
app.py CHANGED
@@ -1,137 +1,119 @@
1
- import os
 
2
  import base64
3
- import tempfile
4
  from io import BytesIO
5
-
6
- import torch
7
- import gradio as gr
8
  from PIL import Image
9
- from PyPDF2 import PdfReader
10
- from ebooklib import epub
11
- from pdf2image import convert_from_path
12
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
13
 
14
- # Set cache and log paths
15
- cache_dir = "/tmp/huggingface_cache"
16
- os.environ["HF_HOME"] = cache_dir
17
- os.environ["TORCH_HOME"] = cache_dir
18
- os.makedirs(cache_dir, exist_ok=True)
19
-
20
- # Patch logging to avoid permission errors
21
- import logging
22
- from logging import FileHandler
23
- class SafeFileHandler(FileHandler):
24
- def __init__(self, filename, mode='a', encoding=None, delay=False, errors=None):
25
- # Redirect all logs to tmp
26
- safe_path = os.environ.get("OLMOCR_LOG_PATH", "/tmp/olmocr-pipeline-debug.log")
27
- super().__init__(safe_path, mode, encoding, delay, errors)
28
- logging.FileHandler = SafeFileHandler
29
-
30
- # Now import olmocr
31
- from olmocr.run_ocr import ocr_pdf_to_text
32
  from olmocr.prompts import build_finetuning_prompt
33
  from olmocr.prompts.anchor import get_anchor_text
34
- from olmocr.data.renderpdf import render_pdf_to_base64png
 
35
 
36
  # Load model and processor
37
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
38
  model = Qwen2VLForConditionalGeneration.from_pretrained(
39
- "allenai/olmOCR-7B-0225-preview",
40
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
41
- ).eval().to(device)
42
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 
 
43
 
44
- def ocr_page(pdf_path, page_num):
45
- image_b64 = render_pdf_to_base64png(pdf_path, page_num + 1, target_longest_image_dim=1024)
46
- anchor_text = get_anchor_text(pdf_path, page_num + 1, pdf_engine="pdfreport", target_length=4000)
47
- prompt = build_finetuning_prompt(anchor_text)
48
-
49
- messages = [{
50
- "role": "user",
51
- "content": [
52
- {"type": "text", "text": prompt},
53
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
54
- ],
55
- }]
56
-
57
- prompt_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
58
- main_image = Image.open(BytesIO(base64.b64decode(image_b64)))
59
- inputs = processor(text=[prompt_text], images=[main_image], return_tensors="pt", padding=True)
60
- inputs = {k: v.to(device) for k, v in inputs.items()}
61
-
62
- with torch.no_grad():
63
- outputs = model.generate(
64
- **inputs,
65
- temperature=0.8,
66
- max_new_tokens=1024,
67
- do_sample=True,
68
- )
69
-
70
- prompt_len = inputs["input_ids"].shape[1]
71
- new_tokens = outputs[:, prompt_len:]
72
- decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
73
- return decoded[0] if decoded else ""
74
-
75
- def create_epub_from_text(text, output_path, title, author, language, cover_image):
76
  book = epub.EpubBook()
 
77
  book.set_title(title)
78
- book.set_language(language)
79
  book.add_author(author)
80
 
81
- with open(cover_image, "rb") as cover_file:
82
- cover_data = cover_file.read()
83
- cover_item = epub.EpubItem(uid="cover", file_name="cover.jpg", media_type="image/jpeg", content=cover_data)
84
- book.add_item(cover_item)
85
-
86
- chapter = epub.EpubHtml(title="Content", file_name="content.xhtml", lang=language)
87
- chapter.set_content(f"<html><body><h1>{title}</h1><p>{text}</p></body></html>")
88
- book.add_item(chapter)
89
- book.toc = (epub.Link("content.xhtml", "Content", "content"),)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  book.add_item(epub.EpubNav())
 
 
 
91
  epub.write_epub(output_path, book)
 
92
 
93
- def convert_pdf_to_epub(pdf_file, title, author, language):
94
- tmp_pdf_path = pdf_file.name
95
- reader = PdfReader(tmp_pdf_path)
96
- cover_path = "/tmp/cover.jpg"
97
- images = convert_from_path(tmp_pdf_path, first_page=1, last_page=1)
98
- images[0].save(cover_path, "JPEG")
99
-
100
- # Use official AllenAI OCR function
101
- ocr_text = ocr_pdf_to_text(
102
- pdf_path=tmp_pdf_path,
103
- model=model,
104
- processor=processor
105
- )
106
-
107
- epub_path = "/tmp/output.epub"
108
- create_epub_from_text(
109
- text=ocr_text,
110
- output_path=epub_path,
111
- title=title,
112
- author=author,
113
- language=language,
114
- cover_image=cover_path
115
- )
116
- return epub_path, cover_path
117
-
118
- def interface_fn(pdf, title, author, language):
119
- epub_path, _ = convert_pdf_to_epub(pdf, title, author, language)
120
- return epub_path
121
-
122
- demo = gr.Interface(
123
- fn=interface_fn,
124
  inputs=[
125
  gr.File(label="Upload PDF", file_types=[".pdf"]),
126
- gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
127
- gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
128
- gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
129
  ],
130
  outputs=gr.File(label="Download EPUB"),
131
- title="PDF to EPUB Converter (olmOCR)",
132
- description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
133
- allow_flagging="never",
134
  )
135
 
136
  if __name__ == "__main__":
137
- demo.launch(share=True)
 
1
+ import gradio as gr
2
+ import torch
3
  import base64
4
+ import fitz # PyMuPDF
5
  from io import BytesIO
 
 
 
6
  from PIL import Image
7
+ from pathlib import Path
 
 
8
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
9
 
10
+ from olmocr.data.renderpdf import render_pdf_to_base64png
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  from olmocr.prompts import build_finetuning_prompt
12
  from olmocr.prompts.anchor import get_anchor_text
13
+
14
+ from ebooklib import epub
15
 
16
  # Load model and processor
 
17
  model = Qwen2VLForConditionalGeneration.from_pretrained(
18
+ "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
19
+ ).eval()
 
20
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
21
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22
+ model.to(device)
23
 
24
+ def process_pdf_to_epub(pdf_file, title, author):
25
+ pdf_path = pdf_file.name
26
+ doc = fitz.open(pdf_path)
27
+ num_pages = len(doc)
28
+
29
+ # Create EPUB book
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  book = epub.EpubBook()
31
+ book.set_identifier("id123456")
32
  book.set_title(title)
 
33
  book.add_author(author)
34
 
35
+ chapters = []
36
+
37
+ for i in range(num_pages):
38
+ page_num = i + 1
39
+
40
+ try:
41
+ # Render page to base64 image
42
+ image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
43
+ anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
44
+ prompt = build_finetuning_prompt(anchor_text)
45
+
46
+ # Format prompt
47
+ messages = [
48
+ {
49
+ "role": "user",
50
+ "content": [
51
+ {"type": "text", "text": prompt},
52
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
53
+ ],
54
+ }
55
+ ]
56
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
57
+ image = Image.open(BytesIO(base64.b64decode(image_base64)))
58
+
59
+ inputs = processor(
60
+ text=[text],
61
+ images=[image],
62
+ padding=True,
63
+ return_tensors="pt",
64
+ )
65
+ inputs = {k: v.to(device) for k, v in inputs.items()}
66
+
67
+ output = model.generate(
68
+ **inputs,
69
+ temperature=0.8,
70
+ max_new_tokens=512,
71
+ num_return_sequences=1,
72
+ do_sample=True,
73
+ )
74
+
75
+ prompt_length = inputs["input_ids"].shape[1]
76
+ new_tokens = output[:, prompt_length:]
77
+ decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
78
+
79
+ except Exception as e:
80
+ decoded = f"[Error processing page {page_num}: {str(e)}]"
81
+
82
+ # Create chapter
83
+ chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
84
+ chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
85
+ book.add_item(chapter)
86
+ chapters.append(chapter)
87
+
88
+ # Save cover image from page 1
89
+ if page_num == 1:
90
+ cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
91
+ cover_io = BytesIO()
92
+ cover_image.save(cover_io, format='PNG')
93
+ book.set_cover("cover.png", cover_io.getvalue())
94
+
95
+ # Assemble EPUB
96
+ book.toc = tuple(chapters)
97
+ book.add_item(epub.EpubNcx())
98
  book.add_item(epub.EpubNav())
99
+ book.spine = ['nav'] + chapters
100
+
101
+ output_path = "/tmp/output.epub"
102
  epub.write_epub(output_path, book)
103
+ return output_path
104
 
105
+ # Gradio Interface
106
+ iface = gr.Interface(
107
+ fn=process_pdf_to_epub,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  inputs=[
109
  gr.File(label="Upload PDF", file_types=[".pdf"]),
110
+ gr.Textbox(label="EPUB Title"),
111
+ gr.Textbox(label="Author(s)")
 
112
  ],
113
  outputs=gr.File(label="Download EPUB"),
114
+ title="PDF to EPUB Converter (with olmOCR)",
115
+ description="Uploads a PDF, extracts text from each page with vision + prompt, and builds an EPUB using the outputs. Sets the first page as cover."
 
116
  )
117
 
118
  if __name__ == "__main__":
119
+ iface.launch()