leonarb committed on
Commit
d5f7d0d
·
verified ·
1 Parent(s): 84e3794

Fixes EPUB output

Browse files
Files changed (1) hide show
  1. app.py +52 -128
app.py CHANGED
@@ -1,146 +1,70 @@
1
  import gradio as gr
2
- import torch
3
- import base64
4
- import fitz # PyMuPDF
5
  from io import BytesIO
6
  from PIL import Image
7
- from pathlib import Path
8
- from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
9
-
10
- from olmocr.data.renderpdf import render_pdf_to_base64png
11
- from olmocr.prompts.anchor import get_anchor_text
12
-
13
  from ebooklib import epub
14
-
15
- # Load model and processor
16
- model = Qwen2VLForConditionalGeneration.from_pretrained(
17
- "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
18
- ).eval()
19
- processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
20
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
- model.to(device)
22
 
23
  def process_pdf_to_epub(pdf_file, title, author):
24
- pdf_path = pdf_file.name
25
- doc = fitz.open(pdf_path)
26
- num_pages = len(doc)
27
 
28
- # Create EPUB book
29
  book = epub.EpubBook()
30
  book.set_identifier("id123456")
31
- book.set_title(title)
32
- book.add_author(author)
33
-
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  chapters = []
35
-
36
- for i in range(num_pages):
37
- page_num = i + 1
38
- print(f"Processing page {page_num}...")
39
-
40
- try:
41
- # Render page to base64 image
42
- image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
43
- anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
44
- print(f"Anchor text for page {page_num}: {anchor_text}")
45
-
46
- # New prompt format
47
- prompt = (
48
- "Below is the image of one page of a document, as well as some raw textual content that was previously "
49
- "extracted for it. Just return the plain text representation of this document as if you were reading it naturally.\n"
50
- "Do not hallucinate.\n"
51
- "RAW_TEXT_START\n"
52
- f"{anchor_text}\n"
53
- "RAW_TEXT_END"
54
- )
55
-
56
- messages = [
57
- {
58
- "role": "user",
59
- "content": [
60
- {"type": "text", "text": prompt},
61
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
62
- ],
63
- }
64
- ]
65
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
66
- image = Image.open(BytesIO(base64.b64decode(image_base64)))
67
-
68
- inputs = processor(
69
- text=[text],
70
- images=[image],
71
- padding=True,
72
- return_tensors="pt",
73
- )
74
- inputs = {k: v.to(device) for k, v in inputs.items()}
75
-
76
- output = model.generate(
77
- **inputs,
78
- temperature=0.8,
79
- max_new_tokens=512,
80
- num_return_sequences=1,
81
- do_sample=True,
82
- )
83
-
84
- prompt_length = inputs["input_ids"].shape[1]
85
- new_tokens = output[:, prompt_length:].detach().cpu()
86
-
87
- decoded = "[No output generated]"
88
- if new_tokens is not None and new_tokens.shape[1] > 0:
89
- try:
90
- decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
91
- decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
92
- except Exception as decode_error:
93
- decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
94
- else:
95
- decoded = "[Model returned no new tokens]"
96
-
97
- except Exception as processing_error:
98
- decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
99
-
100
- print(f"Decoded content for page {page_num}: {decoded}")
101
-
102
- # Create chapter
103
- chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
104
- chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
105
  book.add_item(chapter)
106
  chapters.append(chapter)
107
 
108
- # Save cover image from page 1
109
- if page_num == 1:
110
- cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
111
- cover_io = BytesIO()
112
- cover_image.save(cover_io, format='PNG')
113
- book.set_cover("cover.png", cover_io.getvalue())
114
-
115
- # Assemble EPUB
116
- book.toc = tuple(chapters)
117
  book.add_item(epub.EpubNcx())
118
  book.add_item(epub.EpubNav())
119
- book.spine = ['nav'] + chapters
120
-
121
- output_path = "/tmp/output.epub"
122
- epub.write_epub(output_path, book)
123
- return output_path
124
 
125
- # Gradio Interface
126
- iface = gr.Interface(
127
- fn=process_pdf_to_epub,
128
- inputs=[
129
- gr.File(label="Upload PDF", file_types=[".pdf"]),
130
- gr.Textbox(label="EPUB Title"),
131
- gr.Textbox(label="Author(s)")
132
- ],
133
- outputs=gr.File(label="Download EPUB"),
134
- title="PDF to EPUB Converter (with olmOCR)",
135
- description="Uploads a PDF, extracts text from each page with vision + prompt, and builds an EPUB using the outputs. Sets the first page as cover.",
136
- allow_flagging="never"
137
- )
 
 
 
 
 
 
 
138
 
139
  if __name__ == "__main__":
140
- iface.launch(
141
- server_name="0.0.0.0",
142
- server_port=7860,
143
- share=True,
144
- debug=True,
145
- allowed_paths=["/tmp"]
146
- )
 
1
  import gradio as gr
2
+ import tempfile
 
 
3
  from io import BytesIO
4
  from PIL import Image
 
 
 
 
 
 
5
  from ebooklib import epub
6
+ from olmocr.model import process_pdf # your forked olmocr model
 
 
 
 
 
 
 
7
 
8
  def process_pdf_to_epub(pdf_file, title, author):
9
+ # Run the OCR + LLM pipeline from olmocr
10
+ print("Starting PDF processing...")
11
+ page_results = process_pdf(pdf_file.name)
12
 
13
+ # Create the EPUB book
14
  book = epub.EpubBook()
15
  book.set_identifier("id123456")
16
+ book.set_title(title if title else "Untitled Document")
17
+ book.set_language("en")
18
+ if author:
19
+ book.add_author(author)
20
+
21
+ # Try to use the first page as cover
22
+ try:
23
+ with Image.open(pdf_file.name) as img:
24
+ img.convert("RGB").save("cover.jpg", "JPEG")
25
+ with open("cover.jpg", "rb") as f:
26
+ cover_data = f.read()
27
+ book.set_cover("cover.jpg", cover_data)
28
+ except Exception as e:
29
+ print("Could not generate cover:", e)
30
+
31
+ # Add chapters from pages
32
  chapters = []
33
+ for i, page in enumerate(page_results):
34
+ text = page.get("decoded", {}).get("natural_text", "")
35
+ if not text.strip():
36
+ continue
37
+ chapter = epub.EpubHtml(title=f"Page {i+1}", file_name=f"page_{i+1}.xhtml", lang="en")
38
+ chapter.content = f"<h1>Page {i+1}</h1><p>{text.replace('\n', '<br/>')}</p>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  book.add_item(chapter)
40
  chapters.append(chapter)
41
 
42
+ # Define spine and table of contents
43
+ book.toc = chapters
44
+ book.spine = ["nav"] + chapters
 
 
 
 
 
 
45
  book.add_item(epub.EpubNcx())
46
  book.add_item(epub.EpubNav())
 
 
 
 
 
47
 
48
+ # Write to in-memory buffer
49
+ output_buffer = BytesIO()
50
+ epub.write_epub(output_buffer, book)
51
+ output_buffer.seek(0)
52
+ return output_buffer
53
+
54
+ with gr.Blocks() as demo:
55
+ gr.Markdown("# PDF to EPUB Converter\nPowered by `olmOCR`")
56
+ with gr.Row():
57
+ pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
58
+ title = gr.Textbox(label="EPUB Title", placeholder="Optional title")
59
+ author = gr.Textbox(label="Author", placeholder="Optional author name")
60
+ convert_button = gr.Button("Convert to EPUB")
61
+ epub_output = gr.File(label="Download EPUB", file_types=[".epub"])
62
+
63
+ convert_button.click(
64
+ fn=process_pdf_to_epub,
65
+ inputs=[pdf_input, title, author],
66
+ outputs=epub_output
67
+ )
68
 
69
  if __name__ == "__main__":
70
+ demo.launch()