leonarb committed on
Commit
59ff001
·
verified ·
1 Parent(s): 6a0411c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -37
app.py CHANGED
@@ -1,64 +1,139 @@
1
- import os
2
- import tempfile
 
 
 
 
3
  from pathlib import Path
 
 
 
 
4
 
5
- import gradio as gr
6
  from ebooklib import epub
7
- from olmocr import process_pdf # your forked olmocr model
8
 
9
def process_pdf_to_epub(pdf_path, title="Untitled", author="Unknown"):
    """Convert a PDF into an EPUB with one chapter per page that yielded text.

    Args:
        pdf_path: Location of the PDF, either a path string or an object
            whose ``name`` attribute is the path. It is handed to
            ``process_pdf`` after that normalisation.
        title: Title stored in the EPUB metadata.
        author: Author stored in the EPUB metadata.

    Returns:
        Path of the generated ``.epub`` file, which is left on disk.

    Raises:
        ValueError: If ``pdf_path`` is ``None`` or no page produced text.
    """
    # Function-scope import: only this function needs it and the module's
    # import block does not bring ``html`` into scope.
    import html

    if pdf_path is None:
        raise ValueError("No PDF file was provided.")
    # Accept an uploaded-file object as well as a plain path.
    pdf_path = getattr(pdf_path, "name", pdf_path)

    print(f"Processing PDF: {pdf_path}")

    book = epub.EpubBook()
    # NOTE(review): every generated book shares this identifier; consider a
    # per-book UUID if reading systems need to tell the books apart.
    book.set_identifier("id123456")
    book.set_title(title)
    book.set_language("en")
    book.add_author(author)

    chapters = []

    # The scratch directory is removed as soon as the results are consumed.
    with tempfile.TemporaryDirectory() as output_dir:
        results = process_pdf(pdf_path, output_dir)

        # Iterate inside the ``with`` in case the results are produced
        # lazily from files in the scratch directory.
        for page_num, result in enumerate(results, start=1):
            # ``or {}`` also covers an explicit ``None`` under the key.
            text = (result.get("decoded_content") or {}).get("natural_text", "")
            if not text:
                continue

            chapter = epub.EpubHtml(
                title=f"Page {page_num}",
                file_name=f"page_{page_num}.xhtml",
                lang="en",
            )
            # Escape first so '<' and '&' in the OCR text cannot break the
            # XHTML, then turn line breaks into explicit <br/> tags.
            safe_text = html.escape(text).replace("\n", "<br/>")
            chapter.content = f"<h1>Page {page_num}</h1><p>{safe_text}</p>"
            book.add_item(chapter)
            chapters.append(chapter)

            print(f"Processed page {page_num}")

    if not chapters:
        raise ValueError("No content extracted from PDF.")

    book.toc = tuple(chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav"] + chapters

    # Reserve a unique file name, then close the handle before the EPUB
    # writer reopens the path by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
        output_path = tmp.name
    epub.write_epub(output_path, book)
    print(f"EPUB written to {output_path}")
    return output_path
48
-
49
# Gradio UI: three input widgets, one output widget, and the Interface that
# routes them through process_pdf_to_epub.
title_input = gr.Textbox(label="EPUB Title", value="Untitled")
author_input = gr.Textbox(label="Author", value="Unknown")
file_input = gr.File(label="Upload PDF", file_types=[".pdf"])

output_file = gr.File(label="Download EPUB")

iface = gr.Interface(
    fn=process_pdf_to_epub,
    inputs=[file_input, title_input, author_input],
    outputs=output_file,
    title="PDF to EPUB Converter with olmOCR",
    # The conversion function does not set a cover, so the description no
    # longer promises one.
    description="Upload a PDF to convert it into an EPUB. Each page with extracted text becomes a chapter.",
)
63
 
64
  if __name__ == "__main__":
 
1
+ import gradio as gr
2
+ import torch
3
+ import base64
4
+ import fitz # PyMuPDF
5
+ from io import BytesIO
6
+ from PIL import Image
7
  from pathlib import Path
8
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
9
+
10
+ from olmocr.data.renderpdf import render_pdf_to_base64png
11
+ from olmocr.prompts.anchor import get_anchor_text
12
 
 
13
  from ebooklib import epub
 
14
 
15
# Pick the compute device: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the olmOCR checkpoint in bfloat16 and put it in evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16,
)
model = model.eval()

# The processor comes from the Qwen2-VL instruct repository.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Move the model's weights onto the selected device.
model.to(device)
22
+
23
def _build_ocr_prompt(anchor_text):
    """Return the instruction prompt that wraps a page's raw anchor text."""
    return (
        "Below is the image of one page of a document, as well as some raw textual content that was previously "
        "extracted for it. Just return the plain text representation of this document as if you were reading it naturally.\n"
        "Do not hallucinate.\n"
        "RAW_TEXT_START\n"
        f"{anchor_text}\n"
        "RAW_TEXT_END"
    )


def _ocr_page(pdf_path, page_num, image_base64):
    """Transcribe one rendered page with the module-level model and processor.

    Returns the decoded text, or a bracketed placeholder when the model
    produced no new tokens or decoding failed.
    """
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
    print(f"Anchor text for page {page_num}: {anchor_text}")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": _build_ocr_prompt(anchor_text)},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image = Image.open(BytesIO(base64.b64decode(image_base64)))

    inputs = processor(
        text=[text],
        images=[image],
        padding=True,
        return_tensors="pt",
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    output = model.generate(
        **inputs,
        temperature=0.8,
        max_new_tokens=512,
        num_return_sequences=1,
        do_sample=True,
    )

    # Drop the prompt tokens so that only the newly generated ones are decoded.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:].detach().cpu()
    if new_tokens.shape[1] == 0:
        return "[Model returned no new tokens]"

    try:
        decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    except Exception as decode_error:
        return f"[Decoding error on page {page_num}: {str(decode_error)}]"
    return decoded_list[0].strip() if decoded_list else "[No output generated]"


def process_pdf_to_epub(pdf_file, title="Untitled", author="Unknown"):
    """Convert an uploaded PDF into an EPUB with one chapter per page.

    Each page is rendered to a PNG and transcribed; the text becomes that
    page's chapter. A page that fails is kept as a chapter holding a
    bracketed error message, so one bad page does not abort the book. The
    rendering of page 1, when it succeeded, is used as the cover.

    Args:
        pdf_file: The upload, either a path string or an object whose
            ``name`` attribute is the path.
        title: EPUB title; a blank or missing value becomes "Untitled".
        author: EPUB author; a blank or missing value becomes "Unknown".

    Returns:
        Path of the written ``.epub`` file (unique per call).

    Raises:
        ValueError: If ``pdf_file`` is ``None``.
    """
    # Function-scope imports: the module's import block does not provide
    # these two stdlib modules.
    import html
    import tempfile

    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    # Works for both a plain path and a file-like upload object.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    title = (title or "").strip() or "Untitled"
    author = (author or "").strip() or "Unknown"

    # Only the page count is needed here; close the document right away.
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)

    # Create EPUB book
    book = epub.EpubBook()
    # NOTE(review): every generated book shares this identifier; consider a
    # per-book UUID if reading systems need to tell the books apart.
    book.set_identifier("id123456")
    book.set_title(title)
    # Chapters are tagged lang="en"; declare the same language on the book.
    book.set_language("en")
    book.add_author(author)

    chapters = []

    for page_num in range(1, num_pages + 1):
        print(f"Processing page {page_num}...")

        # Reset per page so the cover step below never reads an unset or
        # stale rendering when this page fails to render.
        image_base64 = None
        try:
            image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
            decoded = _ocr_page(pdf_path, page_num, image_base64)
        except Exception as processing_error:
            # Deliberate best effort: record the failure in the chapter
            # text and carry on with the remaining pages.
            decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"

        print(f"Decoded content for page {page_num}: {decoded}")

        # Create chapter. Escape first so '<' and '&' in the text cannot
        # break the XHTML, then keep line breaks as <br/> tags.
        chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
        safe_text = html.escape(decoded).replace("\n", "<br/>")
        chapter.content = f"<h1>Page {page_num}</h1><p>{safe_text}</p>"
        book.add_item(chapter)
        chapters.append(chapter)

        # Save cover image from page 1; the rendering is already PNG data,
        # so its decoded bytes are used directly.
        if page_num == 1 and image_base64 is not None:
            book.set_cover("cover.png", base64.b64decode(image_base64))

    # Assemble EPUB
    book.toc = tuple(chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ['nav'] + chapters

    # A unique file per call keeps simultaneous conversions from overwriting
    # each other's output. The handle is closed before the writer reopens
    # the path by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".epub") as tmp:
        output_path = tmp.name
    epub.write_epub(output_path, book)
    return output_path
124
 
125
# Gradio Interface: a PDF upload plus title and author boxes go in, and a
# downloadable EPUB comes out.
_INTERFACE_INPUTS = [
    gr.File(label="Upload PDF", file_types=[".pdf"]),
    gr.Textbox(label="EPUB Title"),
    gr.Textbox(label="Author(s)"),
]
_INTERFACE_OUTPUT = gr.File(label="Download EPUB")

iface = gr.Interface(
    fn=process_pdf_to_epub,
    inputs=_INTERFACE_INPUTS,
    outputs=_INTERFACE_OUTPUT,
    title="PDF to EPUB Converter (with olmOCR)",
    description="Uploads a PDF, extracts text from each page with vision + prompt, and builds an EPUB using the outputs. Sets the first page as cover.",
    allow_flagging="never",
)
138
 
139
  if __name__ == "__main__":