leonarb committed on
Commit
afbaa03
·
verified ·
1 Parent(s): 606638a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -71
app.py CHANGED
@@ -1,38 +1,30 @@
1
- import os
2
- os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers"
3
  import torch
4
- import base64
5
  from io import BytesIO
6
  from PIL import Image
7
- import gradio as gr
8
- from ebooklib import epub
9
-
10
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
11
  from olmocr.data.renderpdf import render_pdf_to_base64png
12
  from olmocr.prompts import build_finetuning_prompt
13
  from olmocr.prompts.anchor import get_anchor_text
 
 
 
 
14
 
15
- from PyPDF2 import PdfReader
16
-
17
- # Set a writable cache directory for HF
18
- os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
19
-
20
- # Load processor and model
21
- processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
22
- model = Qwen2VLForConditionalGeneration.from_pretrained(
23
- "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
24
- ).eval()
25
 
 
26
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27
- model.to(device)
 
 
 
28
 
29
- def extract_text_from_page(pdf_path, page_num):
30
- # Render image
31
- image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
32
- image = Image.open(BytesIO(base64.b64decode(image_base64)))
33
 
34
- # Prompt and input
35
- anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
 
 
36
  prompt = build_finetuning_prompt(anchor_text)
37
 
38
  messages = [
@@ -40,84 +32,85 @@ def extract_text_from_page(pdf_path, page_num):
40
  "role": "user",
41
  "content": [
42
  {"type": "text", "text": prompt},
43
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
44
  ],
45
  }
46
  ]
47
 
48
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
49
- inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
 
50
  inputs = {k: v.to(device) for k, v in inputs.items()}
51
 
52
  with torch.no_grad():
53
- output = model.generate(
54
  **inputs,
55
  temperature=0.8,
56
- max_new_tokens=256,
57
- num_return_sequences=1,
58
  do_sample=True,
59
  )
60
 
61
  prompt_len = inputs["input_ids"].shape[1]
62
- new_tokens = output[:, prompt_len:]
63
- decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
64
- return decoded, image_base64 if page_num == 1 else None
65
 
66
- def process_pdf(file, title="Extracted PDF", author="olmOCR", language="en"):
67
- file_path = file.name
68
- reader = PdfReader(file_path)
69
- num_pages = len(reader.pages)
70
 
71
- all_text = []
72
- cover_image_data = None
 
 
73
 
74
- for page in range(1, num_pages + 1):
75
- text, cover_image = extract_text_from_page(file_path, page)
76
- all_text.append(f"<h2>Page {page}</h2><p>{text}</p>")
77
- if cover_image and not cover_image_data:
78
- cover_image_data = cover_image # base64
79
 
80
- # Build EPUB
81
  book = epub.EpubBook()
82
- book.set_identifier("id123456")
83
  book.set_title(title)
84
- book.set_language(language)
85
  book.add_author(author)
 
86
 
87
- # Add cover image
88
- if cover_image_data:
89
- cover_bytes = base64.b64decode(cover_image_data)
90
- book.set_cover("cover.jpg", cover_bytes)
91
 
92
- # Create chapter with all text
93
- chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
94
- chapter.content = f"<h1>{title}</h1>{''.join(all_text)}"
95
- book.add_item(chapter)
 
 
 
96
 
97
- book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
98
- book.add_item(epub.EpubNavi())
99
- book.add_item(epub.EpubNCX())
100
- book.spine = ['nav', chapter]
 
101
 
102
- epub_path = f"/tmp/{title.replace(' ', '_')}.epub"
103
- epub.write_epub(epub_path, book)
104
 
 
 
 
105
  return epub_path
106
 
107
- # Gradio Interface
108
- iface = gr.Interface(
109
- fn=process_pdf,
110
  inputs=[
111
- gr.File(label="Upload PDF"),
112
- gr.Textbox(value="Extracted PDF", label="EPUB Title"),
113
- gr.Textbox(value="olmOCR", label="Author"),
114
- gr.Textbox(value="en", label="Language"),
115
  ],
116
  outputs=gr.File(label="Download EPUB"),
117
- title="olmOCR PDF to EPUB (Full PDF + Cover Image)",
118
- description="Extract text from ALL pages of a PDF and generate an EPUB with the first page as cover.",
119
- allow_flagging="never"
120
  )
121
 
122
  if __name__ == "__main__":
123
- iface.launch()
 
1
+ import gradio as gr
 
2
  import torch
3
+ from PyPDF2 import PdfReader
4
  from io import BytesIO
5
  from PIL import Image
 
 
 
6
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
7
  from olmocr.data.renderpdf import render_pdf_to_base64png
8
  from olmocr.prompts import build_finetuning_prompt
9
  from olmocr.prompts.anchor import get_anchor_text
10
+ from ebooklib import epub
11
+ import base64
12
+ import tempfile
13
+ import os
14
 
 
 
 
 
 
 
 
 
 
 
15
 
16
# Load the OCR model and its processor once, when the module is imported.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    # bfloat16 when CUDA is available, float32 otherwise.
    torch_dtype=torch.float32 if device.type == "cpu" else torch.bfloat16,
)
model = model.eval().to(device)
22
 
 
 
 
 
23
 
24
def ocr_page(pdf_path, page_num):
    """Run the OCR model on one page of a PDF and return the generated text.

    Args:
        pdf_path: Path to the PDF file on disk.
        page_num: Zero-based page index. It is shifted by one before being
            passed to the olmocr rendering and anchor-text helpers.

    Returns:
        The decoded text of the first generated sequence, or an empty string
        when decoding produces no sequences.
    """
    one_based_page = page_num + 1
    page_png_b64 = render_pdf_to_base64png(
        pdf_path, one_based_page, target_longest_image_dim=1024
    )
    anchor = get_anchor_text(
        pdf_path, one_based_page, pdf_engine="pdfreport", target_length=4000
    )

    # A single user turn carrying the text prompt followed by the page image.
    text_part = {"type": "text", "text": build_finetuning_prompt(anchor)}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{page_png_b64}"},
    }
    conversation = [{"role": "user", "content": [text_part, image_part]}]

    chat_prompt = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    page_image = Image.open(BytesIO(base64.b64decode(page_png_b64)))
    batch = processor(
        text=[chat_prompt], images=[page_image], return_tensors="pt", padding=True
    )
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        generated = model.generate(
            **batch,
            temperature=0.8,
            max_new_tokens=1024,
            do_sample=True,
        )

    # Drop the leading prompt tokens so only the newly generated ones remain.
    continuation = generated[:, batch["input_ids"].shape[1]:]
    texts = processor.tokenizer.batch_decode(continuation, skip_special_tokens=True)
    if not texts:
        return ""
    return texts[0]
57
 
 
 
 
 
58
 
59
def _read_pdf_bytes(pdf_file):
    """Return the raw bytes of an uploaded PDF, whatever form it arrives in.

    Accepts raw bytes, a filesystem path, a readable binary file object, or an
    object that only exposes the file's path as ``.name``.
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as fh:
            return fh.read()
    if hasattr(pdf_file, "read"):
        return pdf_file.read()
    if hasattr(pdf_file, "name"):
        with open(pdf_file.name, "rb") as fh:
            return fh.read()
    raise TypeError(f"Unsupported PDF input type: {type(pdf_file).__name__}")


def convert_pdf_to_epub(pdf_file, title, author, language):
    """Convert a PDF into an EPUB with one OCR'd chapter per page.

    The first page is rendered and used as the cover image. Every page is
    passed through ``ocr_page`` and stored as its own XHTML chapter.

    Args:
        pdf_file: The uploaded PDF: a path, a readable binary file object, an
            object exposing the path as ``.name``, or raw bytes.
        title: Title stored in the EPUB metadata.
        author: Author stored in the EPUB metadata.
        language: Language code stored in the EPUB metadata and on each chapter.

    Returns:
        A ``(epub_path, epub_bytes)`` tuple: where the EPUB was written and
        the full contents of that file.

    Raises:
        ValueError: If no file was provided or the PDF contains no pages.
        TypeError: If ``pdf_file`` is of an unsupported type.
    """
    import html  # local import: only needed for escaping the OCR text below

    pdf_bytes = _read_pdf_bytes(pdf_file)

    # The olmocr helpers take a path, so the upload is copied to a temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
        tmp_pdf.write(pdf_bytes)
        tmp_pdf_path = tmp_pdf.name

    try:
        num_pages = len(PdfReader(tmp_pdf_path).pages)
        if num_pages == 0:
            raise ValueError("The uploaded PDF has no pages.")

        # Create EPUB book
        book = epub.EpubBook()
        book.set_title(title)
        book.add_author(author)
        book.set_language(language)

        # Use first page as cover. The renderer returns PNG data, so the cover
        # file name carries a .png extension to match its content.
        cover_image_b64 = render_pdf_to_base64png(
            tmp_pdf_path, 1, target_longest_image_dim=1024
        )
        book.set_cover("cover.png", base64.b64decode(cover_image_b64))

        # OCR and add pages
        chapters = []
        for i in range(num_pages):
            page_label = f"Page {i + 1}"
            text = ocr_page(tmp_pdf_path, i)
            chapter = epub.EpubHtml(
                title=page_label, file_name=f"page_{i + 1}.xhtml", lang=language
            )
            # Escape the OCR output so characters such as "<" and "&" cannot
            # break or inject markup into the chapter.
            chapter.content = f"<h1>{page_label}</h1><p>{html.escape(text)}</p>"
            book.add_item(chapter)
            book.spine.append(chapter)
            chapters.append(chapter)

        # Finalize EPUB. The table of contents lists every page chapter so the
        # NCX and nav documents are not empty.
        book.toc = tuple(chapters)
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        # A fresh directory per call keeps concurrent conversions from
        # overwriting each other while preserving the "output.epub" file name.
        out_dir = tempfile.mkdtemp(prefix="epub_")
        epub_path = os.path.join(out_dir, "output.epub")
        epub.write_epub(epub_path, book, {})
    finally:
        # Best-effort cleanup of the temporary PDF copy.
        try:
            os.remove(tmp_pdf_path)
        except OSError:
            pass

    with open(epub_path, "rb") as f:
        return epub_path, f.read()
94
 
95
+
96
def interface_fn(pdf, title, author, language):
    """Gradio callback: convert the uploaded PDF and return the EPUB's path."""
    # The converter also returns the EPUB's bytes; only the path is passed on.
    output_path, _epub_bytes = convert_pdf_to_epub(pdf, title, author, language)
    return output_path
99
 
100
+
101
# Gradio UI: a PDF upload and three metadata fields in, one EPUB file out.
demo = gr.Interface(
    title="PDF to EPUB Converter (olmOCR)",
    description="Upload a PDF to convert it into a structured EPUB. The first page is used as the cover. OCR is performed with the olmOCR model.",
    fn=interface_fn,
    inputs=[
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="EPUB Title", placeholder="e.g. Understanding AI"),
        gr.Textbox(label="Author", placeholder="e.g. Allen AI"),
        gr.Textbox(label="Language", placeholder="e.g. en", value="en"),
    ],
    outputs=gr.File(label="Download EPUB"),
    allow_flagging="never",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()