leonarb committed on
Commit
0225b8c
·
verified ·
1 Parent(s): f99a1ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -15
app.py CHANGED
@@ -11,7 +11,9 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
11
  from olmocr.prompts import build_finetuning_prompt
12
  from olmocr.prompts.anchor import get_anchor_text
13
 
14
- # Set a writable directory for Hugging Face's cache
 
 
15
  os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
16
 
17
  # Load processor and model
@@ -23,13 +25,13 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
23
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
  model.to(device)
25
 
26
- def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language="en"):
27
- file_path = file.name
28
-
29
- image_base64 = render_pdf_to_base64png(file_path, page, target_longest_image_dim=1024)
30
- main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
31
 
32
- anchor_text = get_anchor_text(file_path, page, pdf_engine="pdfreport", target_length=4000)
 
33
  prompt = build_finetuning_prompt(anchor_text)
34
 
35
  messages = [
@@ -43,7 +45,7 @@ def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language=
43
  ]
44
 
45
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
46
- inputs = processor(text=[text], images=[main_image], return_tensors="pt", padding=True)
47
  inputs = {k: v.to(device) for k, v in inputs.items()}
48
 
49
  with torch.no_grad():
@@ -58,16 +60,37 @@ def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language=
58
  prompt_len = inputs["input_ids"].shape[1]
59
  new_tokens = output[:, prompt_len:]
60
  decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
 
 
 
 
 
 
 
 
 
61
 
62
- # Create EPUB
 
 
 
 
 
 
63
  book = epub.EpubBook()
64
  book.set_identifier("id123456")
65
  book.set_title(title)
66
  book.set_language(language)
67
  book.add_author(author)
68
 
 
 
 
 
 
 
69
  chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
70
- chapter.content = f"<h1>{title}</h1><p>{decoded}</p>"
71
  book.add_item(chapter)
72
 
73
  book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
@@ -75,7 +98,7 @@ def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language=
75
  book.add_item(epub.EpubNCX())
76
  book.spine = ['nav', chapter]
77
 
78
- epub_path = f"/tmp/{title.replace(' ', '_')}_page_{page}.epub"
79
  epub.write_epub(epub_path, book)
80
 
81
  return epub_path
@@ -85,14 +108,13 @@ iface = gr.Interface(
85
  fn=process_pdf,
86
  inputs=[
87
  gr.File(label="Upload PDF"),
88
- gr.Number(value=1, label="Page Number"),
89
- gr.Textbox(value="Extracted Page", label="EPUB Title"),
90
  gr.Textbox(value="olmOCR", label="Author"),
91
  gr.Textbox(value="en", label="Language"),
92
  ],
93
  outputs=gr.File(label="Download EPUB"),
94
- title="olmOCR PDF to EPUB",
95
- description="Extract text from a selected page of a PDF and download it as an EPUB file.",
96
  allow_flagging="never"
97
  )
98
 
 
11
  from olmocr.prompts import build_finetuning_prompt
12
  from olmocr.prompts.anchor import get_anchor_text
13
 
14
+ from PyPDF2 import PdfReader
15
+
16
+ # Set a writable cache directory for HF
17
  os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
18
 
19
  # Load processor and model
 
25
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
  model.to(device)
27
 
28
+ def extract_text_from_page(pdf_path, page_num):
29
+ # Render image
30
+ image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
31
+ image = Image.open(BytesIO(base64.b64decode(image_base64)))
 
32
 
33
+ # Prompt and input
34
+ anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
35
  prompt = build_finetuning_prompt(anchor_text)
36
 
37
  messages = [
 
45
  ]
46
 
47
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
48
+ inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
49
  inputs = {k: v.to(device) for k, v in inputs.items()}
50
 
51
  with torch.no_grad():
 
60
  prompt_len = inputs["input_ids"].shape[1]
61
  new_tokens = output[:, prompt_len:]
62
  decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
63
+ return decoded, image_base64 if page_num == 1 else None
64
+
65
+ def process_pdf(file, title="Extracted PDF", author="olmOCR", language="en"):
66
+ file_path = file.name
67
+ reader = PdfReader(file_path)
68
+ num_pages = len(reader.pages)
69
+
70
+ all_text = []
71
+ cover_image_data = None
72
 
73
+ for page in range(1, num_pages + 1):
74
+ text, cover_image = extract_text_from_page(file_path, page)
75
+ all_text.append(f"<h2>Page {page}</h2><p>{text}</p>")
76
+ if cover_image and not cover_image_data:
77
+ cover_image_data = cover_image # base64
78
+
79
+ # Build EPUB
80
  book = epub.EpubBook()
81
  book.set_identifier("id123456")
82
  book.set_title(title)
83
  book.set_language(language)
84
  book.add_author(author)
85
 
86
+ # Add cover image
87
+ if cover_image_data:
88
+ cover_bytes = base64.b64decode(cover_image_data)
89
+ book.set_cover("cover.jpg", cover_bytes)
90
+
91
+ # Create chapter with all text
92
  chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
93
+ chapter.content = f"<h1>{title}</h1>{''.join(all_text)}"
94
  book.add_item(chapter)
95
 
96
  book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
 
98
  book.add_item(epub.EpubNCX())
99
  book.spine = ['nav', chapter]
100
 
101
+ epub_path = f"/tmp/{title.replace(' ', '_')}.epub"
102
  epub.write_epub(epub_path, book)
103
 
104
  return epub_path
 
108
  fn=process_pdf,
109
  inputs=[
110
  gr.File(label="Upload PDF"),
111
+ gr.Textbox(value="Extracted PDF", label="EPUB Title"),
 
112
  gr.Textbox(value="olmOCR", label="Author"),
113
  gr.Textbox(value="en", label="Language"),
114
  ],
115
  outputs=gr.File(label="Download EPUB"),
116
+ title="olmOCR PDF to EPUB (Full PDF + Cover Image)",
117
+ description="Extract text from ALL pages of a PDF and generate an EPUB with the first page as cover.",
118
  allow_flagging="never"
119
  )
120