leonarb committed · verified · Commit f01e8a4 · Parent: c986ff1

Update app.py

Files changed (1): app.py (+35, −18)
app.py CHANGED
@@ -1,19 +1,19 @@
 import os
-
-# Set a writable directory for Hugging Face's cache
-os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
-
 import torch
 import base64
 from io import BytesIO
 from PIL import Image
 import gradio as gr
+from ebooklib import epub

 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts import build_finetuning_prompt
 from olmocr.prompts.anchor import get_anchor_text

+# Set a writable directory for Hugging Face's cache
+os.environ['HF_HOME'] = '/tmp/.cache/huggingface'
+
 # Load processor and model
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -23,19 +23,15 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)

-def process_pdf(file, page=1):
-    # Save uploaded file to disk
+def process_pdf(file, page=1, title="Extracted Page", author="olmOCR", language="en"):
     file_path = file.name

-    # Render the selected PDF page to base64 PNG
     image_base64 = render_pdf_to_base64png(file_path, page, target_longest_image_dim=1024)
     main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

-    # Extract document metadata and build the prompt
     anchor_text = get_anchor_text(file_path, page, pdf_engine="pdfreport", target_length=4000)
     prompt = build_finetuning_prompt(anchor_text)

-    # Construct chat message
     messages = [
         {
             "role": "user",
@@ -46,12 +42,10 @@ def process_pdf(file, page=1):
         }
     ]

-    # Tokenize inputs
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[text], images=[main_image], return_tensors="pt", padding=True)
     inputs = {k: v.to(device) for k, v in inputs.items()}

-    # Run model
     with torch.no_grad():
         output = model.generate(
             **inputs,
@@ -61,22 +55,45 @@ def process_pdf(file, page=1):
             do_sample=True,
         )

-    # Decode
     prompt_len = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_len:]
-    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
-    return decoded[0]
+    decoded = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
+
+    # Create EPUB
+    book = epub.EpubBook()
+    book.set_identifier("id123456")
+    book.set_title(title)
+    book.set_language(language)
+    book.add_author(author)
+
+    chapter = epub.EpubHtml(title=title, file_name="chap1.xhtml", lang=language)
+    chapter.content = f"<h1>{title}</h1><p>{decoded}</p>"
+    book.add_item(chapter)
+
+    book.toc = (epub.Link('chap1.xhtml', title, 'chap1'),)
+    book.add_item(epub.EpubNav())
+    book.add_item(epub.EpubNcx())
+    book.spine = ['nav', chapter]
+
+    epub_path = f"/tmp/{title.replace(' ', '_')}_page_{page}.epub"
+    epub.write_epub(epub_path, book)
+
+    return epub_path

-# Gradio interface
+# Gradio Interface
 iface = gr.Interface(
     fn=process_pdf,
     inputs=[
         gr.File(label="Upload PDF"),
         gr.Number(value=1, label="Page Number"),
+        gr.Textbox(value="Extracted Page", label="EPUB Title"),
+        gr.Textbox(value="olmOCR", label="Author"),
+        gr.Textbox(value="en", label="Language"),
     ],
-    outputs="text",
-    title="olmOCR PDF Text Extractor",
-    description="Upload a PDF and select a page to extract text using the olmOCR model.",
+    outputs=gr.File(label="Download EPUB"),
+    title="olmOCR PDF to EPUB",
+    description="Extract text from a selected page of a PDF and download it as an EPUB file.",
+    allow_flagging="never"
 )

 if __name__ == "__main__":
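
As a sanity check on the new export path, the EPUB assembly inside process_pdf can be exercised on its own, without the model or Gradio. Below is a minimal sketch using only ebooklib; the identifier, title, body text, and output path are placeholder values, not taken from the commit. Note the navigation classes are spelled EpubNcx and EpubNav (the commit originally wrote EpubNCX and EpubNavi, which do not exist in ebooklib and are corrected in the diff above).

from ebooklib import epub

# Assemble a one-chapter EPUB mirroring the structure process_pdf builds.
book = epub.EpubBook()
book.set_identifier("sample-id")   # placeholder identifier
book.set_title("Extracted Page")
book.set_language("en")
book.add_author("olmOCR")

# One XHTML chapter; in the app, the decoded model output goes here.
chapter = epub.EpubHtml(title="Extracted Page", file_name="chap1.xhtml", lang="en")
chapter.content = "<h1>Extracted Page</h1><p>Sample extracted text.</p>"
book.add_item(chapter)

# Table of contents plus the NCX and nav documents EPUB readers expect.
book.toc = (epub.Link("chap1.xhtml", "Extracted Page", "chap1"),)
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())

# Reading order: the nav page first, then the chapter.
book.spine = ["nav", chapter]

epub.write_epub("/tmp/sample.epub", book)

Opening the result in any EPUB reader shows a single chapter; in the Space, decoded replaces the sample text and the returned path feeds the gr.File output for download.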
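
Once the Space is running, the updated endpoint can also be called programmatically. A sketch with a recent gradio_client, assuming the default /predict endpoint that gr.Interface exposes; the Space id and PDF path are hypothetical stand-ins.

from gradio_client import Client, handle_file

# Hypothetical Space id; substitute the actual one.
client = Client("leonarb/olmocr-pdf-to-epub")

# Arguments follow the Interface inputs: file, page, title, author, language.
epub_path = client.predict(
    handle_file("document.pdf"),
    2,
    "My Extracted Page",
    "olmOCR",
    "en",
    api_name="/predict",
)
print(epub_path)  # local path of the downloaded EPUB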