leonarb committed on
Commit
c7e3ff4
·
verified ·
1 Parent(s): 3658a99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -11
app.py CHANGED
@@ -12,6 +12,7 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
12
  from olmocr.prompts.anchor import get_anchor_text
13
 
14
  from ebooklib import epub
 
15
 
16
  # Load model and processor
17
  model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -31,7 +32,7 @@ def process_pdf_to_epub(pdf_file, title, author):
31
  book.set_title(title)
32
  book.add_author(author)
33
 
34
- chapters = []
35
 
36
  for i in range(num_pages):
37
  page_num = i + 1
@@ -73,7 +74,7 @@ def process_pdf_to_epub(pdf_file, title, author):
73
  output = model.generate(
74
  **inputs,
75
  temperature=0.8,
76
- max_new_tokens=512,
77
  num_return_sequences=1,
78
  do_sample=True,
79
  )
@@ -85,7 +86,12 @@ def process_pdf_to_epub(pdf_file, title, author):
85
  if new_tokens is not None and new_tokens.shape[1] > 0:
86
  try:
87
  decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
88
- decoded = decoded_list[0].strip() if decoded_list else "[No output generated]"
 
 
 
 
 
89
  except Exception as decode_error:
90
  decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
91
  else:
@@ -95,11 +101,7 @@ def process_pdf_to_epub(pdf_file, title, author):
95
  decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
96
 
97
  print(f"Decoded content for page {page_num}: {decoded}")
98
-
99
- chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
100
- chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
101
- book.add_item(chapter)
102
- chapters.append(chapter)
103
 
104
  if page_num == 1:
105
  cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
@@ -107,12 +109,14 @@ def process_pdf_to_epub(pdf_file, title, author):
107
  cover_image.save(cover_io, format='PNG')
108
  book.set_cover("cover.png", cover_io.getvalue())
109
 
110
- book.toc = tuple(chapters)
 
 
 
 
111
  book.add_item(epub.EpubNcx())
112
  book.add_item(epub.EpubNav())
113
- book.spine = ['nav'] + chapters
114
 
115
- # ✅ SAFELY write to a temp file in /tmp
116
  with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
117
  epub.write_epub(tmp.name, book)
118
  return tmp.name
 
12
  from olmocr.prompts.anchor import get_anchor_text
13
 
14
  from ebooklib import epub
15
+ import json
16
 
17
  # Load model and processor
18
  model = Qwen2VLForConditionalGeneration.from_pretrained(
 
32
  book.set_title(title)
33
  book.add_author(author)
34
 
35
+ all_text = ""
36
 
37
  for i in range(num_pages):
38
  page_num = i + 1
 
74
  output = model.generate(
75
  **inputs,
76
  temperature=0.8,
77
+ max_new_tokens=5096,
78
  num_return_sequences=1,
79
  do_sample=True,
80
  )
 
86
  if new_tokens is not None and new_tokens.shape[1] > 0:
87
  try:
88
  decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
89
+ raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
90
+ try:
91
+ parsed = json.loads(raw_output)
92
+ decoded = parsed.get("natural_text", raw_output)
93
+ except json.JSONDecodeError:
94
+ decoded = raw_output
95
  except Exception as decode_error:
96
  decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
97
  else:
 
101
  decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
102
 
103
  print(f"Decoded content for page {page_num}: {decoded}")
104
+ all_text += f"<h2>Page {page_num}</h2>" + "".join(f"<p>{p.strip()}</p>" for p in decoded.split("\n\n") if p.strip())
 
 
 
 
105
 
106
  if page_num == 1:
107
  cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
 
109
  cover_image.save(cover_io, format='PNG')
110
  book.set_cover("cover.png", cover_io.getvalue())
111
 
112
+ single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
113
+ single_chapter.content = f"<h1>{title}</h1>{all_text}"
114
+ book.add_item(single_chapter)
115
+ book.toc = (single_chapter,)
116
+ book.spine = ['nav', single_chapter]
117
  book.add_item(epub.EpubNcx())
118
  book.add_item(epub.EpubNav())
 
119
 
 
120
  with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
121
  epub.write_epub(tmp.name, book)
122
  return tmp.name