Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,7 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
|
|
12 |
from olmocr.prompts.anchor import get_anchor_text
|
13 |
|
14 |
from ebooklib import epub
|
|
|
15 |
|
16 |
# Load model and processor
|
17 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
@@ -31,7 +32,7 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
31 |
book.set_title(title)
|
32 |
book.add_author(author)
|
33 |
|
34 |
-
|
35 |
|
36 |
for i in range(num_pages):
|
37 |
page_num = i + 1
|
@@ -73,7 +74,7 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
73 |
output = model.generate(
|
74 |
**inputs,
|
75 |
temperature=0.8,
|
76 |
-
max_new_tokens=
|
77 |
num_return_sequences=1,
|
78 |
do_sample=True,
|
79 |
)
|
@@ -85,7 +86,12 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
85 |
if new_tokens is not None and new_tokens.shape[1] > 0:
|
86 |
try:
|
87 |
decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
89 |
except Exception as decode_error:
|
90 |
decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
|
91 |
else:
|
@@ -95,11 +101,7 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
95 |
decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
|
96 |
|
97 |
print(f"Decoded content for page {page_num}: {decoded}")
|
98 |
-
|
99 |
-
chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
|
100 |
-
chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
|
101 |
-
book.add_item(chapter)
|
102 |
-
chapters.append(chapter)
|
103 |
|
104 |
if page_num == 1:
|
105 |
cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
|
@@ -107,12 +109,14 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
107 |
cover_image.save(cover_io, format='PNG')
|
108 |
book.set_cover("cover.png", cover_io.getvalue())
|
109 |
|
110 |
-
|
|
|
|
|
|
|
|
|
111 |
book.add_item(epub.EpubNcx())
|
112 |
book.add_item(epub.EpubNav())
|
113 |
-
book.spine = ['nav'] + chapters
|
114 |
|
115 |
-
# ✅ SAFELY write to a temp file in /tmp
|
116 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
|
117 |
epub.write_epub(tmp.name, book)
|
118 |
return tmp.name
|
|
|
12 |
from olmocr.prompts.anchor import get_anchor_text
|
13 |
|
14 |
from ebooklib import epub
|
15 |
+
import json
|
16 |
|
17 |
# Load model and processor
|
18 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
|
32 |
book.set_title(title)
|
33 |
book.add_author(author)
|
34 |
|
35 |
+
all_text = ""
|
36 |
|
37 |
for i in range(num_pages):
|
38 |
page_num = i + 1
|
|
|
74 |
output = model.generate(
|
75 |
**inputs,
|
76 |
temperature=0.8,
|
77 |
+
max_new_tokens=5096,
|
78 |
num_return_sequences=1,
|
79 |
do_sample=True,
|
80 |
)
|
|
|
86 |
if new_tokens is not None and new_tokens.shape[1] > 0:
|
87 |
try:
|
88 |
decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
|
89 |
+
raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
|
90 |
+
try:
|
91 |
+
parsed = json.loads(raw_output)
|
92 |
+
decoded = parsed.get("natural_text", raw_output)
|
93 |
+
except json.JSONDecodeError:
|
94 |
+
decoded = raw_output
|
95 |
except Exception as decode_error:
|
96 |
decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
|
97 |
else:
|
|
|
101 |
decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
|
102 |
|
103 |
print(f"Decoded content for page {page_num}: {decoded}")
|
104 |
+
all_text += f"<h2>Page {page_num}</h2>" + "".join(f"<p>{p.strip()}</p>" for p in decoded.split("\n\n") if p.strip())
|
|
|
|
|
|
|
|
|
105 |
|
106 |
if page_num == 1:
|
107 |
cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
|
|
|
109 |
cover_image.save(cover_io, format='PNG')
|
110 |
book.set_cover("cover.png", cover_io.getvalue())
|
111 |
|
112 |
+
single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
|
113 |
+
single_chapter.content = f"<h1>{title}</h1>{all_text}"
|
114 |
+
book.add_item(single_chapter)
|
115 |
+
book.toc = (single_chapter,)
|
116 |
+
book.spine = ['nav', single_chapter]
|
117 |
book.add_item(epub.EpubNcx())
|
118 |
book.add_item(epub.EpubNav())
|
|
|
119 |
|
|
|
120 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
|
121 |
epub.write_epub(tmp.name, book)
|
122 |
return tmp.name
|