Spaces:
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,6 +12,7 @@ from olmocr.data.renderpdf import render_pdf_to_base64png
|
|
| 12 |
from olmocr.prompts.anchor import get_anchor_text
|
| 13 |
|
| 14 |
from ebooklib import epub
|
|
|
|
| 15 |
|
| 16 |
# Load model and processor
|
| 17 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
@@ -31,7 +32,7 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
| 31 |
book.set_title(title)
|
| 32 |
book.add_author(author)
|
| 33 |
|
| 34 |
-
|
| 35 |
|
| 36 |
for i in range(num_pages):
|
| 37 |
page_num = i + 1
|
|
@@ -73,7 +74,7 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
| 73 |
output = model.generate(
|
| 74 |
**inputs,
|
| 75 |
temperature=0.8,
|
| 76 |
-
max_new_tokens=
|
| 77 |
num_return_sequences=1,
|
| 78 |
do_sample=True,
|
| 79 |
)
|
|
@@ -85,7 +86,12 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
| 85 |
if new_tokens is not None and new_tokens.shape[1] > 0:
|
| 86 |
try:
|
| 87 |
decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
except Exception as decode_error:
|
| 90 |
decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
|
| 91 |
else:
|
|
@@ -95,11 +101,7 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
| 95 |
decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
|
| 96 |
|
| 97 |
print(f"Decoded content for page {page_num}: {decoded}")
|
| 98 |
-
|
| 99 |
-
chapter = epub.EpubHtml(title=f"Page {page_num}", file_name=f"page_{page_num}.xhtml", lang="en")
|
| 100 |
-
chapter.content = f"<h1>Page {page_num}</h1><p>{decoded}</p>"
|
| 101 |
-
book.add_item(chapter)
|
| 102 |
-
chapters.append(chapter)
|
| 103 |
|
| 104 |
if page_num == 1:
|
| 105 |
cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
|
|
@@ -107,12 +109,14 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
| 107 |
cover_image.save(cover_io, format='PNG')
|
| 108 |
book.set_cover("cover.png", cover_io.getvalue())
|
| 109 |
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
book.add_item(epub.EpubNcx())
|
| 112 |
book.add_item(epub.EpubNav())
|
| 113 |
-
book.spine = ['nav'] + chapters
|
| 114 |
|
| 115 |
-
# ✅ SAFELY write to a temp file in /tmp
|
| 116 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
|
| 117 |
epub.write_epub(tmp.name, book)
|
| 118 |
return tmp.name
|
|
|
|
| 12 |
from olmocr.prompts.anchor import get_anchor_text
|
| 13 |
|
| 14 |
from ebooklib import epub
|
| 15 |
+
import json
|
| 16 |
|
| 17 |
# Load model and processor
|
| 18 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
|
|
| 32 |
book.set_title(title)
|
| 33 |
book.add_author(author)
|
| 34 |
|
| 35 |
+
all_text = ""
|
| 36 |
|
| 37 |
for i in range(num_pages):
|
| 38 |
page_num = i + 1
|
|
|
|
| 74 |
output = model.generate(
|
| 75 |
**inputs,
|
| 76 |
temperature=0.8,
|
| 77 |
+
max_new_tokens=5096,
|
| 78 |
num_return_sequences=1,
|
| 79 |
do_sample=True,
|
| 80 |
)
|
|
|
|
| 86 |
if new_tokens is not None and new_tokens.shape[1] > 0:
|
| 87 |
try:
|
| 88 |
decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
|
| 89 |
+
raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
|
| 90 |
+
try:
|
| 91 |
+
parsed = json.loads(raw_output)
|
| 92 |
+
decoded = parsed.get("natural_text", raw_output)
|
| 93 |
+
except json.JSONDecodeError:
|
| 94 |
+
decoded = raw_output
|
| 95 |
except Exception as decode_error:
|
| 96 |
decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
|
| 97 |
else:
|
|
|
|
| 101 |
decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
|
| 102 |
|
| 103 |
print(f"Decoded content for page {page_num}: {decoded}")
|
| 104 |
+
all_text += f"<h2>Page {page_num}</h2>" + "".join(f"<p>{p.strip()}</p>" for p in decoded.split("\n\n") if p.strip())
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
if page_num == 1:
|
| 107 |
cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
|
|
|
|
| 109 |
cover_image.save(cover_io, format='PNG')
|
| 110 |
book.set_cover("cover.png", cover_io.getvalue())
|
| 111 |
|
| 112 |
+
single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
|
| 113 |
+
single_chapter.content = f"<h1>{title}</h1>{all_text}"
|
| 114 |
+
book.add_item(single_chapter)
|
| 115 |
+
book.toc = (single_chapter,)
|
| 116 |
+
book.spine = ['nav', single_chapter]
|
| 117 |
book.add_item(epub.EpubNcx())
|
| 118 |
book.add_item(epub.EpubNav())
|
|
|
|
| 119 |
|
|
|
|
| 120 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
|
| 121 |
epub.write_epub(tmp.name, book)
|
| 122 |
return tmp.name
|