Spaces:
Running
Running
Cleans math rendering and chapter titles
Browse files
app.py
CHANGED
@@ -13,6 +13,7 @@ from olmocr.prompts.anchor import get_anchor_text
|
|
13 |
|
14 |
from ebooklib import epub
|
15 |
import json
|
|
|
16 |
|
17 |
# Load model and processor
|
18 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
@@ -89,6 +90,7 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
89 |
raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
|
90 |
try:
|
91 |
parsed = json.loads(raw_output)
|
|
|
92 |
decoded = parsed.get("natural_text", raw_output)
|
93 |
except json.JSONDecodeError:
|
94 |
decoded = raw_output
|
@@ -101,7 +103,22 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
101 |
decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
|
102 |
|
103 |
print(f"Decoded content for page {page_num}: {decoded}")
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
if page_num == 1:
|
107 |
cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
|
@@ -110,7 +127,26 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
110 |
book.set_cover("cover.png", cover_io.getvalue())
|
111 |
|
112 |
single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
book.add_item(single_chapter)
|
115 |
book.toc = (single_chapter,)
|
116 |
book.spine = ['nav', single_chapter]
|
|
|
13 |
|
14 |
from ebooklib import epub
|
15 |
import json
|
16 |
+
import html
|
17 |
|
18 |
# Load model and processor
|
19 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
|
90 |
raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
|
91 |
try:
|
92 |
parsed = json.loads(raw_output)
|
93 |
+
# Keep only the `natural_text` field of the parsed JSON; fall back to the raw output if the key is missing
|
94 |
decoded = parsed.get("natural_text", raw_output)
|
95 |
except json.JSONDecodeError:
|
96 |
decoded = raw_output
|
|
|
103 |
decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
|
104 |
|
105 |
print(f"Decoded content for page {page_num}: {decoded}")
|
106 |
+
|
107 |
+
# Escape HTML special characters (&, <, >, quotes) so the OCR text cannot inject markup; TeX math delimiters are unaffected
|
108 |
+
escaped_text = html.escape(decoded)
|
109 |
+
|
110 |
+
# Wrap TeX math delimiters (\[ \] and \( \)) in "math" div/span containers and convert newlines to <br>
|
111 |
+
escaped_text = (
|
112 |
+
escaped_text
|
113 |
+
.replace(r'\[', '<div class="math">\\[')
|
114 |
+
.replace(r'\]', '\\]</div>')
|
115 |
+
.replace(r'\(', '<span class="math">\\(')
|
116 |
+
.replace(r'\)', '\\)</span>')
|
117 |
+
.replace("\n", "<br>")
|
118 |
+
)
|
119 |
+
|
120 |
+
all_text += f"<div>{escaped_text}</div>"
|
121 |
+
|
122 |
|
123 |
if page_num == 1:
|
124 |
cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
|
|
|
127 |
book.set_cover("cover.png", cover_io.getvalue())
|
128 |
|
129 |
single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
|
130 |
+
mathjax_script = """
|
131 |
+
<script type="text/javascript" id="MathJax-script" async
|
132 |
+
src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
|
133 |
+
</script>
|
134 |
+
"""
|
135 |
+
|
136 |
+
single_chapter.content = f"""<!DOCTYPE html>
|
137 |
+
<html>
|
138 |
+
<head>
|
139 |
+
<meta charset="utf-8"/>
|
140 |
+
<title>{html.escape(title)}</title>
|
141 |
+
{mathjax_script}
|
142 |
+
</head>
|
143 |
+
<body>
|
144 |
+
<h1>{html.escape(title)}</h1>
|
145 |
+
{all_text}
|
146 |
+
</body>
|
147 |
+
</html>
|
148 |
+
"""
|
149 |
+
|
150 |
book.add_item(single_chapter)
|
151 |
book.toc = (single_chapter,)
|
152 |
book.spine = ['nav', single_chapter]
|