Spaces:
Running
Running
Cleans math rendering and chapter titles
Browse files
app.py
CHANGED
|
@@ -13,6 +13,7 @@ from olmocr.prompts.anchor import get_anchor_text
|
|
| 13 |
|
| 14 |
from ebooklib import epub
|
| 15 |
import json
|
|
|
|
| 16 |
|
| 17 |
# Load model and processor
|
| 18 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
@@ -89,6 +90,7 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
| 89 |
raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
|
| 90 |
try:
|
| 91 |
parsed = json.loads(raw_output)
|
|
|
|
| 92 |
decoded = parsed.get("natural_text", raw_output)
|
| 93 |
except json.JSONDecodeError:
|
| 94 |
decoded = raw_output
|
|
@@ -101,7 +103,22 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
| 101 |
decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
|
| 102 |
|
| 103 |
print(f"Decoded content for page {page_num}: {decoded}")
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
if page_num == 1:
|
| 107 |
cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
|
|
@@ -110,7 +127,26 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
| 110 |
book.set_cover("cover.png", cover_io.getvalue())
|
| 111 |
|
| 112 |
single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
book.add_item(single_chapter)
|
| 115 |
book.toc = (single_chapter,)
|
| 116 |
book.spine = ['nav', single_chapter]
|
|
|
|
| 13 |
|
| 14 |
from ebooklib import epub
|
| 15 |
import json
|
| 16 |
+
import html
|
| 17 |
|
| 18 |
# Load model and processor
|
| 19 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
|
|
| 90 |
raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
|
| 91 |
try:
|
| 92 |
parsed = json.loads(raw_output)
|
| 93 |
+
# Keep only the `natural_text` field and drop other metadata; fall back to the raw output if the key is missing
|
| 94 |
decoded = parsed.get("natural_text", raw_output)
|
| 95 |
except json.JSONDecodeError:
|
| 96 |
decoded = raw_output
|
|
|
|
| 103 |
decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
|
| 104 |
|
| 105 |
print(f"Decoded content for page {page_num}: {decoded}")
|
| 106 |
+
|
| 107 |
+
# Escape HTML special characters (&, <, >, quotes) in the page text; TeX delimiters are not affected by html.escape
|
| 108 |
+
escaped_text = html.escape(decoded)
|
| 109 |
+
|
| 110 |
+
# Wrap TeX math delimiters (\[ \] and \( \)) in "math"-classed <div>/<span> elements, and convert newlines to <br>
|
| 111 |
+
escaped_text = (
|
| 112 |
+
escaped_text
|
| 113 |
+
.replace(r'\[', '<div class="math">\\[')
|
| 114 |
+
.replace(r'\]', '\\]</div>')
|
| 115 |
+
.replace(r'\(', '<span class="math">\\(')
|
| 116 |
+
.replace(r'\)', '\\)</span>')
|
| 117 |
+
.replace("\n", "<br>")
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
all_text += f"<div>{escaped_text}</div>"
|
| 121 |
+
|
| 122 |
|
| 123 |
if page_num == 1:
|
| 124 |
cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
|
|
|
|
| 127 |
book.set_cover("cover.png", cover_io.getvalue())
|
| 128 |
|
| 129 |
single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
|
| 130 |
+
mathjax_script = """
|
| 131 |
+
<script type="text/javascript" id="MathJax-script" async
|
| 132 |
+
src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
|
| 133 |
+
</script>
|
| 134 |
+
"""
|
| 135 |
+
|
| 136 |
+
single_chapter.content = f"""<!DOCTYPE html>
|
| 137 |
+
<html>
|
| 138 |
+
<head>
|
| 139 |
+
<meta charset="utf-8"/>
|
| 140 |
+
<title>{html.escape(title)}</title>
|
| 141 |
+
{mathjax_script}
|
| 142 |
+
</head>
|
| 143 |
+
<body>
|
| 144 |
+
<h1>{html.escape(title)}</h1>
|
| 145 |
+
{all_text}
|
| 146 |
+
</body>
|
| 147 |
+
</html>
|
| 148 |
+
"""
|
| 149 |
+
|
| 150 |
book.add_item(single_chapter)
|
| 151 |
book.toc = (single_chapter,)
|
| 152 |
book.spine = ['nav', single_chapter]
|