Spaces:
Running
Running
Improve table/footnote/math rendering
Browse files
app.py
CHANGED
@@ -11,6 +11,8 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
|
|
11 |
from olmocr.data.renderpdf import render_pdf_to_base64png
|
12 |
from olmocr.prompts.anchor import get_anchor_text
|
13 |
|
|
|
|
|
14 |
from ebooklib import epub
|
15 |
import json
|
16 |
import html
|
@@ -105,20 +107,9 @@ def process_pdf_to_epub(pdf_file, title, author):
|
|
105 |
print(f"Decoded content for page {page_num}: {decoded}")
|
106 |
|
107 |
# Escape HTML and preserve spacing and math expressions (basic TeX formatting support)
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
escaped_text = (
|
112 |
-
escaped_text
|
113 |
-
.replace(r'\[', '<div class="math">\\[')
|
114 |
-
.replace(r'\]', '\\]</div>')
|
115 |
-
.replace(r'\(', '<span class="math">\\(')
|
116 |
-
.replace(r'\)', '\\)</span>')
|
117 |
-
.replace("\n", "<br>")
|
118 |
-
)
|
119 |
-
|
120 |
-
all_text += f"<div>{escaped_text}</div>"
|
121 |
-
|
122 |
|
123 |
if page_num == 1:
|
124 |
cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
|
|
|
11 |
from olmocr.data.renderpdf import render_pdf_to_base64png
|
12 |
from olmocr.prompts.anchor import get_anchor_text
|
13 |
|
14 |
+
from mathml_utils import convert_inline_and_block_latex_to_mathml
|
15 |
+
|
16 |
from ebooklib import epub
|
17 |
import json
|
18 |
import html
|
|
|
107 |
print(f"Decoded content for page {page_num}: {decoded}")
|
108 |
|
109 |
# Escape HTML and preserve spacing and math expressions (basic TeX formatting support)
|
110 |
+
converted = convert_inline_and_block_latex_to_mathml(decoded)
|
111 |
+
converted = converted.replace("\n", "<br>") # Optional: preserve line breaks
|
112 |
+
all_text += f"<div>{converted}</div>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
if page_num == 1:
|
115 |
cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
|