Spaces:

leonarb
/

olmocr-demo

Running

leonarb committed on May 9

Commit

4366a57

verified ·

1 Parent(s): 758988d

Improve table/footnote/math rendering

Files changed (1) hide show

app.py CHANGED Viewed

@@ -11,6 +11,8 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
 from ebooklib import epub
 import json
 import html
@@ -105,20 +107,9 @@ def process_pdf_to_epub(pdf_file, title, author):
         print(f"Decoded content for page {page_num}: {decoded}")
         # Escape HTML and preserve spacing and math expressions (basic TeX formatting support)
-        escaped_text = html.escape(decoded)
-        # Restore math delimiters after escaping, and preserve line breaks
-        escaped_text = (
-            escaped_text
-            .replace(r'\[', '<div class="math">\\[')
-            .replace(r'\]', '\\]</div>')
-            .replace(r'\(', '<span class="math">\\(')
-            .replace(r'\)', '\\)</span>')
-            .replace("\n", "<br>")
-        )
-        all_text += f"<div>{escaped_text}</div>"
         if page_num == 1:
             cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))

 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
+from mathml_utils import convert_inline_and_block_latex_to_mathml
 from ebooklib import epub
 import json
 import html
         print(f"Decoded content for page {page_num}: {decoded}")
         # Escape HTML and preserve spacing and math expressions (basic TeX formatting support)
+        converted = convert_inline_and_block_latex_to_mathml(decoded)
+        converted = converted.replace("\n", "<br>")  # Optional: preserve line breaks
+        all_text += f"<div>{converted}</div>"
         if page_num == 1:
             cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))