leonarb committed on
Commit
4366a57
·
verified ·
1 Parent(s): 758988d

Improve table/footnote/math rendering

Browse files
Files changed (1) hide show
  1. app.py +5 -14
app.py CHANGED
@@ -11,6 +11,8 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
11
  from olmocr.data.renderpdf import render_pdf_to_base64png
12
  from olmocr.prompts.anchor import get_anchor_text
13
 
 
 
14
  from ebooklib import epub
15
  import json
16
  import html
@@ -105,20 +107,9 @@ def process_pdf_to_epub(pdf_file, title, author):
105
  print(f"Decoded content for page {page_num}: {decoded}")
106
 
107
  # Escape HTML and preserve spacing and math expressions (basic TeX formatting support)
108
- escaped_text = html.escape(decoded)
109
-
110
- # Restore math delimiters after escaping, and preserve line breaks
111
- escaped_text = (
112
- escaped_text
113
- .replace(r'\[', '<div class="math">\\[')
114
- .replace(r'\]', '\\]</div>')
115
- .replace(r'\(', '<span class="math">\\(')
116
- .replace(r'\)', '\\)</span>')
117
- .replace("\n", "<br>")
118
- )
119
-
120
- all_text += f"<div>{escaped_text}</div>"
121
-
122
 
123
  if page_num == 1:
124
  cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
 
11
  from olmocr.data.renderpdf import render_pdf_to_base64png
12
  from olmocr.prompts.anchor import get_anchor_text
13
 
14
+ from mathml_utils import convert_inline_and_block_latex_to_mathml
15
+
16
  from ebooklib import epub
17
  import json
18
  import html
 
107
  print(f"Decoded content for page {page_num}: {decoded}")
108
 
109
  # Escape HTML and preserve spacing and math expressions (basic TeX formatting support)
110
+ converted = convert_inline_and_block_latex_to_mathml(decoded)
111
+ converted = converted.replace("\n", "<br>") # Optional: preserve line breaks
112
+ all_text += f"<div>{converted}</div>"
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  if page_num == 1:
115
  cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))