leonarb committed on
Commit
70fe98e
·
verified ·
1 Parent(s): c7e3ff4

Cleans math rendering and chapter titles

Browse files
Files changed (1) hide show
  1. app.py +38 -2
app.py CHANGED
@@ -13,6 +13,7 @@ from olmocr.prompts.anchor import get_anchor_text
13
 
14
  from ebooklib import epub
15
  import json
 
16
 
17
  # Load model and processor
18
  model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -89,6 +90,7 @@ def process_pdf_to_epub(pdf_file, title, author):
89
  raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
90
  try:
91
  parsed = json.loads(raw_output)
 
92
  decoded = parsed.get("natural_text", raw_output)
93
  except json.JSONDecodeError:
94
  decoded = raw_output
@@ -101,7 +103,22 @@ def process_pdf_to_epub(pdf_file, title, author):
101
  decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
102
 
103
  print(f"Decoded content for page {page_num}: {decoded}")
104
- all_text += f"<h2>Page {page_num}</h2>" + "".join(f"<p>{p.strip()}</p>" for p in decoded.split("\n\n") if p.strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  if page_num == 1:
107
  cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
@@ -110,7 +127,26 @@ def process_pdf_to_epub(pdf_file, title, author):
110
  book.set_cover("cover.png", cover_io.getvalue())
111
 
112
  single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
113
- single_chapter.content = f"<h1>{title}</h1>{all_text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  book.add_item(single_chapter)
115
  book.toc = (single_chapter,)
116
  book.spine = ['nav', single_chapter]
 
13
 
14
  from ebooklib import epub
15
  import json
16
+ import html
17
 
18
  # Load model and processor
19
  model = Qwen2VLForConditionalGeneration.from_pretrained(
 
90
  raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
91
  try:
92
  parsed = json.loads(raw_output)
93
+ # Only include `natural_text`, drop undesired metadata
94
  decoded = parsed.get("natural_text", raw_output)
95
  except json.JSONDecodeError:
96
  decoded = raw_output
 
103
  decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
104
 
105
  print(f"Decoded content for page {page_num}: {decoded}")
106
+
107
+ # Escape HTML and preserve spacing and math expressions (basic TeX formatting support)
108
+ escaped_text = html.escape(decoded)
109
+
110
+ # Restore math delimiters after escaping, and preserve line breaks
111
+ escaped_text = (
112
+ escaped_text
113
+ .replace(r'\[', '<div class="math">\\[')
114
+ .replace(r'\]', '\\]</div>')
115
+ .replace(r'\(', '<span class="math">\\(')
116
+ .replace(r'\)', '\\)</span>')
117
+ .replace("\n", "<br>")
118
+ )
119
+
120
+ all_text += f"<div>{escaped_text}</div>"
121
+
122
 
123
  if page_num == 1:
124
  cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
 
127
  book.set_cover("cover.png", cover_io.getvalue())
128
 
129
  single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
130
+ mathjax_script = """
131
+ <script type="text/javascript" id="MathJax-script" async
132
+ src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
133
+ </script>
134
+ """
135
+
136
+ single_chapter.content = f"""<!DOCTYPE html>
137
+ <html>
138
+ <head>
139
+ <meta charset="utf-8"/>
140
+ <title>{html.escape(title)}</title>
141
+ {mathjax_script}
142
+ </head>
143
+ <body>
144
+ <h1>{html.escape(title)}</h1>
145
+ {all_text}
146
+ </body>
147
+ </html>
148
+ """
149
+
150
  book.add_item(single_chapter)
151
  book.toc = (single_chapter,)
152
  book.spine = ['nav', single_chapter]