Spaces:

leonarb
/

olmocr-demo

Running

App Files Community

leonarb committed on May 9

Commit

70fe98e

verified ·

1 Parent(s): c7e3ff4

Cleans math rendering and chapter titles

Browse files

Files changed (1) hide show

app.py +38 -2

app.py CHANGED Viewed

@@ -13,6 +13,7 @@ from olmocr.prompts.anchor import get_anchor_text
 from ebooklib import epub
 import json
 # Load model and processor
 model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -89,6 +90,7 @@ def process_pdf_to_epub(pdf_file, title, author):
                     raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
                     try:
                         parsed = json.loads(raw_output)
                         decoded = parsed.get("natural_text", raw_output)
                     except json.JSONDecodeError:
                         decoded = raw_output
@@ -101,7 +103,22 @@ def process_pdf_to_epub(pdf_file, title, author):
             decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
         print(f"Decoded content for page {page_num}: {decoded}")
-        all_text += f"<h2>Page {page_num}</h2>" + "".join(f"<p>{p.strip()}</p>" for p in decoded.split("\n\n") if p.strip())
         if page_num == 1:
             cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
@@ -110,7 +127,26 @@ def process_pdf_to_epub(pdf_file, title, author):
             book.set_cover("cover.png", cover_io.getvalue())
     single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
-    single_chapter.content = f"<h1>{title}</h1>{all_text}"
     book.add_item(single_chapter)
     book.toc = (single_chapter,)
     book.spine = ['nav', single_chapter]

 from ebooklib import epub
 import json
+import html
 # Load model and processor
 model = Qwen2VLForConditionalGeneration.from_pretrained(
                     raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
                     try:
                         parsed = json.loads(raw_output)
+                        # Only include `natural_text`, drop undesired metadata
                         decoded = parsed.get("natural_text", raw_output)
                     except json.JSONDecodeError:
                         decoded = raw_output
             decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
         print(f"Decoded content for page {page_num}: {decoded}")
+        # Escape HTML and preserve spacing and math expressions (basic TeX formatting support)
+        escaped_text = html.escape(decoded)
+        # Restore math delimiters after escaping, and preserve line breaks
+        escaped_text = (
+            escaped_text
+            .replace(r'\[', '<div class="math">\\[')
+            .replace(r'\]', '\\]</div>')
+            .replace(r'\(', '<span class="math">\\(')
+            .replace(r'\)', '\\)</span>')
+            .replace("\n", "<br>")
+        )
+        all_text += f"<div>{escaped_text}</div>"
         if page_num == 1:
             cover_image = Image.open(BytesIO(base64.b64decode(image_base64)))
             book.set_cover("cover.png", cover_io.getvalue())
     single_chapter = epub.EpubHtml(title="Full Document", file_name="full_document.xhtml", lang="en")
+    mathjax_script = """
+    <script type="text/javascript" id="MathJax-script" async
+      src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
+    </script>
+    """
+    single_chapter.content = f"""<!DOCTYPE html>
+    <html>
+      <head>
+        <meta charset="utf-8"/>
+        <title>{html.escape(title)}</title>
+        {mathjax_script}
+      </head>
+      <body>
+        <h1>{html.escape(title)}</h1>
+        {all_text}
+      </body>
+    </html>
+    """
     book.add_item(single_chapter)
     book.toc = (single_chapter,)
     book.spine = ['nav', single_chapter]