Spaces:

leonarb
/

olmocr-demo

Running

App Files Files Community

leonarb committed on May 9

Commit

bd2cd53

verified ·

1 Parent(s): e9af7f8

Cleans math rendering and TOC header stuff

Browse files

Files changed (1) hide show

app.py +23 -8

app.py CHANGED Viewed

@@ -26,10 +26,20 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 def process_pdf_to_html(pdf_file, title, author):
     pdf_path = pdf_file.name
     doc = fitz.open(pdf_path)
     num_pages = len(doc)
     all_text = ""
     cover_img_html = ""
@@ -101,28 +111,33 @@ def process_pdf_to_html(pdf_file, title, author):
         print(f"Decoded content for page {page_num}: {decoded}")
-        from latex2mathml.converter import convert as latex_to_mathml
         def convert_latex(text):
-            import re
             def replacer(match):
                 try:
                     return f"<math>{latex_to_mathml(match.group(1))}</math>"
                 except:
                     return html.escape(match.group(0))
-            # Convert \( ... \)
             text = re.sub(r'\\\((.*?)\\\)', replacer, text)
-            # Convert \[ ... \]
             text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
             return text
-        safe_html = html.escape(decoded).replace("\n", "<br>")
-        mathml_html = convert_latex(safe_html)
-        all_text += f"<div>{mathml_html}</div>\n"
         if page_num == 1:
             cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
     mathjax_script = """
     <script type="text/javascript" id="MathJax-script" async
       src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">

 model.to(device)
 def process_pdf_to_html(pdf_file, title, author):
+    import re
+    import markdown2
+    from latex2mathml.converter import convert as latex_to_mathml
     pdf_path = pdf_file.name
     doc = fitz.open(pdf_path)
     num_pages = len(doc)
+    # Extract TOC as a dict: {page_number: [(level, title), ...]}
+    toc_entries = doc.get_toc()
+    toc_by_page = {}
+    for level, title, page in toc_entries:
+        toc_by_page.setdefault(page, []).append((level, title))
     all_text = ""
     cover_img_html = ""
         print(f"Decoded content for page {page_num}: {decoded}")
+        # Convert inline and block LaTeX math to MathML
         def convert_latex(text):
             """Replace LaTeX math delimited by \\( ... \\) or \\[ ... \\] with MathML.

             Each delimited span is passed to ``latex_to_mathml`` and wrapped in a
             ``<math>`` element. If the conversion raises, the original span
             (delimiters included) is kept as HTML-escaped text instead.

             Args:
                 text: Page text that may contain LaTeX math spans.

             Returns:
                 The text with every matched math span substituted.
             """
             def replacer(match):
                 # NOTE(review): latex2mathml's convert() appears to return a
                 # complete <math> element already, so this wrapper may produce
                 # nested <math> tags -- confirm against the installed version.
                 try:
                     return f"<math>{latex_to_mathml(match.group(1))}</math>"
                 except Exception:
                     # Best-effort fallback: keep the raw LaTeX visible rather
                     # than failing the whole page, but do not swallow
                     # KeyboardInterrupt/SystemExit as a bare ``except`` would.
                     return html.escape(match.group(0))

             # Inline math stays single-line so a stray "\(" cannot swallow
             # following paragraphs.
             text = re.sub(r'\\\((.*?)\\\)', replacer, text)
             # Display math is commonly written across several lines; DOTALL lets
             # "." cross newlines so those blocks are matched too.
             text = re.sub(r'\\\[(.*?)\\\]', replacer, text, flags=re.DOTALL)
             return text
+        math_converted = convert_latex(decoded)
+        markdown_converted = markdown2.markdown(math_converted)
+        html_page = markdown_converted.replace("\n", "<br>")
+        # Add TOC-derived headers if present on this page
+        if page_num in toc_by_page:
+            for level, header in toc_by_page[page_num]:
+                tag = f"h{min(level, 6)}"  # Limit to h6
+                html_page = f"<{tag}>{html.escape(header)}</{tag}>\n" + html_page
+        all_text += f"<div>{html_page}</div>\n"
         if page_num == 1:
             cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
+    # MathJax fallback in case MathML fails (some browsers prefer it)
     mathjax_script = """
     <script type="text/javascript" id="MathJax-script" async
       src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">