leonarb committed on
Commit
2a16ca6
·
verified ·
1 Parent(s): bd2cd53

Fixes math/headers/tables/etc...

Browse files
Files changed (1) hide show
  1. app.py +53 -74
app.py CHANGED
@@ -5,17 +5,16 @@ import fitz # PyMuPDF
5
  import tempfile
6
  from io import BytesIO
7
  from PIL import Image
8
- from pathlib import Path
9
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
10
 
11
  from olmocr.data.renderpdf import render_pdf_to_base64png
12
  from olmocr.prompts.anchor import get_anchor_text
13
 
14
- from mathml_utils import convert_inline_and_block_latex_to_mathml
15
-
16
- from ebooklib import epub
17
- import json
18
  import html
 
 
19
 
20
  # Load model and processor
21
  model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -25,22 +24,32 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
25
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
  model.to(device)
27
 
28
- def process_pdf_to_html(pdf_file, title, author):
29
- import re
30
- import markdown2
31
- from latex2mathml.converter import convert as latex_to_mathml
 
 
 
 
 
 
 
 
 
32
 
 
33
  pdf_path = pdf_file.name
34
  doc = fitz.open(pdf_path)
35
  num_pages = len(doc)
36
 
37
- # Extract TOC as a dict: {page_number: [(level, title), ...]}
38
  toc_entries = doc.get_toc()
39
  toc_by_page = {}
40
- for level, title, page in toc_entries:
41
- toc_by_page.setdefault(page, []).append((level, title))
42
 
43
- all_text = ""
44
  cover_img_html = ""
45
 
46
  for i in range(num_pages):
@@ -72,12 +81,7 @@ def process_pdf_to_html(pdf_file, title, author):
72
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
73
  image = Image.open(BytesIO(base64.b64decode(image_base64)))
74
 
75
- inputs = processor(
76
- text=[text],
77
- images=[image],
78
- padding=True,
79
- return_tensors="pt",
80
- )
81
  inputs = {k: v.to(device) for k, v in inputs.items()}
82
 
83
  output = model.generate(
@@ -88,93 +92,68 @@ def process_pdf_to_html(pdf_file, title, author):
88
  do_sample=True,
89
  )
90
 
91
- prompt_length = inputs["input_ids"].shape[1]
92
- new_tokens = output[:, prompt_length:].detach().cpu()
93
 
94
- decoded = "[No output generated]"
95
- if new_tokens is not None and new_tokens.shape[1] > 0:
96
  try:
97
- decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
98
- raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
99
- try:
100
- parsed = json.loads(raw_output)
101
- decoded = parsed.get("natural_text", raw_output)
102
- except json.JSONDecodeError:
103
- decoded = raw_output
104
- except Exception as decode_error:
105
- decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
106
- else:
107
- decoded = "[Model returned no new tokens]"
108
-
109
- except Exception as processing_error:
110
- decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
111
-
112
- print(f"Decoded content for page {page_num}: {decoded}")
113
-
114
- # Convert inline and block LaTeX math to MathML
115
- def convert_latex(text):
116
- def replacer(match):
117
- try:
118
- return f"<math>{latex_to_mathml(match.group(1))}</math>"
119
  except:
120
- return html.escape(match.group(0))
121
- text = re.sub(r'\\\((.*?)\\\)', replacer, text)
122
- text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
123
- return text
124
 
125
- math_converted = convert_latex(decoded)
126
- markdown_converted = markdown2.markdown(math_converted)
127
- html_page = markdown_converted.replace("\n", "<br>")
128
 
129
- # Add TOC-derived headers if present on this page
 
130
  if page_num in toc_by_page:
131
  for level, header in toc_by_page[page_num]:
132
- tag = f"h{min(level, 6)}" # Limit to h6
133
- html_page = f"<{tag}>{html.escape(header)}</{tag}>\n" + html_page
134
 
135
- all_text += f"<div>{html_page}</div>\n"
136
 
137
- if page_num == 1:
138
- cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
139
-
140
- # MathJax fallback in case MathML fails (some browsers prefer it)
141
- mathjax_script = """
142
- <script type="text/javascript" id="MathJax-script" async
143
- src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
144
- </script>
145
- """
146
 
147
- full_html = f"""<!DOCTYPE html>
148
  <html>
149
  <head>
150
- <meta charset="utf-8">
151
  <title>{html.escape(title)}</title>
152
- {mathjax_script}
153
  </head>
154
  <body>
155
  <h1>{html.escape(title)}</h1>
156
  <h3>{html.escape(author)}</h3>
157
  {cover_img_html}
158
- {all_text}
159
  </body>
160
  </html>
161
  """
162
 
163
  with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
164
- tmp.write(full_html)
165
  return tmp.name
166
 
167
- # Gradio Interface
168
  iface = gr.Interface(
169
- fn=process_pdf_to_html, # NEW FUNCTION
170
  inputs=[
171
  gr.File(label="Upload PDF", file_types=[".pdf"]),
172
  gr.Textbox(label="HTML Title"),
173
  gr.Textbox(label="Author(s)")
174
  ],
175
  outputs=gr.File(label="Download HTML"),
176
- title="PDF to HTML Converter (for Calibre/Kindle)",
177
- description="Uploads a PDF, extracts text via vision+prompt, embeds it in a styled HTML file with math support. Ready for Calibre.",
178
  allow_flagging="never"
179
  )
180
 
 
5
  import tempfile
6
  from io import BytesIO
7
  from PIL import Image
 
8
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
9
 
10
  from olmocr.data.renderpdf import render_pdf_to_base64png
11
  from olmocr.prompts.anchor import get_anchor_text
12
 
13
+ from latex2mathml.converter import convert as latex_to_mathml
14
+ import markdown2
 
 
15
  import html
16
+ import json
17
+ import re
18
 
19
  # Load model and processor
20
  model = Qwen2VLForConditionalGeneration.from_pretrained(
 
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
  model.to(device)
26
 
27
+ def convert_latex(text):
28
+ def replacer(match):
29
+ try:
30
+ return f"<math>{latex_to_mathml(match.group(1))}</math>"
31
+ except:
32
+ return html.escape(match.group(0))
33
+ text = re.sub(r'\\\((.*?)\\\)', replacer, text)
34
+ text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
35
+ return text
36
+
37
+ def stitch_paragraphs(pages):
38
+ joined = "\n".join(pages)
39
+ return re.sub(r"(?<!\n)\n(?!\n)", " ", joined) # Join lines not separated by double newline
40
 
41
+ def process_pdf_to_html(pdf_file, title, author):
42
  pdf_path = pdf_file.name
43
  doc = fitz.open(pdf_path)
44
  num_pages = len(doc)
45
 
46
+ # Extract TOC
47
  toc_entries = doc.get_toc()
48
  toc_by_page = {}
49
+ for level, text, page in toc_entries:
50
+ toc_by_page.setdefault(page, []).append((level, text))
51
 
52
+ pages_output = []
53
  cover_img_html = ""
54
 
55
  for i in range(num_pages):
 
81
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
82
  image = Image.open(BytesIO(base64.b64decode(image_base64)))
83
 
84
+ inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
 
 
 
 
 
85
  inputs = {k: v.to(device) for k, v in inputs.items()}
86
 
87
  output = model.generate(
 
92
  do_sample=True,
93
  )
94
 
95
+ prompt_len = inputs["input_ids"].shape[1]
96
+ new_tokens = output[:, prompt_len:].detach().cpu()
97
 
98
+ decoded = ""
99
+ if new_tokens.shape[1] > 0:
100
  try:
101
+ raw = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
102
+ parsed = json.loads(raw)
103
+ decoded = parsed.get("natural_text", raw)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  except:
105
+ decoded = raw
106
+
107
+ except Exception as e:
108
+ decoded = f"[Error on page {page_num}: {str(e)}]"
109
 
110
+ # Save first image as cover
111
+ if page_num == 1:
112
+ cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
113
 
114
+ # Add TOC-based headers if any
115
+ header_html = ""
116
  if page_num in toc_by_page:
117
  for level, header in toc_by_page[page_num]:
118
+ tag = f"h{min(level, 6)}"
119
+ header_html += f"<{tag}>{html.escape(header)}</{tag}>\n"
120
 
121
+ pages_output.append(f"{header_html}\n{decoded}")
122
 
123
+ # Join paragraphs across pages
124
+ stitched = stitch_paragraphs(pages_output)
125
+ mathml = convert_latex(stitched)
126
+ rendered = markdown2.markdown(mathml)
 
 
 
 
 
127
 
128
+ html_doc = f"""<!DOCTYPE html>
129
  <html>
130
  <head>
131
+ <meta charset='utf-8'>
132
  <title>{html.escape(title)}</title>
 
133
  </head>
134
  <body>
135
  <h1>{html.escape(title)}</h1>
136
  <h3>{html.escape(author)}</h3>
137
  {cover_img_html}
138
+ {rendered}
139
  </body>
140
  </html>
141
  """
142
 
143
  with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
144
+ tmp.write(html_doc)
145
  return tmp.name
146
 
 
147
  iface = gr.Interface(
148
+ fn=process_pdf_to_html,
149
  inputs=[
150
  gr.File(label="Upload PDF", file_types=[".pdf"]),
151
  gr.Textbox(label="HTML Title"),
152
  gr.Textbox(label="Author(s)")
153
  ],
154
  outputs=gr.File(label="Download HTML"),
155
+ title="PDF to HTML Converter (Refined with olmOCR)",
156
+ description="Uploads a PDF, extracts text via vision+prompt, stitches paragraphs, adds headers, and converts math and markdown to styled HTML.",
157
  allow_flagging="never"
158
  )
159