Spaces:

leonarb
/

olmocr-demo

Running

App Files Community

leonarb committed on May 9

Commit

9f080c3

verified ·

1 Parent(s): 5bdec93

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -38

app.py CHANGED Viewed

@@ -10,13 +10,11 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
-from latex2mathml.converter import convert as latex_to_mathml
 import markdown2
 import html
-import json
-import re
-# Load model and processor
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
 ).eval()
@@ -24,32 +22,35 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
-def convert_latex(text):
     def replacer(match):
         try:
             return f"<math>{latex_to_mathml(match.group(1))}</math>"
-        except:
             return html.escape(match.group(0))
     text = re.sub(r'\\\((.*?)\\\)', replacer, text)
     text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
     return text
-def stitch_paragraphs(pages):
-    joined = "\n".join(pages)
-    return re.sub(r"(?<!\n)\n(?!\n)", " ", joined)  # Join lines not separated by double newline
 def process_pdf_to_html(pdf_file, title, author):
     pdf_path = pdf_file.name
     doc = fitz.open(pdf_path)
     num_pages = len(doc)
-    # Extract TOC
     toc_entries = doc.get_toc()
     toc_by_page = {}
-    for level, text, page in toc_entries:
-        toc_by_page.setdefault(page, []).append((level, text))
-    pages_output = []
     cover_img_html = ""
     for i in range(num_pages):
@@ -81,7 +82,12 @@ def process_pdf_to_html(pdf_file, title, author):
             text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
             image = Image.open(BytesIO(base64.b64decode(image_base64)))
-            inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
             inputs = {k: v.to(device) for k, v in inputs.items()}
             output = model.generate(
@@ -95,53 +101,60 @@ def process_pdf_to_html(pdf_file, title, author):
             prompt_len = inputs["input_ids"].shape[1]
             new_tokens = output[:, prompt_len:].detach().cpu()
-            decoded = ""
             if new_tokens.shape[1] > 0:
                 try:
-                    raw = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
-                    parsed = json.loads(raw)
-                    decoded = parsed.get("natural_text", raw)
-                except:
-                    decoded = raw
         except Exception as e:
-            decoded = f"[Error on page {page_num}: {str(e)}]"
-        # Save first image as cover
-        if page_num == 1:
-            cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
-        # Add TOC-based headers if any
-        header_html = ""
         if page_num in toc_by_page:
             for level, header in toc_by_page[page_num]:
                 tag = f"h{min(level, 6)}"
-                header_html += f"<{tag}>{html.escape(header)}</{tag}>\n"
-        pages_output.append(f"{header_html}\n{decoded}")
-    # Join paragraphs across pages
-    stitched = stitch_paragraphs(pages_output)
-    mathml = convert_latex(stitched)
-    rendered = markdown2.markdown(mathml)
-    html_doc = f"""<!DOCTYPE html>
     <html>
     <head>
-        <meta charset='utf-8'>
         <title>{html.escape(title)}</title>
     </head>
     <body>
         <h1>{html.escape(title)}</h1>
         <h3>{html.escape(author)}</h3>
         {cover_img_html}
-        {rendered}
     </body>
     </html>
     """
     with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
-        tmp.write(html_doc)
         return tmp.name
 iface = gr.Interface(
@@ -152,8 +165,8 @@ iface = gr.Interface(
         gr.Textbox(label="Author(s)")
     ],
     outputs=gr.File(label="Download HTML"),
-    title="PDF to HTML Converter (Refined with olmOCR)",
-    description="Uploads a PDF, extracts text via vision+prompt, stitches paragraphs, adds headers, and converts math and markdown to styled HTML.",
     allow_flagging="never"
 )

 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
+import re
 import markdown2
+from latex2mathml.converter import convert as latex_to_mathml
 import html
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
 ).eval()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
+def convert_latex_to_mathml(text):
     def replacer(match):
         try:
             return f"<math>{latex_to_mathml(match.group(1))}</math>"
+        except Exception:
             return html.escape(match.group(0))
     text = re.sub(r'\\\((.*?)\\\)', replacer, text)
     text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
     return text
+def clean_page_headers(text):
+    lines = text.split("\n")
+    cleaned = []
+    for line in lines:
+        if not re.match(r'^(\s*Page \d+|\s*\d{1,2}\s*/\s*\d{1,2}|^[A-Z][A-Za-z\s]{0,20}$)', line.strip()):
+            cleaned.append(line)
+    return "\n".join(cleaned)
 def process_pdf_to_html(pdf_file, title, author):
     pdf_path = pdf_file.name
     doc = fitz.open(pdf_path)
     num_pages = len(doc)
     toc_entries = doc.get_toc()
     toc_by_page = {}
+    for level, header, page in toc_entries:
+        toc_by_page.setdefault(page, []).append((level, header))
+    all_text = ""
     cover_img_html = ""
     for i in range(num_pages):
             text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
             image = Image.open(BytesIO(base64.b64decode(image_base64)))
+            inputs = processor(
+                text=[text],
+                images=[image],
+                padding=True,
+                return_tensors="pt",
+            )
             inputs = {k: v.to(device) for k, v in inputs.items()}
             output = model.generate(
             prompt_len = inputs["input_ids"].shape[1]
             new_tokens = output[:, prompt_len:].detach().cpu()
+            decoded = "[No output generated]"
             if new_tokens.shape[1] > 0:
+                decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
+                raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
                 try:
+                    parsed = json.loads(raw_output)
+                    decoded = parsed.get("natural_text", raw_output)
+                except json.JSONDecodeError:
+                    decoded = raw_output
         except Exception as e:
+            decoded = f"[Error on page {page_num}: {e}]"
+        print(f"Decoded content for page {page_num}: {decoded}")
+        cleaned_text = clean_page_headers(decoded)
+        mathml_converted = convert_latex_to_mathml(cleaned_text)
+        markdown_converted = markdown2.markdown(mathml_converted)
+        html_page = markdown_converted.replace("\n", "<br>")
         if page_num in toc_by_page:
             for level, header in toc_by_page[page_num]:
                 tag = f"h{min(level, 6)}"
+                html_page = f"<{tag}>{html.escape(header)}</{tag}>\n" + html_page
+        all_text += f"<div>{html_page}</div>\n"
+        if page_num == 1:
+            cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
+    mathjax_script = """
+    <script type="text/javascript" id="MathJax-script" async
+      src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
+    </script>
+    """
+    full_html = f"""<!DOCTYPE html>
     <html>
     <head>
+        <meta charset="utf-8">
         <title>{html.escape(title)}</title>
+        {mathjax_script}
     </head>
     <body>
         <h1>{html.escape(title)}</h1>
         <h3>{html.escape(author)}</h3>
         {cover_img_html}
+        {all_text}
     </body>
     </html>
     """
     with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
+        tmp.write(full_html)
         return tmp.name
 iface = gr.Interface(
         gr.Textbox(label="Author(s)")
     ],
     outputs=gr.File(label="Download HTML"),
+    title="PDF to HTML Converter with Structure (olmOCR)",
+    description="Extracts text with structure, math, and footnotes using olmOCR and renders to styled HTML.",
     allow_flagging="never"
 )