leonarb committed on
Commit
2a16ca6
·
verified ·
1 Parent(s): bd2cd53

Fixes math/headers/tables/etc...

Browse files
Files changed (1) hide show
  1. app.py +53 -74
app.py CHANGED
@@ -5,17 +5,16 @@ import fitz # PyMuPDF
5
  import tempfile
6
  from io import BytesIO
7
  from PIL import Image
8
- from pathlib import Path
9
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
10
 
11
  from olmocr.data.renderpdf import render_pdf_to_base64png
12
  from olmocr.prompts.anchor import get_anchor_text
13
 
14
- from mathml_utils import convert_inline_and_block_latex_to_mathml
15
-
16
- from ebooklib import epub
17
- import json
18
  import html
 
 
19
 
20
  # Load model and processor
21
  model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -25,22 +24,32 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
25
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
  model.to(device)
27
 
28
- def process_pdf_to_html(pdf_file, title, author):
29
- import re
30
- import markdown2
31
- from latex2mathml.converter import convert as latex_to_mathml
 
 
 
 
 
 
 
 
 
32
 
 
33
  pdf_path = pdf_file.name
34
  doc = fitz.open(pdf_path)
35
  num_pages = len(doc)
36
 
37
- # Extract TOC as a dict: {page_number: [(level, title), ...]}
38
  toc_entries = doc.get_toc()
39
  toc_by_page = {}
40
- for level, title, page in toc_entries:
41
- toc_by_page.setdefault(page, []).append((level, title))
42
 
43
- all_text = ""
44
  cover_img_html = ""
45
 
46
  for i in range(num_pages):
@@ -72,12 +81,7 @@ def process_pdf_to_html(pdf_file, title, author):
72
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
73
  image = Image.open(BytesIO(base64.b64decode(image_base64)))
74
 
75
- inputs = processor(
76
- text=[text],
77
- images=[image],
78
- padding=True,
79
- return_tensors="pt",
80
- )
81
  inputs = {k: v.to(device) for k, v in inputs.items()}
82
 
83
  output = model.generate(
@@ -88,93 +92,68 @@ def process_pdf_to_html(pdf_file, title, author):
88
  do_sample=True,
89
  )
90
 
91
- prompt_length = inputs["input_ids"].shape[1]
92
- new_tokens = output[:, prompt_length:].detach().cpu()
93
 
94
- decoded = "[No output generated]"
95
- if new_tokens is not None and new_tokens.shape[1] > 0:
96
  try:
97
- decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
98
- raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
99
- try:
100
- parsed = json.loads(raw_output)
101
- decoded = parsed.get("natural_text", raw_output)
102
- except json.JSONDecodeError:
103
- decoded = raw_output
104
- except Exception as decode_error:
105
- decoded = f"[Decoding error on page {page_num}: {str(decode_error)}]"
106
- else:
107
- decoded = "[Model returned no new tokens]"
108
-
109
- except Exception as processing_error:
110
- decoded = f"[Processing error on page {page_num}: {str(processing_error)}]"
111
-
112
- print(f"Decoded content for page {page_num}: {decoded}")
113
-
114
- # Convert inline and block LaTeX math to MathML
115
- def convert_latex(text):
116
- def replacer(match):
117
- try:
118
- return f"<math>{latex_to_mathml(match.group(1))}</math>"
119
  except:
120
- return html.escape(match.group(0))
121
- text = re.sub(r'\\\((.*?)\\\)', replacer, text)
122
- text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
123
- return text
124
 
125
- math_converted = convert_latex(decoded)
126
- markdown_converted = markdown2.markdown(math_converted)
127
- html_page = markdown_converted.replace("\n", "<br>")
128
 
129
- # Add TOC-derived headers if present on this page
 
130
  if page_num in toc_by_page:
131
  for level, header in toc_by_page[page_num]:
132
- tag = f"h{min(level, 6)}" # Limit to h6
133
- html_page = f"<{tag}>{html.escape(header)}</{tag}>\n" + html_page
134
 
135
- all_text += f"<div>{html_page}</div>\n"
136
 
137
- if page_num == 1:
138
- cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
139
-
140
- # MathJax fallback in case MathML fails (some browsers prefer it)
141
- mathjax_script = """
142
- <script type="text/javascript" id="MathJax-script" async
143
- src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
144
- </script>
145
- """
146
 
147
- full_html = f"""<!DOCTYPE html>
148
  <html>
149
  <head>
150
- <meta charset="utf-8">
151
  <title>{html.escape(title)}</title>
152
- {mathjax_script}
153
  </head>
154
  <body>
155
  <h1>{html.escape(title)}</h1>
156
  <h3>{html.escape(author)}</h3>
157
  {cover_img_html}
158
- {all_text}
159
  </body>
160
  </html>
161
  """
162
 
163
  with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
164
- tmp.write(full_html)
165
  return tmp.name
166
 
167
- # Gradio Interface
168
  iface = gr.Interface(
169
- fn=process_pdf_to_html, # NEW FUNCTION
170
  inputs=[
171
  gr.File(label="Upload PDF", file_types=[".pdf"]),
172
  gr.Textbox(label="HTML Title"),
173
  gr.Textbox(label="Author(s)")
174
  ],
175
  outputs=gr.File(label="Download HTML"),
176
- title="PDF to HTML Converter (for Calibre/Kindle)",
177
- description="Uploads a PDF, extracts text via vision+prompt, embeds it in a styled HTML file with math support. Ready for Calibre.",
178
  allow_flagging="never"
179
  )
180
 
 
5
  import tempfile
6
  from io import BytesIO
7
  from PIL import Image
 
8
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
9
 
10
  from olmocr.data.renderpdf import render_pdf_to_base64png
11
  from olmocr.prompts.anchor import get_anchor_text
12
 
13
+ from latex2mathml.converter import convert as latex_to_mathml
14
+ import markdown2
 
 
15
  import html
16
+ import json
17
+ import re
18
 
19
  # Load model and processor
20
  model = Qwen2VLForConditionalGeneration.from_pretrained(
 
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
  model.to(device)
26
 
27
+ def convert_latex(text):
28
+ def replacer(match):
29
+ try:
30
+ return f"<math>{latex_to_mathml(match.group(1))}</math>"
31
+ except:
32
+ return html.escape(match.group(0))
33
+ text = re.sub(r'\\\((.*?)\\\)', replacer, text)
34
+ text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
35
+ return text
36
+
37
+ def stitch_paragraphs(pages):
38
+ joined = "\n".join(pages)
39
+ return re.sub(r"(?<!\n)\n(?!\n)", " ", joined) # Join lines not separated by double newline
40
 
41
+ def process_pdf_to_html(pdf_file, title, author):
42
  pdf_path = pdf_file.name
43
  doc = fitz.open(pdf_path)
44
  num_pages = len(doc)
45
 
46
+ # Extract TOC
47
  toc_entries = doc.get_toc()
48
  toc_by_page = {}
49
+ for level, text, page in toc_entries:
50
+ toc_by_page.setdefault(page, []).append((level, text))
51
 
52
+ pages_output = []
53
  cover_img_html = ""
54
 
55
  for i in range(num_pages):
 
81
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
82
  image = Image.open(BytesIO(base64.b64decode(image_base64)))
83
 
84
+ inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
 
 
 
 
 
85
  inputs = {k: v.to(device) for k, v in inputs.items()}
86
 
87
  output = model.generate(
 
92
  do_sample=True,
93
  )
94
 
95
+ prompt_len = inputs["input_ids"].shape[1]
96
+ new_tokens = output[:, prompt_len:].detach().cpu()
97
 
98
+ decoded = ""
99
+ if new_tokens.shape[1] > 0:
100
  try:
101
+ raw = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
102
+ parsed = json.loads(raw)
103
+ decoded = parsed.get("natural_text", raw)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  except:
105
+ decoded = raw
106
+
107
+ except Exception as e:
108
+ decoded = f"[Error on page {page_num}: {str(e)}]"
109
 
110
+ # Save first image as cover
111
+ if page_num == 1:
112
+ cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
113
 
114
+ # Add TOC-based headers if any
115
+ header_html = ""
116
  if page_num in toc_by_page:
117
  for level, header in toc_by_page[page_num]:
118
+ tag = f"h{min(level, 6)}"
119
+ header_html += f"<{tag}>{html.escape(header)}</{tag}>\n"
120
 
121
+ pages_output.append(f"{header_html}\n{decoded}")
122
 
123
+ # Join paragraphs across pages
124
+ stitched = stitch_paragraphs(pages_output)
125
+ mathml = convert_latex(stitched)
126
+ rendered = markdown2.markdown(mathml)
 
 
 
 
 
127
 
128
+ html_doc = f"""<!DOCTYPE html>
129
  <html>
130
  <head>
131
+ <meta charset='utf-8'>
132
  <title>{html.escape(title)}</title>
 
133
  </head>
134
  <body>
135
  <h1>{html.escape(title)}</h1>
136
  <h3>{html.escape(author)}</h3>
137
  {cover_img_html}
138
+ {rendered}
139
  </body>
140
  </html>
141
  """
142
 
143
  with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
144
+ tmp.write(html_doc)
145
  return tmp.name
146
 
 
147
  iface = gr.Interface(
148
+ fn=process_pdf_to_html,
149
  inputs=[
150
  gr.File(label="Upload PDF", file_types=[".pdf"]),
151
  gr.Textbox(label="HTML Title"),
152
  gr.Textbox(label="Author(s)")
153
  ],
154
  outputs=gr.File(label="Download HTML"),
155
+ title="PDF to HTML Converter (Refined with olmOCR)",
156
+ description="Uploads a PDF, extracts text via vision+prompt, stitches paragraphs, adds headers, and converts math and markdown to styled HTML.",
157
  allow_flagging="never"
158
  )
159