leonarb committed on
Commit
bd2cd53
·
verified ·
1 Parent(s): e9af7f8

Cleans up math rendering and adds TOC-derived headers

Browse files
Files changed (1) hide show
  1. app.py +23 -8
app.py CHANGED
@@ -26,10 +26,20 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
  model.to(device)
27
 
28
  def process_pdf_to_html(pdf_file, title, author):
 
 
 
 
29
  pdf_path = pdf_file.name
30
  doc = fitz.open(pdf_path)
31
  num_pages = len(doc)
32
 
 
 
 
 
 
 
33
  all_text = ""
34
  cover_img_html = ""
35
 
@@ -101,28 +111,33 @@ def process_pdf_to_html(pdf_file, title, author):
101
 
102
  print(f"Decoded content for page {page_num}: {decoded}")
103
 
104
- from latex2mathml.converter import convert as latex_to_mathml
105
-
106
  def convert_latex(text):
107
- import re
108
  def replacer(match):
109
  try:
110
  return f"<math>{latex_to_mathml(match.group(1))}</math>"
111
  except:
112
  return html.escape(match.group(0))
113
- # Convert \( ... \)
114
  text = re.sub(r'\\\((.*?)\\\)', replacer, text)
115
- # Convert \[ ... \]
116
  text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
117
  return text
118
 
119
- safe_html = html.escape(decoded).replace("\n", "<br>")
120
- mathml_html = convert_latex(safe_html)
121
- all_text += f"<div>{mathml_html}</div>\n"
 
 
 
 
 
 
 
 
122
 
123
  if page_num == 1:
124
  cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
125
 
 
126
  mathjax_script = """
127
  <script type="text/javascript" id="MathJax-script" async
128
  src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
 
26
  model.to(device)
27
 
28
  def process_pdf_to_html(pdf_file, title, author):
29
+ import re
30
+ import markdown2
31
+ from latex2mathml.converter import convert as latex_to_mathml
32
+
33
  pdf_path = pdf_file.name
34
  doc = fitz.open(pdf_path)
35
  num_pages = len(doc)
36
 
37
+ # Extract TOC as a dict: {page_number: [(level, title), ...]}
38
+ toc_entries = doc.get_toc()
39
+ toc_by_page = {}
40
+ for level, title, page in toc_entries:
41
+ toc_by_page.setdefault(page, []).append((level, title))
42
+
43
  all_text = ""
44
  cover_img_html = ""
45
 
 
111
 
112
  print(f"Decoded content for page {page_num}: {decoded}")
113
 
114
+ # Convert inline and block LaTeX math to MathML
 
115
  def convert_latex(text):
 
116
  def replacer(match):
117
  try:
118
  return f"<math>{latex_to_mathml(match.group(1))}</math>"
119
  except:
120
  return html.escape(match.group(0))
 
121
  text = re.sub(r'\\\((.*?)\\\)', replacer, text)
 
122
  text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
123
  return text
124
 
125
+ math_converted = convert_latex(decoded)
126
+ markdown_converted = markdown2.markdown(math_converted)
127
+ html_page = markdown_converted.replace("\n", "<br>")
128
+
129
+ # Add TOC-derived headers if present on this page
130
+ if page_num in toc_by_page:
131
+ for level, header in toc_by_page[page_num]:
132
+ tag = f"h{min(level, 6)}" # Limit to h6
133
+ html_page = f"<{tag}>{html.escape(header)}</{tag}>\n" + html_page
134
+
135
+ all_text += f"<div>{html_page}</div>\n"
136
 
137
  if page_num == 1:
138
  cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
139
 
140
+ # MathJax fallback in case MathML fails (some browsers prefer it)
141
  mathjax_script = """
142
  <script type="text/javascript" id="MathJax-script" async
143
  src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">