Spaces:
Running
Running
Cleans math rendering and TOC header stuff
Browse files
app.py
CHANGED
@@ -26,10 +26,20 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
26 |
model.to(device)
|
27 |
|
28 |
def process_pdf_to_html(pdf_file, title, author):
|
|
|
|
|
|
|
|
|
29 |
pdf_path = pdf_file.name
|
30 |
doc = fitz.open(pdf_path)
|
31 |
num_pages = len(doc)
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
all_text = ""
|
34 |
cover_img_html = ""
|
35 |
|
@@ -101,28 +111,33 @@ def process_pdf_to_html(pdf_file, title, author):
|
|
101 |
|
102 |
print(f"Decoded content for page {page_num}: {decoded}")
|
103 |
|
104 |
-
|
105 |
-
|
106 |
def convert_latex(text):
    """Replace LaTeX math delimited by \\( ... \\) or \\[ ... \\] with MathML.

    Each delimited span is handed to ``latex_to_mathml``. If the conversion
    raises, the original span (delimiters included) is kept, HTML-escaped.

    Args:
        text: Source text that may contain LaTeX math spans.

    Returns:
        The text with every convertible math span replaced by markup.
    """
    import re

    def replacer(match):
        try:
            # The converter's result is inserted as-is: latex2mathml's
            # convert() already returns a complete <math> element, so adding
            # another <math> wrapper would nest two of them.
            return latex_to_mathml(match.group(1))
        except Exception:
            # Best effort: keep the unconverted source visible, but escaped.
            return html.escape(match.group(0))

    # DOTALL lets a math span run across line breaks (common for \[ ... \]).
    # Convert \( ... \)
    text = re.sub(r'\\\((.*?)\\\)', replacer, text, flags=re.DOTALL)
    # Convert \[ ... \]
    text = re.sub(r'\\\[(.*?)\\\]', replacer, text, flags=re.DOTALL)
    return text
|
118 |
|
119 |
-
|
120 |
-
|
121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
123 |
if page_num == 1:
|
124 |
cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
|
125 |
|
|
|
126 |
mathjax_script = """
|
127 |
<script type="text/javascript" id="MathJax-script" async
|
128 |
src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
|
|
|
26 |
model.to(device)
|
27 |
|
28 |
def process_pdf_to_html(pdf_file, title, author):
|
29 |
+
import re
|
30 |
+
import markdown2
|
31 |
+
from latex2mathml.converter import convert as latex_to_mathml
|
32 |
+
|
33 |
pdf_path = pdf_file.name
|
34 |
doc = fitz.open(pdf_path)
|
35 |
num_pages = len(doc)
|
36 |
|
37 |
+
# Extract TOC as a dict: {page_number: [(level, title), ...]}
toc_entries = doc.get_toc()
toc_by_page = {}
# The loop variable is deliberately not called `title`: that name is a
# parameter of process_pdf_to_html, and unpacking into it would overwrite
# the caller-supplied value with the last TOC entry.
for level, toc_title, page in toc_entries:
    toc_by_page.setdefault(page, []).append((level, toc_title))
|
42 |
+
|
43 |
all_text = ""
|
44 |
cover_img_html = ""
|
45 |
|
|
|
111 |
|
112 |
print(f"Decoded content for page {page_num}: {decoded}")
|
113 |
|
114 |
+
# Convert inline and block LaTeX math to MathML
|
|
|
115 |
def convert_latex(text):
    """Replace LaTeX math delimited by \\( ... \\) or \\[ ... \\] with MathML.

    Each delimited span is handed to ``latex_to_mathml``. If the conversion
    raises, the original span (delimiters included) is kept, HTML-escaped.

    Args:
        text: Source text that may contain LaTeX math spans.

    Returns:
        The text with every convertible math span replaced by markup.
    """
    def replacer(match):
        try:
            # The converter's result is inserted as-is: latex2mathml's
            # convert() already returns a complete <math> element, so adding
            # another <math> wrapper would nest two of them.
            return latex_to_mathml(match.group(1))
        except Exception:
            # Best effort: keep the unconverted source visible, but escaped.
            return html.escape(match.group(0))

    # DOTALL lets a math span run across line breaks (common for \[ ... \]).
    for pattern in (r'\\\((.*?)\\\)', r'\\\[(.*?)\\\]'):
        text = re.sub(pattern, replacer, text, flags=re.DOTALL)
    return text
|
124 |
|
125 |
+
# Page pipeline: LaTeX math -> MathML, then Markdown -> HTML, then turn the
# remaining newlines into explicit line breaks.
math_converted = convert_latex(decoded)
markdown_converted = markdown2.markdown(math_converted)
html_page = markdown_converted.replace("\n", "<br>")

# Add TOC-derived headers if present on this page
if page_num in toc_by_page:
    # Build the whole header run first and prepend it once. Prepending inside
    # the loop would emit several headers for one page in reverse order.
    toc_headers = "".join(
        f"<h{min(level, 6)}>{html.escape(header)}</h{min(level, 6)}>\n"  # Limit to h6
        for level, header in toc_by_page[page_num]
    )
    html_page = toc_headers + html_page

all_text += f"<div>{html_page}</div>\n"
|
136 |
|
137 |
if page_num == 1:
|
138 |
cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
|
139 |
|
140 |
+
# MathJax fallback in case MathML fails (some browsers prefer it)
|
141 |
mathjax_script = """
|
142 |
<script type="text/javascript" id="MathJax-script" async
|
143 |
src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
|