leonarb committed on
Commit
9f080c3
·
verified ·
1 Parent(s): 5bdec93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -38
app.py CHANGED
@@ -10,13 +10,11 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
10
  from olmocr.data.renderpdf import render_pdf_to_base64png
11
  from olmocr.prompts.anchor import get_anchor_text
12
 
13
- from latex2mathml.converter import convert as latex_to_mathml
14
  import markdown2
 
15
  import html
16
- import json
17
- import re
18
 
19
- # Load model and processor
20
  model = Qwen2VLForConditionalGeneration.from_pretrained(
21
  "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
22
  ).eval()
@@ -24,32 +22,35 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
24
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
  model.to(device)
26
 
27
- def convert_latex(text):
28
  def replacer(match):
29
  try:
30
  return f"<math>{latex_to_mathml(match.group(1))}</math>"
31
- except:
32
  return html.escape(match.group(0))
33
  text = re.sub(r'\\\((.*?)\\\)', replacer, text)
34
  text = re.sub(r'\\\[(.*?)\\\]', replacer, text)
35
  return text
36
 
37
- def stitch_paragraphs(pages):
38
- joined = "\n".join(pages)
39
- return re.sub(r"(?<!\n)\n(?!\n)", " ", joined) # Join lines not separated by double newline
 
 
 
 
40
 
41
  def process_pdf_to_html(pdf_file, title, author):
42
  pdf_path = pdf_file.name
43
  doc = fitz.open(pdf_path)
44
  num_pages = len(doc)
45
 
46
- # Extract TOC
47
  toc_entries = doc.get_toc()
48
  toc_by_page = {}
49
- for level, text, page in toc_entries:
50
- toc_by_page.setdefault(page, []).append((level, text))
51
 
52
- pages_output = []
53
  cover_img_html = ""
54
 
55
  for i in range(num_pages):
@@ -81,7 +82,12 @@ def process_pdf_to_html(pdf_file, title, author):
81
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
82
  image = Image.open(BytesIO(base64.b64decode(image_base64)))
83
 
84
- inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
 
 
 
 
 
85
  inputs = {k: v.to(device) for k, v in inputs.items()}
86
 
87
  output = model.generate(
@@ -95,53 +101,60 @@ def process_pdf_to_html(pdf_file, title, author):
95
  prompt_len = inputs["input_ids"].shape[1]
96
  new_tokens = output[:, prompt_len:].detach().cpu()
97
 
98
- decoded = ""
99
  if new_tokens.shape[1] > 0:
 
 
100
  try:
101
- raw = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
102
- parsed = json.loads(raw)
103
- decoded = parsed.get("natural_text", raw)
104
- except:
105
- decoded = raw
106
 
107
  except Exception as e:
108
- decoded = f"[Error on page {page_num}: {str(e)}]"
109
 
110
- # Save first image as cover
111
- if page_num == 1:
112
- cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
 
 
 
113
 
114
- # Add TOC-based headers if any
115
- header_html = ""
116
  if page_num in toc_by_page:
117
  for level, header in toc_by_page[page_num]:
118
  tag = f"h{min(level, 6)}"
119
- header_html += f"<{tag}>{html.escape(header)}</{tag}>\n"
 
 
120
 
121
- pages_output.append(f"{header_html}\n{decoded}")
 
122
 
123
- # Join paragraphs across pages
124
- stitched = stitch_paragraphs(pages_output)
125
- mathml = convert_latex(stitched)
126
- rendered = markdown2.markdown(mathml)
 
127
 
128
- html_doc = f"""<!DOCTYPE html>
129
  <html>
130
  <head>
131
- <meta charset='utf-8'>
132
  <title>{html.escape(title)}</title>
 
133
  </head>
134
  <body>
135
  <h1>{html.escape(title)}</h1>
136
  <h3>{html.escape(author)}</h3>
137
  {cover_img_html}
138
- {rendered}
139
  </body>
140
  </html>
141
  """
142
 
143
  with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
144
- tmp.write(html_doc)
145
  return tmp.name
146
 
147
  iface = gr.Interface(
@@ -152,8 +165,8 @@ iface = gr.Interface(
152
  gr.Textbox(label="Author(s)")
153
  ],
154
  outputs=gr.File(label="Download HTML"),
155
- title="PDF to HTML Converter (Refined with olmOCR)",
156
- description="Uploads a PDF, extracts text via vision+prompt, stitches paragraphs, adds headers, and converts math and markdown to styled HTML.",
157
  allow_flagging="never"
158
  )
159
 
 
10
  from olmocr.data.renderpdf import render_pdf_to_base64png
11
  from olmocr.prompts.anchor import get_anchor_text
12
 
13
+ import re
14
  import markdown2
15
+ from latex2mathml.converter import convert as latex_to_mathml
16
  import html
 
 
17
 
 
18
  model = Qwen2VLForConditionalGeneration.from_pretrained(
19
  "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
20
  ).eval()
 
22
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
  model.to(device)
24
 
25
def convert_latex_to_mathml(text):
    r"""Replace delimited LaTeX math in *text* with MathML markup.

    Two delimiter styles are recognised: ``\( ... \)`` and ``\[ ... \]``.
    Each delimited span is handed to ``latex_to_mathml`` and replaced by the
    string it returns.

    Args:
        text: Source string that may contain delimited LaTeX fragments.

    Returns:
        The string with every convertible fragment replaced. A fragment whose
        conversion raises is kept in place, HTML-escaped (delimiters
        included), so one bad formula never aborts the whole page.
    """
    def replacer(match):
        try:
            # The converter's result is already a complete <math> element, so
            # it is inserted as-is; wrapping it again would nest <math> tags.
            return latex_to_mathml(match.group(1))
        except Exception:
            # Best effort: keep the raw LaTeX visible but safe to embed.
            return html.escape(match.group(0))

    # DOTALL lets `.` cross newlines so formulas spanning several lines
    # (typical for \[ ... \] blocks) are matched instead of being left raw.
    text = re.sub(r'\\\((.*?)\\\)', replacer, text, flags=re.DOTALL)
    text = re.sub(r'\\\[(.*?)\\\]', replacer, text, flags=re.DOTALL)
    return text
34
 
35
def clean_page_headers(text):
    """Drop lines that look like page headers or footers from *text*.

    A line is removed when, after stripping surrounding whitespace, the
    WHOLE line is one of:

    * a page marker such as ``Page 12`` or ``Page 12 of 40``;
    * a page counter such as ``3/10`` or ``120 / 340``;
    * a short run of letters and whitespace starting with a capital letter
      (at most 21 characters), treated as a running header.

    Matching the whole line matters: a body sentence that merely begins with
    "Page 3 ..." or "12/25 ..." is kept.

    NOTE(review): the running-header heuristic also removes genuine short
    lines such as "Abstract" or "Introduction" -- confirm this is intended.

    Args:
        text: Page text with lines separated by ``"\\n"``.

    Returns:
        The text with matching lines removed; remaining lines keep their
        original order and whitespace.
    """
    # Patterns are applied with fullmatch(), so no explicit ^/$ is needed.
    page_marker = re.compile(r'Page \d+(?:\s+of\s+\d+)?|\d{1,4}\s*/\s*\d{1,4}')
    running_header = re.compile(r'[A-Z][A-Za-z\s]{0,20}')

    kept = []
    for line in text.split("\n"):
        candidate = line.strip()
        if page_marker.fullmatch(candidate) or running_header.fullmatch(candidate):
            continue
        kept.append(line)
    return "\n".join(kept)
42
 
43
  def process_pdf_to_html(pdf_file, title, author):
44
  pdf_path = pdf_file.name
45
  doc = fitz.open(pdf_path)
46
  num_pages = len(doc)
47
 
 
48
  toc_entries = doc.get_toc()
49
  toc_by_page = {}
50
+ for level, header, page in toc_entries:
51
+ toc_by_page.setdefault(page, []).append((level, header))
52
 
53
+ all_text = ""
54
  cover_img_html = ""
55
 
56
  for i in range(num_pages):
 
82
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
83
  image = Image.open(BytesIO(base64.b64decode(image_base64)))
84
 
85
+ inputs = processor(
86
+ text=[text],
87
+ images=[image],
88
+ padding=True,
89
+ return_tensors="pt",
90
+ )
91
  inputs = {k: v.to(device) for k, v in inputs.items()}
92
 
93
  output = model.generate(
 
101
  prompt_len = inputs["input_ids"].shape[1]
102
  new_tokens = output[:, prompt_len:].detach().cpu()
103
 
104
+ decoded = "[No output generated]"
105
  if new_tokens.shape[1] > 0:
106
+ decoded_list = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
107
+ raw_output = decoded_list[0].strip() if decoded_list else "[No output generated]"
108
  try:
109
+ parsed = json.loads(raw_output)
110
+ decoded = parsed.get("natural_text", raw_output)
111
+ except json.JSONDecodeError:
112
+ decoded = raw_output
 
113
 
114
  except Exception as e:
115
+ decoded = f"[Error on page {page_num}: {e}]"
116
 
117
+ print(f"Decoded content for page {page_num}: {decoded}")
118
+
119
+ cleaned_text = clean_page_headers(decoded)
120
+ mathml_converted = convert_latex_to_mathml(cleaned_text)
121
+ markdown_converted = markdown2.markdown(mathml_converted)
122
+ html_page = markdown_converted.replace("\n", "<br>")
123
 
 
 
124
  if page_num in toc_by_page:
125
  for level, header in toc_by_page[page_num]:
126
  tag = f"h{min(level, 6)}"
127
+ html_page = f"<{tag}>{html.escape(header)}</{tag}>\n" + html_page
128
+
129
+ all_text += f"<div>{html_page}</div>\n"
130
 
131
+ if page_num == 1:
132
+ cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
133
 
134
+ mathjax_script = """
135
+ <script type="text/javascript" id="MathJax-script" async
136
+ src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
137
+ </script>
138
+ """
139
 
140
+ full_html = f"""<!DOCTYPE html>
141
  <html>
142
  <head>
143
+ <meta charset="utf-8">
144
  <title>{html.escape(title)}</title>
145
+ {mathjax_script}
146
  </head>
147
  <body>
148
  <h1>{html.escape(title)}</h1>
149
  <h3>{html.escape(author)}</h3>
150
  {cover_img_html}
151
+ {all_text}
152
  </body>
153
  </html>
154
  """
155
 
156
  with tempfile.NamedTemporaryFile(delete=False, suffix=".html", dir="/tmp", mode="w", encoding="utf-8") as tmp:
157
+ tmp.write(full_html)
158
  return tmp.name
159
 
160
  iface = gr.Interface(
 
165
  gr.Textbox(label="Author(s)")
166
  ],
167
  outputs=gr.File(label="Download HTML"),
168
+ title="PDF to HTML Converter with Structure (olmOCR)",
169
+ description="Extracts text with structure, math, and footnotes using olmOCR and renders to styled HTML.",
170
  allow_flagging="never"
171
  )
172