|
import os |
|
import sys |
|
|
|
# Optional relocation: when APP_PATH is set, run from that directory and make
# it importable, so imports further down the file can resolve against it.
if os.environ.get("APP_PATH") is not None:
    app_path = os.path.abspath(os.environ["APP_PATH"])

    # Only change directory when we are not already there.
    if app_path != os.getcwd():
        os.chdir(app_path)

    # Append (not prepend) so existing sys.path entries keep precedence.
    if app_path not in sys.path:
        sys.path.append(app_path)
|
|
|
import gradio as gr |
|
import requests |
|
from contextlib import suppress |
|
|
|
from marker.settings import settings |
|
|
|
import base64 |
|
import io |
|
import re |
|
from typing import Any, Dict |
|
import json |
|
|
|
import pypdfium2 |
|
from PIL import Image |
|
|
|
from marker.converters.pdf import PdfConverter |
|
from marker.models import create_model_dict |
|
from marker.config.parser import ConfigParser |
|
from marker.output import text_from_rendered |
|
from marker.schema import BlockTypes |
|
|
|
# Fixed ten-entry palette of hex colours. Block types are mapped onto it by
# index modulo the palette length when detected blocks are drawn on the
# preview image.
COLORS = [
    "#4e79a7",  # blue
    "#f28e2c",  # orange
    "#e15759",  # red
    "#76b7b2",  # teal
    "#59a14f",  # green
    "#edc949",  # yellow
    "#af7aa1",  # purple
    "#ff9da7",  # pink
    "#9c755f",  # brown
    "#bab0ab",  # grey
]
|
|
|
def load_models():
    """Build and return marker's model dictionary via ``create_model_dict``."""
    models = create_model_dict()
    return models
|
|
|
def convert_pdf(fname: str, config_parser: ConfigParser) -> Any:
    """Convert one PDF with marker and return the converter's rendered output.

    Args:
        fname: Path of the PDF file to convert.
        config_parser: Parser that supplies the config dict, the processor
            list and the renderer for the conversion.

    Returns:
        Whatever ``PdfConverter.__call__`` returns for ``fname``. Callers in
        this file read ``.metadata`` from it and pass it to
        ``text_from_rendered``.
    """
    config_dict = config_parser.generate_config_dict()
    # Force a single pdftext worker for conversions started from this app.
    config_dict["pdftext_workers"] = 1

    converter = PdfConverter(
        config=config_dict,
        # Module-level model dictionary, loaded once at import time.
        artifact_dict=model_dict,
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
    )
    return converter(fname)
|
|
|
def open_pdf(pdf_file):
    """Open ``pdf_file`` as a ``pypdfium2.PdfDocument`` and return it."""
    document = pypdfium2.PdfDocument(pdf_file)
    return document
|
|
|
def count_pdf(pdf_file):
    """Return the number of pages in ``pdf_file``."""
    return len(open_pdf(pdf_file))
|
|
|
def get_page_image(pdf_file, page_num, dpi=96):
    """Render a single PDF page to an RGB PIL image.

    Args:
        pdf_file: PDF input accepted by ``open_pdf``.
        page_num: 1-based page number to render.
        dpi: Target resolution; the render scale is ``dpi / 72``.

    Returns:
        The rendered page converted to RGB mode.
    """
    document = open_pdf(pdf_file)
    zero_based_index = page_num - 1
    rendered_pages = document.render(
        pypdfium2.PdfBitmap.to_pil,
        page_indices=[zero_based_index],
        scale=dpi / 72,
    )
    # Exactly one page index was requested, so take the first rendered item.
    page_image = list(rendered_pages)[0]
    return page_image.convert("RGB")
|
|
|
|
|
def img_to_html(img, img_alt):
    """Render an image as an inline ``<img>`` tag with a base64 PNG data URI.

    Args:
        img: Object with a PIL-style ``save(fp, format=...)`` method.
        img_alt: Alternative text for the image. It is HTML-escaped before
            being placed in the ``alt`` attribute, because it originates from
            document content and must not be able to break out of the
            attribute or inject markup.

    Returns:
        The ``<img>`` element as a string.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode()

    # str() preserves the old tolerance for non-string alt values.
    safe_alt = escape(str(img_alt), quote=True)
    return f'<img src="data:image/png;base64,{encoded}" alt="{safe_alt}" style="max-width: 100%;">'
|
|
|
def markdown_insert_images(markdown, images):
    """Replace markdown image tags with inline HTML for known images.

    Args:
        markdown: Markdown text that may contain ``![alt](path)`` tags.
        images: Mapping from image path (as written in the tag) to the image
            object to embed.

    Returns:
        The markdown with every tag whose path is a key of ``images``
        replaced by the output of ``img_to_html``; other tags are untouched.
    """
    tag_pattern = r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))'

    # Each match is (whole tag, alt text, path, trailing title part).
    for whole_tag, alt_text, path, _title in re.findall(tag_pattern, markdown):
        if path not in images:
            continue
        markdown = markdown.replace(whole_tag, img_to_html(images[path], alt_text))
    return markdown
|
|
|
|
|
# Load the models only when no global ``model_dict`` exists yet.
# NOTE(review): presumably this avoids reloading models when the module is
# re-executed (e.g. hot reload) - confirm.
try:
    model_dict
except NameError:
    model_dict = load_models()

# Gradio state holder initialised with an empty list.
img_state = gr.State([])
|
|
|
with gr.Blocks(title="Marker") as demo:
    gr.Markdown("""
    # Marker Demo
    
    This app will let you try marker, a PDF -> Markdown converter. It works with any languages, and extracts images, tables, equations, etc.

    Find the original project [here](https://github.com/VikParuchuri/marker).
    Or this project [here](https://github.com/xiaoyao9184/docker-marker).
    See the [README](./blob/main/README.md) for Spaces's metadata.
    """)

    with gr.Row():
        # Left column: file input, page preview and conversion options.
        with gr.Column():
            in_file = gr.File(label="PDF file:", file_types=[".pdf"])
            in_num = gr.Slider(label="PDF file page number", minimum=1, maximum=1, value=1, step=1, visible=False)
            in_img = gr.AnnotatedImage(
                label="PDF file (preview)", visible=False
            )

            page_range_txt = gr.Textbox(label="Page range to parse, comma separated like 0,5-10,20", value="")
            output_format_dd = gr.Dropdown(label="Output format", choices=["markdown", "json", "html", "chunks"], value="markdown")

            use_llm_ckb = gr.Checkbox(label="Use LLM", value=False, info="Use LLM for higher quality processing")
            force_ocr_ckb = gr.Checkbox(label="Force OCR", value=True, info="Force OCR on all pages")
            show_blocks_ckb = gr.Checkbox(label="Show Blocks", info="Display detected blocks, only when output is JSON", value=False, interactive=False)
            debug_ckb = gr.Checkbox(label="Debug", value=False, info="Show debug information")
            strip_existing_ocr_ckb = gr.Checkbox(label="Strip existing OCR", value=False, info="Strip existing OCR text from the PDF and re-OCR.")
            format_lines_ckb = gr.Checkbox(label="Format lines", value=False, info="Format lines in the document with OCR model")
            disable_ocr_math_ckb = gr.Checkbox(label="Disable math", value=False, info="Disable math in OCR output - no inline math")
            run_marker_btn = gr.Button("Run Marker", interactive=False)

        # Right column: result viewers, all hidden until a conversion fills them.
        with gr.Column():
            result_md = gr.Markdown(label="Result markdown", visible=False)
            result_json = gr.JSON(label="Result json", visible=False)
            result_html = gr.Markdown(label="Result html", visible=False)
            debug_img_pdf = gr.Image(label="PDF debug image", visible=False)
            debug_img_layout = gr.Image(label="Layout debug image", visible=False)

    def show_image(file, num=1):
        """Update the page slider, preview image and page-range text.

        Args:
            file: The selected PDF, or None when the file input was cleared.
            num: 1-based page number to preview.

        Returns:
            Updates for ``in_num``, ``in_img`` and ``page_range_txt``, in
            that order.
        """
        if file is None:
            # No file: hide the preview widgets and reset the range text.
            return [
                gr.update(visible=False, maximum=1, value=num),
                gr.update(visible=False),
                "0-0"]

        count = count_pdf(file)
        img = get_page_image(file, num)
        return [
            gr.update(visible=True, maximum=count),
            gr.update(visible=True, value=(img, [])),
            f"0-{num-1}"]

    # Refresh the preview whenever the file or the selected page changes.
    in_file.clear(
        fn=show_image,
        inputs=[in_file],
        outputs=[in_num, in_img, page_range_txt],
        api_name=False
    )
    in_file.upload(
        fn=show_image,
        inputs=[in_file],
        outputs=[in_num, in_img, page_range_txt],
        api_name=False
    )
    in_num.change(
        fn=show_image,
        inputs=[in_file, in_num],
        outputs=[in_num, in_img, page_range_txt],
        api_name=False
    )

    def check_page_range(page_range, file):
        """Validate the page-range text and enable or disable the Run button.

        Accepts an empty string, a single page (``3``), a range (``0-5``) or
        a comma-separated list of those (``0,5-10,20``), matching the format
        advertised by the textbox label.

        Args:
            page_range: Current content of the page-range textbox.
            file: The selected PDF, or None; used only for the format hint.

        Returns:
            Updates for ``page_range_txt`` (hint text) and ``run_marker_btn``
            (interactive only when the text is valid).
        """
        count = count_pdf(file) if file is not None else 1
        hint_update = gr.update(info=f"format 0-{count-1}")

        if re.match(r"^(\d+(-\d+)?(,\d+(-\d+)?)*)?$", page_range):
            return [hint_update, gr.update(interactive=True)]

        gr.Warning(f"Invalid format. Please use 0-{count-1}", duration=0)
        return [hint_update, gr.update(interactive=False)]

    page_range_txt.change(
        fn=check_page_range,
        inputs=[page_range_txt, in_file],
        outputs=[page_range_txt, run_marker_btn],
        api_name=False
    )

    def _toggle_show_blocks(output_format):
        """Enable and tick "Show Blocks" only for the json and chunks formats."""
        block_capable = output_format == "json" or output_format == "chunks"
        return gr.update(interactive=block_capable, value=block_capable)

    output_format_dd.change(
        fn=_toggle_show_blocks,
        inputs=[output_format_dd],
        outputs=[show_blocks_ckb],
        api_name=False
    )

    def _first_selected_page(config_parser):
        """Return the first page of the parsed page range, or 0 when none is set."""
        # .get() tolerates a config without a "page_range" entry; the falsy
        # check below already treats "no range" as page 0.
        # NOTE(review): assumes generate_config_dict() returns a dict - confirm.
        selected_pages = config_parser.generate_config_dict().get("page_range")
        return selected_pages[0] if selected_pages else 0

    def _collect_block_sections(root, children_key):
        """Walk a block tree and gather annotations for the preview image.

        Args:
            root: Dict at which the walk starts.
            children_key: Key under which a block lists its child blocks
                ("children" for json output, "blocks" for chunks output).

        Returns:
            ``(sections, color_map)`` where ``sections`` is a list of
            ``(bbox, label)`` tuples and ``color_map`` maps label to colour.
        """
        block_type_names = list(BlockTypes.__members__)
        sections = []
        color_map = {}

        def traverse(block):
            if "block_type" in block:
                try:
                    index = block_type_names.index(block["block_type"])
                    color = COLORS[index % len(COLORS)]
                except (ValueError, IndexError):
                    # Unknown block type: fall back to a neutral grey.
                    color = "#cccccc"

                label = block["id"].replace("/page/0/", "")
                color_map[label] = color

                bbox = tuple(int(x) for x in block["bbox"])
                sections.append((bbox, label))

            # Recurse even when the node has no block_type of its own; the
            # chunks root is walked this way.
            if children_key in block and isinstance(block[children_key], list):
                for child in block[children_key]:
                    traverse(child)

        traverse(root)
        return sections, color_map

    def run_marker_img(filename, page_range, force_ocr, output_format, show_blocks, debug, use_llm, strip_existing_ocr, format_lines, disable_ocr_math):
        """
        Run marker on the given PDF file and return the result for display.

        Args:
            filename (str): Path to the input PDF file.
            page_range (str): Page range to process (e.g., "0-5" or "0,5-10,20").
            force_ocr (bool): If True, force OCR on all pages.
            output_format (str): Output format. One of: "markdown", "html", "json", "chunks".
            show_blocks (bool): If True, draw detected blocks on the preview image
                (only used with "json" or "chunks" output).
            debug (bool): If True, also return the debug images (rendered page and layout).
            use_llm (bool): If True, use LLM for higher quality processing.
            strip_existing_ocr (bool): If True, strip existing OCR text from the PDF and re-OCR.
            format_lines (bool): If True, format lines in the document with OCR model.
            disable_ocr_math (bool): If True, disable math in OCR output - no inline math.

        Returns:
            list: Six component updates, in this order:
                - markdown result (visible only for "markdown" output)
                - json result (visible for "json" and "chunks" output)
                - html result (visible only for "html" output)
                - PDF debug image (shown only when debug data is available)
                - layout debug image (shown only when debug data is available)
                - preview image (updated only when blocks are shown)
            None is returned for an unrecognised output format.
        """
        # Best-effort usage counter; it must never break or stall a conversion,
        # hence the blanket suppress and the explicit timeout.
        with suppress(Exception):
            requests.get(
                "https://counterapi.com/api/xiaoyao9184.github.com/view/docker-marker",
                timeout=5,
            )

        cli_options = {
            "output_format": output_format,
            "page_range": page_range,
            "force_ocr": force_ocr,
            "debug": debug,
            "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
            "use_llm": use_llm,
            "strip_existing_ocr": strip_existing_ocr,
            "format_lines": format_lines,
            "disable_ocr_math": disable_ocr_math,
        }
        config_parser = ConfigParser(cli_options)
        rendered = convert_pdf(filename, config_parser)

        # Debug images stay hidden unless the run produced debug data.
        gr_debug_pdf = gr.update(visible=False)
        gr_debug_lay = gr.update(visible=False)
        if debug:
            debug_data_path = rendered.metadata.get("debug_data_path")
            if debug_data_path:
                first_page = _first_selected_page(config_parser)

                pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
                gr_debug_pdf = gr.update(visible=True, value=Image.open(pdf_image_path))

                layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
                gr_debug_lay = gr.update(visible=True, value=Image.open(layout_image_path))

        # Empty update: leave the preview untouched unless blocks are drawn.
        gr_img = gr.update()

        text, _ext, images = text_from_rendered(rendered)

        # Index of the result component that displays this output format:
        # 0 = markdown viewer, 1 = json viewer, 2 = html viewer.
        if output_format == "markdown":
            text = markdown_insert_images(text, images)
            visible_slot = 0
        elif output_format == "json" or output_format == "chunks":
            if show_blocks:
                doc_json = json.loads(text)
                if output_format == "json":
                    # json output: annotate the first top-level child only.
                    root, children_key = doc_json["children"][0], "children"
                else:
                    # chunks output: walk from the document root.
                    root, children_key = doc_json, "blocks"
                sections, color_map = _collect_block_sections(root, children_key)

                first_page = _first_selected_page(config_parser)
                # get_page_image takes a 1-based page number.
                img = get_page_image(filename, first_page + 1, dpi=72)
                gr_img = gr.update(value=(img, sections), color_map=color_map)
            visible_slot = 1
        elif output_format == "html":
            visible_slot = 2
        else:
            return None

        result_updates = [gr.update(visible=False) for _ in range(3)]
        result_updates[visible_slot] = gr.update(visible=True, value=text)
        return [*result_updates, gr_debug_pdf, gr_debug_lay, gr_img]

    run_marker_btn.click(
        fn=run_marker_img,
        inputs=[in_file, page_range_txt, force_ocr_ckb, output_format_dd, show_blocks_ckb, debug_ckb, use_llm_ckb, strip_existing_ocr_ckb, format_lines_ckb, disable_ocr_math_ckb],
        outputs=[result_md, result_json, result_html, debug_img_pdf, debug_img_layout, in_img]
    )
|
|
|
# Script entry point: serve the demo on all interfaces at port 7860 with the
# MCP server enabled and SSR mode turned off.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        mcp_server=True,
        ssr_mode=False,
    )
|
|