Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,8 @@ import time
|
|
| 6 |
import zipfile
|
| 7 |
from pathlib import Path
|
| 8 |
import re
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# os.system('pip install -U magic-pdf==0.8.1')
|
| 11 |
os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
|
|
@@ -174,12 +176,32 @@ all_lang = [""]
|
|
| 174 |
all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
|
| 175 |
|
| 176 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
if __name__ == "__main__":
|
| 178 |
with gr.Blocks() as demo:
|
| 179 |
gr.HTML(header)
|
| 180 |
with gr.Row():
|
| 181 |
with gr.Column(variant='panel', scale=5):
|
| 182 |
-
|
| 183 |
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
|
| 184 |
with gr.Row():
|
| 185 |
layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")
|
|
@@ -190,14 +212,14 @@ if __name__ == "__main__":
|
|
| 190 |
table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False)
|
| 191 |
with gr.Row():
|
| 192 |
change_bu = gr.Button("Convert")
|
| 193 |
-
clear_bu = gr.ClearButton(
|
| 194 |
-
pdf_show = PDF(label="
|
| 195 |
with gr.Accordion("Examples:"):
|
| 196 |
example_root = os.path.join(os.path.dirname(__file__), "examples")
|
| 197 |
gr.Examples(
|
| 198 |
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
|
| 199 |
_.endswith("pdf")],
|
| 200 |
-
inputs=pdf_show
|
| 201 |
)
|
| 202 |
|
| 203 |
with gr.Column(variant='panel', scale=5):
|
|
@@ -208,8 +230,9 @@ if __name__ == "__main__":
|
|
| 208 |
latex_delimiters=latex_delimiters, line_breaks=True)
|
| 209 |
with gr.Tab("Markdown text"):
|
| 210 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
|
|
|
| 211 |
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
|
| 212 |
outputs=[md, md_text, output_file, pdf_show])
|
| 213 |
-
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
|
| 214 |
|
| 215 |
demo.launch()
|
|
|
|
| 6 |
import zipfile
|
| 7 |
from pathlib import Path
|
| 8 |
import re
|
| 9 |
+
import uuid
|
| 10 |
+
import pymupdf
|
| 11 |
|
| 12 |
# os.system('pip install -U magic-pdf==0.8.1')
|
| 13 |
os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
|
|
|
|
| 176 |
all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
|
| 177 |
|
| 178 |
|
| 179 |
+
def to_pdf(file_path):
    """Return the path of a PDF version of ``file_path``.

    The file is opened with pymupdf. If it is already a PDF, the input path
    is returned unchanged. Otherwise the document is converted to PDF bytes,
    written to a new ``<uuid4>.pdf`` file located in the same directory as
    the input, and the path of that new file is returned.
    """
    with pymupdf.open(file_path) as doc:
        # Already a PDF: hand the original path straight back.
        if doc.is_pdf:
            return file_path

        converted = doc.convert_to_pdf()

        # Store the converted bytes next to the source file under a
        # randomly generated (UUID-based) file name.
        out_name = f"{uuid.uuid4()}.pdf"
        out_path = os.path.join(os.path.dirname(file_path), out_name)

        # Write the PDF bytes to disk.
        with open(out_path, 'wb') as out_file:
            out_file.write(converted)

        return out_path
|
| 197 |
+
|
| 198 |
+
|
| 199 |
if __name__ == "__main__":
|
| 200 |
with gr.Blocks() as demo:
|
| 201 |
gr.HTML(header)
|
| 202 |
with gr.Row():
|
| 203 |
with gr.Column(variant='panel', scale=5):
|
| 204 |
+
# Every extension needs a leading dot; an entry without one is treated as a file category, not an extension.
file = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", ".jpg"])
|
| 205 |
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
|
| 206 |
with gr.Row():
|
| 207 |
layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")
|
|
|
|
| 212 |
table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False)
|
| 213 |
with gr.Row():
|
| 214 |
change_bu = gr.Button("Convert")
|
| 215 |
+
clear_bu = gr.ClearButton(value="Clear")
|
| 216 |
+
pdf_show = PDF(label="PDF preview", interactive=True, height=800)
|
| 217 |
with gr.Accordion("Examples:"):
|
| 218 |
example_root = os.path.join(os.path.dirname(__file__), "examples")
|
| 219 |
gr.Examples(
|
| 220 |
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
|
| 221 |
_.endswith("pdf")],
|
| 222 |
+
inputs=pdf_show
|
| 223 |
)
|
| 224 |
|
| 225 |
with gr.Column(variant='panel', scale=5):
|
|
|
|
| 230 |
latex_delimiters=latex_delimiters, line_breaks=True)
|
| 231 |
with gr.Tab("Markdown text"):
|
| 232 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
| 233 |
+
file.upload(fn=to_pdf, inputs=file, outputs=pdf_show)
|
| 234 |
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
|
| 235 |
outputs=[md, md_text, output_file, pdf_show])
|
| 236 |
+
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr, table_enable, language])
|
| 237 |
|
| 238 |
demo.launch()
|