|
|
|
|
|
import argparse |
|
import os |
|
from pathlib import Path |
|
import re |
|
import shutil |
|
import tempfile |
|
import uuid |
|
|
|
import aspose.words as aw |
|
import pymupdf4llm |
|
|
|
from project_settings import project_path |
|
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown |
|
|
|
|
|
def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace: carries ``filename``, the path of the file to
        convert. It defaults to ``data/files/pdf/2024.naacl-long.35.pdf``
        resolved under ``project_path``.
    """
    default_pdf = project_path / "data/files/pdf/2024.naacl-long.35.pdf"

    cli = argparse.ArgumentParser()
    cli.add_argument("--filename", type=str, default=default_pdf.as_posix())
    return cli.parse_args()
|
|
|
|
|
@BaseToMarkdown.register("pymupdf4llm")
class PyMuPdf2Llm(BaseToMarkdown):
    """
    Convert a document to Markdown with ``pymupdf4llm``.

    Images are not supported: only the Markdown text is produced.

    https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/
    """

    def __init__(self, filename: str, image_folder: str = "media"):
        """
        Args:
            filename: path of the document to convert; forwarded to the
                base-class constructor.
            image_folder: accepted so the signature matches
                ``AsposeWordsPdf2Md``; this converter never reads it.
        """
        super().__init__(filename)

    def save_to_zip(self, output_dir: str):
        """
        Convert the document to Markdown and pack the result into a zip file.

        The Markdown is written into a fresh, uniquely named directory under
        the system temp directory, zipped with ``self.zip_directory`` and the
        temporary directory is removed afterwards.

        Args:
            output_dir: directory in which ``<uuid>.zip`` is created.

        Returns:
            str: path of the created zip archive.
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # try/finally: the scratch directory must not be left behind when the
        # conversion, the write or the zipping step raises.
        try:
            md_file = temp_dir / f"{basename}.md"

            # self.filename is presumably set by BaseToMarkdown.__init__
            # (not visible here).
            md_text = pymupdf4llm.to_markdown(self.filename)
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)

        return output_zip_file
|
|
|
|
|
@BaseToMarkdown.register("aspose_words")
class AsposeWordsPdf2Md(BaseToMarkdown):
    """
    Convert a document to Markdown with Aspose.Words, keeping its images.

    https://pypi.org/project/aspose-words/
    https://products.aspose.com/words/python-net/
    https://products.aspose.com/words/python-net/merge/pdf-to-markdown/
    """

    def __init__(self, filename: str, image_folder: str = "media"):
        """
        Args:
            filename: path of the document to convert; forwarded to the
                base-class constructor.
            image_folder: name of the sub-folder (inside the archive) that
                receives the image files.
        """
        super().__init__(filename)
        # self.filename is presumably set by BaseToMarkdown.__init__
        # (not visible here).
        self.doc = aw.Document(self.filename)
        self.image_folder = image_folder

    def save_to_zip(self, output_dir: str):
        """
        Convert the document to Markdown plus images and zip the result.

        The document is saved as Markdown into a fresh, uniquely named
        directory under the system temp directory. Image files found next to
        the Markdown file are moved into ``self.image_folder``, the image
        links in the Markdown are rewritten to point there, and the directory
        is zipped with ``self.zip_directory`` before being removed.

        Args:
            output_dir: directory in which ``<uuid>.zip`` is created.

        Returns:
            str: path of the created zip archive.
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # try/finally: the scratch directory must not be left behind when any
        # of the steps below raises.
        try:
            md_file = temp_dir / f"{basename}.md"
            media_dir = temp_dir / self.image_folder
            media_dir.mkdir(parents=True, exist_ok=False)

            self.doc.save(md_file.as_posix())

            for pattern in ["*.jpeg", "*.jpg", "*.png", "*.gif", "*.bmp", "*.tiff"]:
                # Snapshot the matches first: files are moved out of the
                # directory that is being scanned.
                for image_file in list(temp_dir.glob(pattern)):
                    shutil.move(
                        src=image_file.as_posix(),
                        dst=media_dir.as_posix(),
                    )

            with open(md_file.as_posix(), "r", encoding="utf-8") as f:
                md_text = f.read()
            md_text = self.convert_image_to_media_dir(md_text, image_folder=self.image_folder)
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)

        return output_zip_file

    def convert_image_to_media_dir(self,
                                   markdown_text: str,
                                   image_folder: str = "media",
                                   ):
        """
        Rewrite Markdown image links so they point into ``image_folder``.

        Every ``![alt](target)`` becomes ``![alt](image_folder/target)``; the
        alt text is kept. Targets containing ``://`` or starting with
        ``data:`` are left untouched because they do not refer to a local
        file that could have been moved.

        Args:
            markdown_text: Markdown source to rewrite.
            image_folder: folder name to prepend to each local image target.

        Returns:
            str: the Markdown text with rewritten image links.
        """
        # Markdown links always use forward slashes, so join with posixpath
        # rather than os.path (which would emit backslashes on Windows).
        import posixpath

        image_pattern = r"!\[(.*?)\]\((.+?)\)"

        def replace(match):
            alt_text = match.group(1)
            target = match.group(2)
            if "://" in target or target.startswith("data:"):
                # Remote or inline image: keep the reference as it is.
                return match.group(0)
            relative_path = posixpath.join(image_folder, target)
            return f"![{alt_text}]({relative_path})"

        return re.sub(image_pattern, replace, markdown_text)
|
|
|
|
|
def main():
    """Convert the file named by ``--filename`` and print the archive path.

    The conversion uses ``PyMuPdf2Llm`` and writes the zip archive into the
    current working directory.
    """
    cli_args = get_args()

    converter = PyMuPdf2Llm(cli_args.filename)
    zip_path = converter.save_to_zip(output_dir=".")

    print(zip_path)


# Run the conversion only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
|