Multi-Model-Pdf-Chat / pdfparsing.py
Waris01's picture
Model-Files
95305d3 verified
raw
history blame contribute delete
668 Bytes
import pymupdf4llm
from langchain.docstore.document import Document
def ExtractDatafrompdf(pdf_path):
    """Extract PDF data using pymupdf4llm and return LangChain Documents.

    The PDF is converted to Markdown with ``page_chunks=True``, so
    ``pymupdf4llm.to_markdown`` returns one chunk per page instead of a
    single string. The call also passes ``write_images=True`` with
    ``image_path="images"`` and ``image_format="png"``, i.e. it asks the
    library to save extracted images as PNG files under ``images``.

    Args:
        pdf_path: Location of the PDF, forwarded unchanged to
            ``pymupdf4llm.to_markdown``.

    Returns:
        A list with one ``Document`` per returned chunk. ``page_content`` is
        the chunk's ``"text"`` entry and ``metadata["page_number"]`` is the
        chunk's 1-based position in the returned sequence. The list is empty
        when the conversion yields no chunks.
    """
    md_pages = pymupdf4llm.to_markdown(
        pdf_path,
        write_images=True,
        image_path="images",
        image_format="png",
        page_chunks=True,
    )

    # Debug aid: show the shape of the first chunk. Guarded so that a
    # conversion producing no chunks does not raise IndexError.
    if md_pages:
        print("First page structure:", md_pages[0])

    # enumerate() supplies the position directly. The previous
    # ``md_pages.index(page)`` lookup rescanned the list for every page
    # (quadratic, with dict comparisons) and would have reported the first
    # match's number for any two chunks that compare equal.
    return [
        Document(page_content=page["text"], metadata={"page_number": page_num})
        for page_num, page in enumerate(md_pages, start=1)
    ]