Spaces:
Sleeping
Sleeping
File size: 668 Bytes
95305d3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
import pymupdf4llm
from langchain.docstore.document import Document
def ExtractDatafrompdf(pdf_path) -> "list[Document]":
    """Extract PDF data using pymupdf4llm and return LangChain Documents.

    The PDF is converted to Markdown one page at a time
    (``page_chunks=True``), and each page chunk's ``"text"`` entry becomes
    the ``page_content`` of one ``Document``. The call also passes
    ``write_images=True`` with ``image_path="images"`` and
    ``image_format="png"``, requesting that images found in the PDF be
    written out as PNG files under ``images``.

    Args:
        pdf_path: The PDF to process, forwarded unchanged to
            ``pymupdf4llm.to_markdown``.

    Returns:
        One ``Document`` per page chunk, in the order returned by
        pymupdf4llm, each carrying ``{"page_number": n}`` metadata where
        ``n`` is the chunk's 1-based position in that list.
    """
    md_pages = pymupdf4llm.to_markdown(
        pdf_path,
        write_images=True,
        image_path="images",
        image_format="png",
        page_chunks=True,
    )

    # Debug aid. Guarded so that an extraction yielding no page chunks
    # returns an empty list instead of raising IndexError on md_pages[0].
    if md_pages:
        print("First page structure:", md_pages[0])

    # enumerate() supplies the position directly. Looking it up with
    # list.index() would be a linear, equality-based search per page and
    # would give the wrong number to any chunk equal to an earlier one.
    return [
        Document(page_content=page["text"], metadata={"page_number": page_num})
        for page_num, page in enumerate(md_pages, start=1)
    ]
|