Spaces:
Sleeping
Sleeping
import pymupdf4llm | |
from langchain.docstore.document import Document | |
def ExtractDatafrompdf(pdf_path):
    """Extract PDF data using pymupdf4llm and return LangChain Documents.

    The PDF is converted to Markdown one chunk per page
    (``page_chunks=True``); image extraction is requested as PNG with
    ``image_path="images"``.

    Args:
        pdf_path: Path of the PDF file, passed straight to
            ``pymupdf4llm.to_markdown``.

    Returns:
        A list of ``Document`` objects, one per returned page chunk, in
        order. ``page_content`` is the chunk's ``"text"`` entry and
        ``metadata["page_number"]`` is the chunk's 1-based position in
        the list returned by the converter.
    """
    md_pages = pymupdf4llm.to_markdown(
        pdf_path,
        write_images=True,
        image_path="images",
        image_format="png",
        page_chunks=True,
    )

    # Debug aid: show the shape of a page chunk. Guarded so that a result
    # with no pages does not raise IndexError.
    if md_pages:
        print("First page structure:", md_pages[0])

    # enumerate() gives each chunk its own position. The previous
    # md_pages.index(page) lookup rescanned the list for every page and
    # returned the first *equal* chunk, so identical chunks would have
    # shared a page number.
    return [
        Document(page_content=page["text"], metadata={"page_number": page_num})
        for page_num, page in enumerate(md_pages, start=1)
    ]