File size: 668 Bytes
95305d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import pymupdf4llm
from langchain.docstore.document import Document

def ExtractDatafrompdf(pdf_path):
    """Extract PDF data using pymupdf4llm and return LangChain Documents.

    The PDF is converted to Markdown with ``page_chunks=True``, so the
    converter's result is iterated as a sequence of per-page mappings, each
    of which is read through its ``"text"`` key. Image extraction is
    requested with ``write_images=True``, targeting the ``images`` path in
    PNG format.

    Args:
        pdf_path: The PDF to convert; passed straight through to
            ``pymupdf4llm.to_markdown``.

    Returns:
        A list of ``Document`` objects, one per page chunk, in the order
        the converter returned them. Each document's ``page_content`` is
        the chunk's Markdown text and its metadata holds ``page_number``,
        the chunk's 1-based position in the converter output.
    """
    md_pages = pymupdf4llm.to_markdown(
        pdf_path,
        write_images=True,
        image_path="images",
        image_format="png",
        page_chunks=True,
    )

    # Debug aid: show the shape of the first chunk. Guarded so that a
    # conversion yielding no chunks does not raise IndexError.
    if md_pages:
        print("First page structure:", md_pages[0])

    # enumerate() yields the position directly. Looking it up with
    # md_pages.index(page) was a linear scan per page and returned the first
    # match whenever two chunks compared equal, mislabelling the later one.
    return [
        Document(page_content=page["text"], metadata={"page_number": page_num})
        for page_num, page in enumerate(md_pages, start=1)
    ]