File size: 4,385 Bytes
d237c98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc.document import TableItem
from docling_core.types.doc.labels import DocItemLabel
from langchain_core.documents import Document
from PIL import Image
import base64
import io
import itertools
import os

def process_pdf(file_path, embeddings_tokenizer, vision_model):
    """
    Process a PDF file and extract text, tables, and images with descriptions.

    Args:
        file_path (str): Path to the PDF file (or a URL accepted by docling).
        embeddings_tokenizer: Tokenizer passed to HybridChunker for text chunking.
        vision_model: Model exposing ``generate_content([prompt, image])``
            (e.g. Gemini) used to describe extracted images.

    Returns:
        tuple: (texts, tables, pictures) — three lists of LangChain ``Document``s.
            Metadata doc_ids are consecutive and unique across all three lists:
            texts get 1..T, tables T+1..T+K, pictures T+K+1 onwards.
    """
    # Enable OCR and per-picture image extraction in the PDF pipeline.
    pdf_pipeline_options = PdfPipelineOptions(
        do_ocr=True,
        generate_picture_images=True,
    )
    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
    }
    converter = DocumentConverter(format_options=format_options)

    # Convert every source (file path or URL) to a structured docling document.
    sources = [file_path]
    conversions = {
        source: converter.convert(source=source).document for source in sources
    }

    texts = _extract_text_chunks(conversions, embeddings_tokenizer)
    tables = _extract_tables(conversions, start_id=len(texts))
    pictures = _describe_pictures(
        conversions, vision_model, start_id=len(texts) + len(tables)
    )
    return texts, tables, pictures


def _extract_text_chunks(conversions, embeddings_tokenizer):
    """Chunk each document's text; returns Documents with doc_ids 1..N."""
    doc_id = 0
    texts = []

    for source, docling_document in conversions.items():
        chunker = HybridChunker(tokenizer=embeddings_tokenizer)

        for chunk in chunker.chunk(docling_document):
            items = chunk.meta.doc_items

            # Table-only chunks are handled separately by _extract_tables.
            if len(items) == 1 and isinstance(items[0], TableItem):
                continue

            # NOTE(review): refs are concatenated with no separator, which makes
            # them hard to split apart downstream — kept as-is for backward
            # compatibility with any existing stored metadata.
            refs = "".join(item.get_ref().cref for item in items)

            texts.append(
                Document(
                    page_content=chunk.text,
                    metadata={
                        "doc_id": (doc_id := doc_id + 1),
                        "source": source,
                        "ref": refs,
                    },
                )
            )
    return texts


def _extract_tables(conversions, start_id):
    """Export each real table to Markdown; doc_ids continue from start_id + 1."""
    doc_id = start_id
    tables = []

    for source, docling_document in conversions.items():
        for table in docling_document.tables:
            # docling's .tables may include table-like items with other labels;
            # keep only genuine tables.
            if table.label == DocItemLabel.TABLE:
                tables.append(
                    Document(
                        page_content=table.export_to_markdown(),
                        metadata={
                            "doc_id": (doc_id := doc_id + 1),
                            "source": source,
                            "ref": table.get_ref().cref,
                        },
                    )
                )
    return tables


def _describe_pictures(conversions, vision_model, start_id):
    """Describe each embedded image via the vision model.

    doc_ids continue from start_id + 1. (The original code used start_id itself
    for the first picture, duplicating the last table's doc_id.)
    Images that fail to process are skipped with a diagnostic message so one
    bad image does not abort the whole batch.
    """
    doc_id = start_id
    pictures = []

    for source, docling_document in conversions.items():
        for picture in docling_document.pictures:
            ref = picture.get_ref().cref
            image = picture.get_image(docling_document)

            if image is None:
                continue

            try:
                # Only the vision call (and .text access, which can raise when
                # generation is blocked) is genuinely risky — keep the try small.
                response = vision_model.generate_content([
                    "Extract all text and describe key visual elements in this image. "
                    "Include any numbers, labels, or important details.",
                    image
                ])
                description = response.text
            except Exception as e:
                print(f"Error processing image {ref}: {str(e)}")
                continue

            pictures.append(
                Document(
                    page_content=description,
                    metadata={
                        "doc_id": (doc_id := doc_id + 1),
                        "source": source,
                        "ref": ref,
                    },
                )
            )
    return pictures