Prathamesh1420 committed on
Commit
d237c98
·
verified ·
1 Parent(s): 61e6b08

Create utils/document_processing.py

Browse files
Files changed (1) hide show
  1. utils/document_processing.py +129 -0
utils/document_processing.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from docling.document_converter import DocumentConverter, PdfFormatOption
2
+ from docling.datamodel.base_models import InputFormat
3
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
4
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
5
+ from docling_core.types.doc.document import TableItem
6
+ from docling_core.types.doc.labels import DocItemLabel
7
+ from langchain_core.documents import Document
8
+ from PIL import Image
9
+ import base64
10
+ import io
11
+ import itertools
12
+ import os
13
+
14
def process_pdf(file_path, embeddings_tokenizer, vision_model):
    """
    Process a PDF file and extract text, tables, and images with descriptions.

    The PDF is converted with Docling (OCR and picture-image generation
    enabled), then split into three lists of LangChain ``Document`` objects.
    Every document carries ``doc_id``, ``source`` and ``ref`` metadata, and
    ``doc_id`` is unique across all three lists (a single running counter
    starting at 1).

    Args:
        file_path (str): Path to the PDF file
        embeddings_tokenizer: Tokenizer handed to ``HybridChunker`` for
            chunking text
        vision_model: Model for processing images. It must expose
            ``generate_content(list)`` and return an object with a ``text``
            attribute (the original code comments indicate a Gemini model).

    Returns:
        tuple: (text_chunks, table_chunks, image_descriptions), each a list
        of ``Document``. Pictures whose description fails are reported on
        stdout and omitted from the result.
    """
    # Step 1: Define PDF processing options
    pdf_pipeline_options = PdfPipelineOptions(
        do_ocr=True,
        generate_picture_images=True,
    )

    # Step 2: Link input format to pipeline options
    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
    }

    # Step 3: Initialize the converter with format options
    converter = DocumentConverter(format_options=format_options)

    # Step 4: List of sources (can be file paths or URLs)
    sources = [file_path]

    # Step 5: Convert PDFs to structured documents
    conversions = {
        source: converter.convert(source=source).document for source in sources
    }

    # One running counter for all three sections. It is always incremented
    # BEFORE being assigned, so ids are 1-based and never repeat. (Previously
    # the picture section assigned first and incremented afterwards, which
    # made the first picture reuse the id of the last table/text chunk.)
    doc_id = 0

    # Process text chunks
    texts = []
    # The chunker does not depend on the document, so build it once.
    chunker = HybridChunker(tokenizer=embeddings_tokenizer)

    for source, docling_document in conversions.items():
        for chunk in chunker.chunk(docling_document):
            items = chunk.meta.doc_items

            # Skip if chunk is just a table; tables are exported separately
            # below as markdown.
            if len(items) == 1 and isinstance(items[0], TableItem):
                continue

            # Collect references from items (concatenated without separator,
            # as in the original implementation).
            refs = "".join(item.get_ref().cref for item in items)

            # Store as LangChain document
            doc_id += 1
            texts.append(
                Document(
                    page_content=chunk.text,
                    metadata={
                        "doc_id": doc_id,
                        "source": source,
                        "ref": refs,
                    },
                )
            )

    # Process tables
    tables = []

    for source, docling_document in conversions.items():
        for table in docling_document.tables:
            if table.label != DocItemLabel.TABLE:
                continue

            doc_id += 1
            tables.append(
                Document(
                    page_content=table.export_to_markdown(),
                    metadata={
                        "doc_id": doc_id,
                        "source": source,
                        "ref": table.get_ref().cref,
                    },
                )
            )

    # Process images
    pictures = []

    for source, docling_document in conversions.items():
        for picture in docling_document.pictures:
            ref = picture.get_ref().cref
            image = picture.get_image(docling_document)

            # No image data available for this picture item.
            if image is None:
                continue

            try:
                # Ask the vision model for a textual description
                response = vision_model.generate_content([
                    "Extract all text and describe key visual elements in this image. "
                    "Include any numbers, labels, or important details.",
                    image
                ])
                # Reading .text stays inside the try so a failure here is
                # handled like a failed model call.
                description = response.text
            except Exception as e:
                # Best-effort: one failing image must not abort the whole PDF.
                print(f"Error processing image {ref}: {str(e)}")
                continue

            # Create a document with the vision model's description. The id is
            # only consumed for pictures that were described successfully.
            doc_id += 1
            pictures.append(
                Document(
                    page_content=description,
                    metadata={
                        "doc_id": doc_id,
                        "source": source,
                        "ref": ref,
                    },
                )
            )

    return texts, tables, pictures