from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc.document import TableItem
from docling_core.types.doc.labels import DocItemLabel
from langchain_core.documents import Document
from PIL import Image
import base64
import io
import itertools
import os


def process_pdf(file_path, embeddings_tokenizer, vision_model):
    """
    Process a PDF file and extract text, tables, and images with descriptions.

    Args:
        file_path (str): Path to the PDF file
        embeddings_tokenizer: Tokenizer for chunking text
        vision_model: Model for processing images (called via
            ``generate_content`` — presumably a Gemini-style client; verify
            against caller)

    Returns:
        tuple: (text_chunks, table_chunks, image_descriptions) — three lists
        of LangChain ``Document``s. Each document's metadata carries a
        ``doc_id`` unique across all three lists, the originating ``source``,
        and the docling ``ref`` of the underlying item(s).
    """
    # Step 1: Define PDF processing options. OCR is enabled so scanned pages
    # yield text; picture images are generated so they can be sent to the
    # vision model below.
    pdf_pipeline_options = PdfPipelineOptions(
        do_ocr=True,
        generate_picture_images=True
    )

    # Step 2: Link input format to pipeline options
    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
    }

    # Step 3: Initialize the converter with format options
    converter = DocumentConverter(format_options=format_options)

    # Step 4: List of sources (can be file paths or URLs)
    sources = [file_path]

    # Step 5: Convert PDFs to structured documents, keyed by source
    conversions = {
        source: converter.convert(source=source).document for source in sources
    }

    # --- Text chunks ---------------------------------------------------
    # doc_id is pre-incremented via walrus everywhere, so ids run
    # 1..len(texts) here and continue without gaps or collisions through
    # the table and picture sections.
    doc_id = 0
    texts = []
    for source, docling_document in conversions.items():
        chunker = HybridChunker(tokenizer=embeddings_tokenizer)
        for chunk in chunker.chunk(docling_document):
            items = chunk.meta.doc_items
            # Skip if chunk is just a table — tables are exported separately
            # below as markdown, which preserves their structure better.
            if len(items) == 1 and isinstance(items[0], TableItem):
                continue

            # Collect references from items.
            # NOTE(review): joined with no separator, so multiple crefs run
            # together (e.g. "#/texts/0#/texts/1") — confirm downstream
            # consumers expect this before adding a delimiter.
            refs = "".join(item.get_ref().cref for item in items)
            text = chunk.text

            # Store as LangChain document
            document = Document(
                page_content=text,
                metadata={
                    "doc_id": (doc_id := doc_id + 1),
                    "source": source,
                    "ref": refs,
                }
            )
            texts.append(document)

    # --- Tables --------------------------------------------------------
    doc_id = len(texts)
    tables = []
    for source, docling_document in conversions.items():
        for table in docling_document.tables:
            # docling may label some table-like items differently; only keep
            # genuine tables.
            if table.label == DocItemLabel.TABLE:
                ref = table.get_ref().cref
                text = table.export_to_markdown()
                document = Document(
                    page_content=text,
                    metadata={
                        "doc_id": (doc_id := doc_id + 1),
                        "source": source,
                        "ref": ref,
                    }
                )
                tables.append(document)

    # --- Images --------------------------------------------------------
    doc_id = len(texts) + len(tables)
    pictures = []
    for source, docling_document in conversions.items():
        for picture in docling_document.pictures:
            ref = picture.get_ref().cref
            image = picture.get_image(docling_document)
            if image:
                try:
                    # Process with Gemini
                    response = vision_model.generate_content([
                        "Extract all text and describe key visual elements in this image. "
                        "Include any numbers, labels, or important details.",
                        image
                    ])

                    # Create a document with the vision model's description.
                    # BUGFIX: pre-increment like the text/table loops above —
                    # the previous code used doc_id before incrementing, so
                    # the first picture reused the last table's doc_id.
                    document = Document(
                        page_content=response.text,
                        metadata={
                            "doc_id": (doc_id := doc_id + 1),
                            "source": source,
                            "ref": ref,
                        }
                    )
                    pictures.append(document)
                except Exception as e:
                    # Best-effort: a single failed image must not abort the
                    # whole PDF; log and continue.
                    print(f"Error processing image {ref}: {str(e)}")

    return texts, tables, pictures