File size: 4,385 Bytes
d237c98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc.document import TableItem
from docling_core.types.doc.labels import DocItemLabel
from langchain_core.documents import Document
from PIL import Image
import base64
import io
import itertools
import os

def process_pdf(file_path, embeddings_tokenizer, vision_model):
    """
    Process a PDF file and extract text, tables, and images with descriptions.

    Args:
        file_path (str): Path to the PDF file (or a URL accepted by docling).
        embeddings_tokenizer: Tokenizer passed to HybridChunker for text chunking.
        vision_model: Model exposing ``generate_content([prompt, image])``
            (e.g. Gemini) used to describe extracted images.

    Returns:
        tuple: (texts, tables, pictures) — three lists of LangChain ``Document``s.
            Metadata doc_ids are consecutive and unique across all three lists:
            texts get 1..T, tables T+1..T+K, pictures T+K+1 onwards.
    """
    # Enable OCR and per-picture image extraction in the PDF pipeline.
    pdf_pipeline_options = PdfPipelineOptions(
        do_ocr=True,
        generate_picture_images=True,
    )
    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
    }
    converter = DocumentConverter(format_options=format_options)

    # Convert every source (file path or URL) to a structured docling document.
    sources = [file_path]
    conversions = {
        source: converter.convert(source=source).document for source in sources
    }

    texts = _extract_text_chunks(conversions, embeddings_tokenizer)
    tables = _extract_tables(conversions, start_id=len(texts))
    pictures = _describe_pictures(
        conversions, vision_model, start_id=len(texts) + len(tables)
    )
    return texts, tables, pictures


def _extract_text_chunks(conversions, embeddings_tokenizer):
    """Chunk each document's text; returns Documents with doc_ids 1..N."""
    doc_id = 0
    texts = []

    for source, docling_document in conversions.items():
        chunker = HybridChunker(tokenizer=embeddings_tokenizer)

        for chunk in chunker.chunk(docling_document):
            items = chunk.meta.doc_items

            # Table-only chunks are handled separately by _extract_tables.
            if len(items) == 1 and isinstance(items[0], TableItem):
                continue

            # NOTE(review): refs are concatenated with no separator, which makes
            # them hard to split apart downstream — kept as-is for backward
            # compatibility with any existing stored metadata.
            refs = "".join(item.get_ref().cref for item in items)

            texts.append(
                Document(
                    page_content=chunk.text,
                    metadata={
                        "doc_id": (doc_id := doc_id + 1),
                        "source": source,
                        "ref": refs,
                    },
                )
            )
    return texts


def _extract_tables(conversions, start_id):
    """Export each real table to Markdown; doc_ids continue from start_id + 1."""
    doc_id = start_id
    tables = []

    for source, docling_document in conversions.items():
        for table in docling_document.tables:
            # docling's .tables may include table-like items with other labels;
            # keep only genuine tables.
            if table.label == DocItemLabel.TABLE:
                tables.append(
                    Document(
                        page_content=table.export_to_markdown(),
                        metadata={
                            "doc_id": (doc_id := doc_id + 1),
                            "source": source,
                            "ref": table.get_ref().cref,
                        },
                    )
                )
    return tables


def _describe_pictures(conversions, vision_model, start_id):
    """Describe each embedded image via the vision model.

    doc_ids continue from start_id + 1. (The original code used start_id itself
    for the first picture, duplicating the last table's doc_id.)
    Images that fail to process are skipped with a diagnostic message so one
    bad image does not abort the whole batch.
    """
    doc_id = start_id
    pictures = []

    for source, docling_document in conversions.items():
        for picture in docling_document.pictures:
            ref = picture.get_ref().cref
            image = picture.get_image(docling_document)

            if image is None:
                continue

            try:
                # Only the vision call (and .text access, which can raise when
                # generation is blocked) is genuinely risky — keep the try small.
                response = vision_model.generate_content([
                    "Extract all text and describe key visual elements in this image. "
                    "Include any numbers, labels, or important details.",
                    image
                ])
                description = response.text
            except Exception as e:
                print(f"Error processing image {ref}: {str(e)}")
                continue

            pictures.append(
                Document(
                    page_content=description,
                    metadata={
                        "doc_id": (doc_id := doc_id + 1),
                        "source": source,
                        "ref": ref,
                    },
                )
            )
    return pictures