# PDF ingestion utilities: extract text chunks, tables, and image
# descriptions from a PDF using Docling and a vision model.
import base64
import io
import itertools
import os

from PIL import Image

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc.document import TableItem
from docling_core.types.doc.labels import DocItemLabel
from langchain_core.documents import Document
def process_pdf(file_path, embeddings_tokenizer, vision_model):
    """
    Process a PDF file and extract text, tables, and images with descriptions.

    Args:
        file_path (str): Path to the PDF file.
        embeddings_tokenizer: Tokenizer passed to the HybridChunker to size
            text chunks for the embedding model.
        vision_model: Model with a ``generate_content`` API (e.g. Gemini)
            used to describe extracted images.

    Returns:
        tuple: (text_chunks, table_chunks, image_descriptions) — three lists
        of LangChain ``Document`` objects. Every document across the three
        lists carries a unique integer ``doc_id`` in its metadata.
    """
    # Step 1: Define PDF processing options (OCR + keep rendered pictures
    # so they can be passed to the vision model later).
    pdf_pipeline_options = PdfPipelineOptions(
        do_ocr=True,
        generate_picture_images=True
    )
    # Step 2: Link input format to pipeline options
    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
    }
    # Step 3: Initialize the converter with format options
    converter = DocumentConverter(format_options=format_options)
    # Step 4: List of sources (can be file paths or URLs)
    sources = [file_path]
    # Step 5: Convert PDFs to structured documents
    conversions = {
        source: converter.convert(source=source).document for source in sources
    }

    # Process text chunks. doc_id is incremented BEFORE each use (walrus),
    # so texts get ids 1..len(texts).
    doc_id = 0
    texts = []
    for source, docling_document in conversions.items():
        chunker = HybridChunker(tokenizer=embeddings_tokenizer)
        for chunk in chunker.chunk(docling_document):
            items = chunk.meta.doc_items
            # Skip if chunk is just a table — tables are handled separately
            # below with a markdown export instead of plain text.
            if len(items) == 1 and isinstance(items[0], TableItem):
                continue
            # Collect references from items
            refs = "".join(item.get_ref().cref for item in items)
            text = chunk.text
            # Store as LangChain document
            document = Document(
                page_content=text,
                metadata={
                    "doc_id": (doc_id := doc_id + 1),
                    "source": source,
                    "ref": refs,
                }
            )
            texts.append(document)

    # Process tables: ids continue at len(texts)+1 .. len(texts)+len(tables).
    doc_id = len(texts)
    tables = []
    for source, docling_document in conversions.items():
        for table in docling_document.tables:
            if table.label == DocItemLabel.TABLE:
                ref = table.get_ref().cref
                text = table.export_to_markdown()
                document = Document(
                    page_content=text,
                    metadata={
                        "doc_id": (doc_id := doc_id + 1),
                        "source": source,
                        "ref": ref,
                    }
                )
                tables.append(document)

    # Process images: ids continue after the tables. NOTE: the increment
    # must happen before use (same walrus pattern as above) — using doc_id
    # first and incrementing afterwards would duplicate the last table's id.
    doc_id = len(texts) + len(tables)
    pictures = []
    for source, docling_document in conversions.items():
        for picture in docling_document.pictures:
            ref = picture.get_ref().cref
            image = picture.get_image(docling_document)
            if image:
                try:
                    # Process with Gemini
                    response = vision_model.generate_content([
                        "Extract all text and describe key visual elements in this image. "
                        "Include any numbers, labels, or important details.",
                        image
                    ])
                    # Create a document with the vision model's description
                    document = Document(
                        page_content=response.text,
                        metadata={
                            "doc_id": (doc_id := doc_id + 1),
                            "source": source,
                            "ref": ref,
                        }
                    )
                    pictures.append(document)
                except Exception as e:
                    # Best-effort: a single failing image must not abort the
                    # whole PDF; log and continue with the remaining pictures.
                    print(f"Error processing image {ref}: {str(e)}")
    return texts, tables, pictures