# PDF ingestion utilities: extract text chunks, tables, and image
# descriptions from a PDF using Docling and a vision model.
import base64
import io
import itertools
import os

from PIL import Image

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc.document import TableItem
from docling_core.types.doc.labels import DocItemLabel
from langchain_core.documents import Document
def process_pdf(file_path, embeddings_tokenizer, vision_model):
    """
    Process a PDF file and extract text, tables, and images with descriptions.

    Args:
        file_path (str): Path to the PDF file.
        embeddings_tokenizer: Tokenizer passed to the HybridChunker to size
            text chunks for the embedding model.
        vision_model: Model with a ``generate_content`` API (e.g. Gemini)
            used to describe extracted images.

    Returns:
        tuple: (text_chunks, table_chunks, image_descriptions) — three lists
        of LangChain ``Document`` objects. Every document across the three
        lists carries a unique integer ``doc_id`` in its metadata.
    """
    # Step 1: Define PDF processing options (OCR + keep rendered pictures
    # so they can be passed to the vision model later).
    pdf_pipeline_options = PdfPipelineOptions(
        do_ocr=True,
        generate_picture_images=True
    )
    # Step 2: Link input format to pipeline options
    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
    }
    # Step 3: Initialize the converter with format options
    converter = DocumentConverter(format_options=format_options)
    # Step 4: List of sources (can be file paths or URLs)
    sources = [file_path]
    # Step 5: Convert PDFs to structured documents
    conversions = {
        source: converter.convert(source=source).document for source in sources
    }

    # Process text chunks. doc_id is incremented BEFORE each use (walrus),
    # so texts get ids 1..len(texts).
    doc_id = 0
    texts = []
    for source, docling_document in conversions.items():
        chunker = HybridChunker(tokenizer=embeddings_tokenizer)
        for chunk in chunker.chunk(docling_document):
            items = chunk.meta.doc_items
            # Skip if chunk is just a table — tables are handled separately
            # below with a markdown export instead of plain text.
            if len(items) == 1 and isinstance(items[0], TableItem):
                continue
            # Collect references from items
            refs = "".join(item.get_ref().cref for item in items)
            text = chunk.text
            # Store as LangChain document
            document = Document(
                page_content=text,
                metadata={
                    "doc_id": (doc_id := doc_id + 1),
                    "source": source,
                    "ref": refs,
                }
            )
            texts.append(document)

    # Process tables: ids continue at len(texts)+1 .. len(texts)+len(tables).
    doc_id = len(texts)
    tables = []
    for source, docling_document in conversions.items():
        for table in docling_document.tables:
            if table.label == DocItemLabel.TABLE:
                ref = table.get_ref().cref
                text = table.export_to_markdown()
                document = Document(
                    page_content=text,
                    metadata={
                        "doc_id": (doc_id := doc_id + 1),
                        "source": source,
                        "ref": ref,
                    }
                )
                tables.append(document)

    # Process images: ids continue after the tables. NOTE: the increment
    # must happen before use (same walrus pattern as above) — using doc_id
    # first and incrementing afterwards would duplicate the last table's id.
    doc_id = len(texts) + len(tables)
    pictures = []
    for source, docling_document in conversions.items():
        for picture in docling_document.pictures:
            ref = picture.get_ref().cref
            image = picture.get_image(docling_document)
            if image:
                try:
                    # Process with Gemini
                    response = vision_model.generate_content([
                        "Extract all text and describe key visual elements in this image. "
                        "Include any numbers, labels, or important details.",
                        image
                    ])
                    # Create a document with the vision model's description
                    document = Document(
                        page_content=response.text,
                        metadata={
                            "doc_id": (doc_id := doc_id + 1),
                            "source": source,
                            "ref": ref,
                        }
                    )
                    pictures.append(document)
                except Exception as e:
                    # Best-effort: a single failing image must not abort the
                    # whole PDF; log and continue with the remaining pictures.
                    print(f"Error processing image {ref}: {str(e)}")
    return texts, tables, pictures