Spaces:

anand004
/

Multimodal-PDF-RAG

Runtime error

Multimodal-PDF-RAG / utils.py

Update utils.py

65aad38 verified 12 months ago

1.6 kB

	import pymupdf
	from PIL import Image
	import io
	import gradio as gr
	import base64
	import pandas as pd
	import pymupdf


	def image_to_bytes(image):
	    """Encode an image as PNG and return the result as base64 text.

	    Args:
	        image: Object exposing ``save(fp, format=...)``; presumably a
	            ``PIL.Image.Image`` (confirm against callers).

	    Returns:
	        str: The PNG bytes, base64-encoded and decoded as UTF-8 text.
	    """
	    png_buffer = io.BytesIO()
	    # Render into memory so no temporary file is needed.
	    image.save(png_buffer, format="PNG")
	    raw_png = png_buffer.getvalue()
	    return str(base64.b64encode(raw_png), "utf-8")


	def extract_pdfs(docs, doc_collection):
	    """Replace the stored document list with a new upload and list its filenames.

	    Args:
	        docs: Iterable of file path strings, or ``None`` / empty when nothing
	            was uploaded (presumably the value of a Gradio file input).
	        doc_collection: The previously stored collection. It is returned
	            unchanged when ``docs`` is empty; otherwise it is discarded.

	    Returns:
	        tuple: ``(doc_collection, gr.Tabs(selected=1), DataFrame)`` where the
	        DataFrame has a single ``"Filename"`` column holding the base name of
	        each path in ``docs``.
	    """
	    from os.path import basename

	    # A cleared upload may arrive as None; treat it like an empty selection
	    # instead of failing on extend()/list().
	    uploaded = list(docs) if docs else []
	    if uploaded:
	        # A new upload replaces (does not append to) the old collection.
	        doc_collection = list(uploaded)
	    elif doc_collection is None:
	        doc_collection = []
	    return (
	        doc_collection,
	        gr.Tabs(selected=1),
	        pd.DataFrame([basename(path) for path in uploaded], columns=["Filename"]),
	    )


	def extract_images(docs):
	    """Collect the images embedded in each PDF as PIL images.

	    Args:
	        docs: Iterable of paths accepted by ``pymupdf.open``. ``None`` or an
	            empty iterable yields an empty result.

	    Returns:
	        list: One ``PIL.Image`` per entry reported by ``page.get_images()``,
	        in document order, then page order. Each image is re-encoded as JPEG
	        in memory before being opened with PIL.
	    """
	    images = []
	    for doc_path in docs or []:
	        # Context manager guarantees the document is closed, even when
	        # decoding one of its images raises.
	        with pymupdf.open(doc_path) as doc:
	            for page in doc:
	                for img in page.get_images():
	                    xref = img[0]  # first tuple entry is the image's xref
	                    pix = pymupdf.Pixmap(doc, xref)

	                    if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
	                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

	                    # The JPEG bytes are fully materialized here, so the
	                    # resulting PIL image does not depend on the open document.
	                    jpeg_bytes = pix.pil_tobytes("JPEG")
	                    images.append(Image.open(io.BytesIO(jpeg_bytes)))
	    return images


	def clean_text(text):
	    """Flatten text to a single line with single spaces between words.

	    Leading and trailing whitespace is stripped, newlines and tabs become
	    spaces, and every run of spaces is collapsed to one space. The previous
	    version replaced a space with a space (a no-op), so the runs produced by
	    the newline/tab replacement were left in the output.

	    Args:
	        text: The string to clean.

	    Returns:
	        str: The cleaned, single-line text (``""`` for blank input).
	    """
	    flattened = text.strip().replace("\n", " ").replace("\t", " ")
	    # Splitting on a literal space and dropping empty pieces collapses runs
	    # of spaces without touching other characters (e.g. "\r").
	    return " ".join(piece for piece in flattened.split(" ") if piece)