Spaces:

Pavan147
/

Docling_Image

Sleeping

App Files Files Community

Docling_Image / app.py

Pavan147

Update app.py

dbef28c verified 5 days ago

raw

history blame contribute delete

5.11 kB


	# import re
	# import gradio as gr
	# from transformers import AutoProcessor, AutoModelForImageTextToText
	# from PIL import Image

	# # Load model & processor once at startup
	# processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
	# model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")

	# def smoldocling_readimage(image, prompt_text="Convert to docling"):
	# messages = [
	# {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
	# ]
	# prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
	# inputs = processor(text=prompt, images=[image], return_tensors="pt")
	# outputs = model.generate(**inputs, max_new_tokens=1024)
	# prompt_length = inputs.input_ids.shape[1]
	# generated = outputs[:, prompt_length:]
	# result = processor.batch_decode(generated, skip_special_tokens=False)[0]
	# return result.replace("<end_of_utterance>", "").strip()

	# def extract_numbers(docling_text):
	# # Extract all floating numbers from the docling text using regex
	# numbers = re.findall(r"[-+]?\d*\.\d+\|\d+", docling_text)
	# return list(map(float, numbers))

	# def compare_outputs(img1, img2):
	# # Extract docling text from both images
	# output1 = smoldocling_readimage(img1)
	# output2 = smoldocling_readimage(img2)

	# # Extract numbers from both outputs
	# nums1 = extract_numbers(output1)
	# nums2 = extract_numbers(output2)

	# # Compare numbers — find matching count based on position
	# length = min(len(nums1), len(nums2))
	# matches = sum(1 for i in range(length) if abs(nums1[i] - nums2[i]) < 1e-3)

	# # Calculate similarity accuracy percentage
	# total = max(len(nums1), len(nums2))
	# accuracy = (matches / total) * 100 if total > 0 else 0

	# # Prepare result text
	# result_text = (
	# f"Output for Image 1:\n{output1}\n\n"
	# f"Output for Image 2:\n{output2}\n\n"
	# f"Similarity Accuracy: {accuracy:.2f}%\n"
	# f"Matching Values: {matches} out of {total}"
	# )
	# return result_text

	# # Gradio UI: take 2 images, output similarity report
	# demo = gr.Interface(
	# fn=compare_outputs,
	# inputs=[
	# gr.Image(type="pil", label="Upload Image 1"),
	# gr.Image(type="pil", label="Upload Image 2"),
	# ],
	# outputs="text",
	# title="SmolDocling Image Comparison",
	# description="Upload two document images. This app extracts data from both and compares similarity."
	# )

	# demo.launch()


	import re
	import gradio as gr
	from transformers import AutoProcessor, AutoModelForImageTextToText
	from PIL import Image

	# The processor and the model are both loaded from the same checkpoint id.
	# They are created once at module level so the functions below can reuse
	# them instead of reloading on every call.
	_CHECKPOINT = "ds4sd/SmolDocling-256M-preview"
	processor = AutoProcessor.from_pretrained(_CHECKPOINT)
	model = AutoModelForImageTextToText.from_pretrained(_CHECKPOINT)

	def smoldocling_readimage(image, prompt_text="Convert to docling"):
	    """Run the module-level model on one image and return the decoded text.

	    Args:
	        image: The image handed to the processor (the UI in this file
	            supplies PIL images).
	        prompt_text: Instruction placed after the image in the user turn.

	    Returns:
	        The decoded generation with every ``<end_of_utterance>`` marker
	        removed and surrounding whitespace stripped. Other special tokens
	        are kept because decoding uses ``skip_special_tokens=False``.
	    """
	    # A single user turn: an image placeholder followed by the instruction.
	    user_turn = {
	        "role": "user",
	        "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
	    }
	    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)
	    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")

	    token_ids = model.generate(**model_inputs, max_new_tokens=1024)

	    # Skip the first prompt-length positions so only the tokens that come
	    # after the prompt are decoded.
	    new_tokens = token_ids[:, model_inputs.input_ids.shape[1]:]
	    decoded = processor.batch_decode(new_tokens, skip_special_tokens=False)
	    return decoded[0].replace("<end_of_utterance>", "").strip()

	def extract_numbers(docling_text):
	    """Return every number found in *docling_text* as a list of floats.

	    A number is an optional sign followed by either a decimal (digits
	    optional before the point, required after it) or a run of digits.
	    Matches are returned in the order they appear in the text.

	    Args:
	        docling_text: The text to scan.

	    Returns:
	        A list of floats; empty when the text contains no digits.
	    """
	    # The alternation must be an unescaped "|": an escaped pipe matches a
	    # literal "|" character, which finds no ordinary numbers and yields
	    # tokens such as "1.5|3" that float() rejects. The group makes the
	    # optional sign apply to both decimals and integers.
	    tokens = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", docling_text)
	    return [float(token) for token in tokens]

	def compare_outputs(img1, img2):
	    """Build a text report comparing the numbers extracted from two images.

	    Each image is converted to text with ``smoldocling_readimage``; the
	    numbers in each text are pulled out with ``extract_numbers`` and
	    compared position by position with an absolute tolerance of 1e-3.

	    Args:
	        img1: First image, or ``None`` when nothing was uploaded.
	        img2: Second image, or ``None`` when nothing was uploaded.

	    Returns:
	        A multi-line report containing both raw outputs, the similarity
	        percentage (matches divided by the longer list's length), the
	        match count, and the list of mismatches. If either image is
	        ``None`` a short prompt to upload both images is returned instead.
	    """
	    # An empty upload slot arrives as None; answer with a message rather
	    # than failing inside the processor.
	    if img1 is None or img2 is None:
	        return "Please upload both images before comparing."

	    output1 = smoldocling_readimage(img1)
	    output2 = smoldocling_readimage(img2)

	    nums1 = extract_numbers(output1)
	    nums2 = extract_numbers(output2)

	    matches = 0
	    mismatches = []
	    for pos, (left, right) in enumerate(zip(nums1, nums2), start=1):
	        if abs(left - right) < 1e-3:
	            matches += 1
	        else:
	            mismatches.append(f"Pos {pos}: {left} ≠ {right}")

	    # Values beyond the shorter list have no counterpart. They already
	    # lower the accuracy (total uses the longer length), so list them too;
	    # otherwise the report could claim "All values match" below 100%.
	    paired = min(len(nums1), len(nums2))
	    for pos, left in enumerate(nums1[paired:], start=paired + 1):
	        mismatches.append(f"Pos {pos}: {left} ≠ (missing)")
	    for pos, right in enumerate(nums2[paired:], start=paired + 1):
	        mismatches.append(f"Pos {pos}: (missing) ≠ {right}")

	    total = max(len(nums1), len(nums2))
	    accuracy = (matches / total) * 100 if total > 0 else 0

	    if mismatches:
	        mismatch_text = "\n".join(mismatches)
	    elif total == 0:
	        # Nothing was compared, so "all match" would be misleading.
	        mismatch_text = "No numeric values found in either output."
	    else:
	        mismatch_text = "✅ All values match."

	    return (
	        f"📄 Output for Image 1:\n{output1}\n\n"
	        f"📄 Output for Image 2:\n{output2}\n\n"
	        f"🔍 Similarity Accuracy: {accuracy:.2f}%\n"
	        f"✅ Matching Values: {matches} / {total}\n"
	        f"❌ Mismatches:\n{mismatch_text}"
	    )

	# Gradio UI: two image uploads (delivered as PIL images) go to
	# compare_outputs, and the returned report is shown as plain text.
	_first_image = gr.Image(type="pil", label="Upload Image 1")
	_second_image = gr.Image(type="pil", label="Upload Image 2")

	demo = gr.Interface(
	    fn=compare_outputs,
	    inputs=[_first_image, _second_image],
	    outputs="text",
	    title="SmolDocling Image Comparison",
	    description="Upload two document images to extract values and compare similarity, with detailed mismatches.",
	)

	demo.launch()