# NOTE: The lines below are page-scrape residue from the Hugging Face Spaces UI
# (status badge, file size, commit hashes, line-number gutter). They are not
# Python and broke the file; preserved here as comments.
# Spaces: Sleeping
# File size: 3,885 Bytes
# df46f51 d887fd5 6c102e5 99c8757 df46f51 99c8757 6c102e5 df46f51 0214886 99c8757 6c102e5 99c8757 6c102e5
# import gradio as gr
# from transformers import AutoProcessor, AutoModelForImageTextToText
# from PIL import Image
# import re
# # Load SmolDocling model & processor once
# processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
# model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
# def extract_fcel_values_from_image(image, prompt_text):
# """Run SmolDocling on an image and return numeric values inside <fcel> tags."""
# # Prepare prompt for the model
# messages = [
# {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
# ]
# prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(text=prompt, images=[image], return_tensors="pt")
# # Generate output
# outputs = model.generate(**inputs, max_new_tokens=2048)
# prompt_length = inputs.input_ids.shape[1]
# generated = outputs[:, prompt_length:]
# result = processor.batch_decode(generated, skip_special_tokens=False)[0]
# clean_text = result.replace("<end_of_utterance>", "").strip()
# # Extract only <fcel> values
# values = re.findall(r"<fcel>([\d.]+)", clean_text)
# values = [float(v) for v in values] # convert to floats
# return values, clean_text
# def compare_images(image1, image2, prompt_text):
# # Extract fcel values from both images
# values1, raw1 = extract_fcel_values_from_image(image1, prompt_text)
# values2, raw2 = extract_fcel_values_from_image(image2, prompt_text)
# # Calculate accuracy
# if len(values1) == len(values2) and values1 == values2:
# accuracy = 100.0
# else:
# matches = sum(1 for a, b in zip(values1, values2) if a == b)
# total = max(len(values1), len(values2))
# accuracy = (matches / total) * 100 if total > 0 else 0
# return {
# # "Extracted Values 1": values1,
# # "Extracted Values 2": values2,
# "Accuracy (%)": accuracy
# }
# # Gradio UI
# demo = gr.Interface(
# fn=compare_images,
# inputs=[
# gr.Image(type="pil", label="Upload First Table Image"),
# gr.Image(type="pil", label="Upload Second Table Image"),
# gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
# ],
# outputs="json",
# title="Table Data Accuracy Checker (SmolDocling)",
# description="Uploads two table images, extracts only <fcel> values from OTSL output, and compares them for accuracy."
# )
# demo.launch()
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
# Load model & processor once at startup
# NOTE(review): downloads from the Hugging Face Hub on first run (network I/O at
# import time); subsequent runs hit the local cache.
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
def smoldocling_readimage(image, prompt_text):
    """Run SmolDocling on a single image and return the decoded text output.

    Args:
        image: PIL image of a document page (supplied by the Gradio Image input).
        prompt_text: User instruction, e.g. "Convert to docling".

    Returns:
        The generated text with the "<end_of_utterance>" marker stripped.
    """
    # Build a single-turn chat with one image slot plus the user's text prompt.
    chat = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
    }]
    templated = processor.apply_chat_template(chat, add_generation_prompt=True)
    model_inputs = processor(text=templated, images=[image], return_tensors="pt")

    generation = model.generate(**model_inputs, max_new_tokens=1024)
    # Drop the echoed prompt tokens; keep only what the model generated.
    new_tokens = generation[:, model_inputs.input_ids.shape[1]:]
    decoded = processor.batch_decode(new_tokens, skip_special_tokens=False)[0]
    return decoded.replace("<end_of_utterance>", "").strip()
# Gradio UI: one image input + one prompt textbox, text output.
demo = gr.Interface(
    fn=smoldocling_readimage,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
    ],
    # BUG FIX: "ostl" is not a valid Gradio output shortcut and made
    # gr.Interface raise at construction time; the function returns a plain
    # string, so a text component is the correct output.
    outputs="text",
    title="SmolDocling Web App",
    description="Upload a document image and convert it to structured docling format."
)
demo.launch()