Spaces:
Sleeping
Sleeping
File size: 5,522 Bytes
df46f51 1b7aff0 d887fd5 6c102e5 99c8757 df46f51 99c8757 6c102e5 1b7aff0 df46f51 0214886 99c8757 6c102e5 1b7aff0 6c102e5 99c8757 1b7aff0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
# import gradio as gr
# from transformers import AutoProcessor, AutoModelForImageTextToText
# from PIL import Image
# import re
# # Load SmolDocling model & processor once
# processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
# model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
# def extract_fcel_values_from_image(image, prompt_text):
# """Run SmolDocling on an image and return numeric values inside <fcel> tags."""
# # Prepare prompt for the model
# messages = [
# {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
# ]
# prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(text=prompt, images=[image], return_tensors="pt")
# # Generate output
# outputs = model.generate(**inputs, max_new_tokens=2048)
# prompt_length = inputs.input_ids.shape[1]
# generated = outputs[:, prompt_length:]
# result = processor.batch_decode(generated, skip_special_tokens=False)[0]
# clean_text = result.replace("<end_of_utterance>", "").strip()
# # Extract only <fcel> values
# values = re.findall(r"<fcel>([\d.]+)", clean_text)
# values = [float(v) for v in values] # convert to floats
# return values, clean_text
# def compare_images(image1, image2, prompt_text):
# # Extract fcel values from both images
# values1, raw1 = extract_fcel_values_from_image(image1, prompt_text)
# values2, raw2 = extract_fcel_values_from_image(image2, prompt_text)
# # Calculate accuracy
# if len(values1) == len(values2) and values1 == values2:
# accuracy = 100.0
# else:
# matches = sum(1 for a, b in zip(values1, values2) if a == b)
# total = max(len(values1), len(values2))
# accuracy = (matches / total) * 100 if total > 0 else 0
# return {
# # "Extracted Values 1": values1,
# # "Extracted Values 2": values2,
# "Accuracy (%)": accuracy
# }
# # Gradio UI
# demo = gr.Interface(
# fn=compare_images,
# inputs=[
# gr.Image(type="pil", label="Upload First Table Image"),
# gr.Image(type="pil", label="Upload Second Table Image"),
# gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
# ],
# outputs="json",
# title="Table Data Accuracy Checker (SmolDocling)",
# description="Uploads two table images, extracts only <fcel> values from OTSL output, and compares them for accuracy."
# )
# demo.launch()
# import gradio as gr
# from transformers import AutoProcessor, AutoModelForImageTextToText
# from PIL import Image
# # Load model & processor once at startup
# processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
# model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
# def smoldocling_readimage(image, prompt_text):
# messages = [
# {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
# ]
# prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(text=prompt, images=[image], return_tensors="pt")
# outputs = model.generate(**inputs, max_new_tokens=1024)
# prompt_length = inputs.input_ids.shape[1]
# generated = outputs[:, prompt_length:]
# result = processor.batch_decode(generated, skip_special_tokens=False)[0]
# return result.replace("<end_of_utterance>", "").strip()
# # Gradio UI
# demo = gr.Interface(
# fn=smoldocling_readimage,
# inputs=[
# gr.Image(type="pil", label="Upload Image"),
# gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
# ],
# outputs="html",
# title="SmolDocling Web App",
# description="Upload a document image and convert it to structured docling format."
# )
# demo.launch()
import re
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
# Both the processor and the model come from the same checkpoint and are
# created a single time, when this module is first executed.
_MODEL_ID = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(_MODEL_ID)
def smoldocling_readimage(image, prompt_text):
    """Run SmolDocling on an image and return the numbers found in its output.

    The decoded model output has every ``<...>`` tag removed; each integer or
    decimal literal remaining in the text is then collected in order.

    Args:
        image: Image to analyse (assumed to be a PIL image, as supplied by the
            UI's image input), or ``None`` when no image was provided.
        prompt_text: Instruction text sent to the model together with the image.

    Returns:
        A comma-separated string of the numbers found, in order of appearance.
        Empty string when ``image`` is ``None`` or no numbers were found.
    """
    # An empty image input reaches the handler as None; bail out early instead
    # of letting the processor fail on it.
    if image is None:
        return ""

    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=1024)

    # Skip the leading positions that correspond to the prompt so that only
    # the continuation is decoded.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    raw_result = processor.batch_decode(generated, skip_special_tokens=False)[0]

    # Remove all tags like <tag> and </tag> first, so that digits embedded in
    # tag names are not picked up as values.
    text_without_tags = re.sub(r'<.*?>', '', raw_result)
    # Extract all numbers; the decimal alternative is listed first so "1.5"
    # is captured whole rather than as "1" and "5".
    numbers = re.findall(r'\d+\.\d+|\d+', text_without_tags)
    return ",".join(numbers)
# Gradio UI: one image upload plus a one-line prompt box, wired to
# smoldocling_readimage; the result is shown as plain text.
demo = gr.Interface(
    fn=smoldocling_readimage,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
    ],
    outputs="text",
    title="SmolDocling Web App",
    # The handler returns only the numbers found in the model output, joined
    # with commas, so the description states that rather than promising
    # docling markup.
    description="Upload a document image and extract the numbers it contains as a comma-separated list.",
)
demo.launch()
|