Spaces:
Sleeping
Sleeping
File size: 6,022 Bytes
df46f51 1b7aff0 aa63203 d887fd5 6c102e5 99c8757 df46f51 99c8757 6c102e5 aa63203 6c102e5 aa63203 1b7aff0 aa63203 df46f51 0214886 99c8757 6c102e5 aa63203 6c102e5 99c8757 1b7aff0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# import gradio as gr
# from transformers import AutoProcessor, AutoModelForImageTextToText
# from PIL import Image
# import re
# # Load SmolDocling model & processor once
# processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
# model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
# def extract_fcel_values_from_image(image, prompt_text):
# """Run SmolDocling on an image and return numeric values inside <fcel> tags."""
# # Prepare prompt for the model
# messages = [
# {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
# ]
# prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(text=prompt, images=[image], return_tensors="pt")
# # Generate output
# outputs = model.generate(**inputs, max_new_tokens=2048)
# prompt_length = inputs.input_ids.shape[1]
# generated = outputs[:, prompt_length:]
# result = processor.batch_decode(generated, skip_special_tokens=False)[0]
# clean_text = result.replace("<end_of_utterance>", "").strip()
# # Extract only <fcel> values
# values = re.findall(r"<fcel>([\d.]+)", clean_text)
# values = [float(v) for v in values] # convert to floats
# return values, clean_text
# def compare_images(image1, image2, prompt_text):
# # Extract fcel values from both images
# values1, raw1 = extract_fcel_values_from_image(image1, prompt_text)
# values2, raw2 = extract_fcel_values_from_image(image2, prompt_text)
# # Calculate accuracy
# if len(values1) == len(values2) and values1 == values2:
# accuracy = 100.0
# else:
# matches = sum(1 for a, b in zip(values1, values2) if a == b)
# total = max(len(values1), len(values2))
# accuracy = (matches / total) * 100 if total > 0 else 0
# return {
# # "Extracted Values 1": values1,
# # "Extracted Values 2": values2,
# "Accuracy (%)": accuracy
# }
# # Gradio UI
# demo = gr.Interface(
# fn=compare_images,
# inputs=[
# gr.Image(type="pil", label="Upload First Table Image"),
# gr.Image(type="pil", label="Upload Second Table Image"),
# gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
# ],
# outputs="json",
# title="Table Data Accuracy Checker (SmolDocling)",
# description="Uploads two table images, extracts only <fcel> values from OTSL output, and compares them for accuracy."
# )
# demo.launch()
# import gradio as gr
# from transformers import AutoProcessor, AutoModelForImageTextToText
# from PIL import Image
# # Load model & processor once at startup
# processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
# model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
# def smoldocling_readimage(image, prompt_text):
# messages = [
# {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
# ]
# prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(text=prompt, images=[image], return_tensors="pt")
# outputs = model.generate(**inputs, max_new_tokens=1024)
# prompt_length = inputs.input_ids.shape[1]
# generated = outputs[:, prompt_length:]
# result = processor.batch_decode(generated, skip_special_tokens=False)[0]
# return result.replace("<end_of_utterance>", "").strip()
# # Gradio UI
# demo = gr.Interface(
# fn=smoldocling_readimage,
# inputs=[
# gr.Image(type="pil", label="Upload Image"),
# gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
# ],
# outputs="html",
# title="SmolDocling Web App",
# description="Upload a document image and convert it to structured docling format."
# )
# demo.launch()
import html
import json
import math
import re

import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
# Checkpoint shared by the processor and the model. Both are loaded once at
# import time so that every request handled by the app reuses them.
MODEL_ID = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)
def parse_docling_to_json(docling_text):
    """Convert SmolDocling table markup into a JSON array of rows.

    Rows are delimited by ``<nl>``. Within each row only the text that
    follows a ``<fcel>`` tag is kept; every other tag is ignored.

    Args:
        docling_text: Decoded model output containing ``<fcel>`` cells
            separated into rows by ``<nl>``.

    Returns:
        A JSON string (``indent=2``) holding a list of rows. A row is a
        list of floats when every cell in it is a finite number, otherwise
        a list of the whitespace-stripped cell strings. A non-blank line
        without any ``<fcel>`` cell yields an empty row.
    """
    # Drop markup that carries no cell data. ``<end_of_utterance>`` has to
    # go as well: left in place it forms a non-blank trailing line with no
    # cells, which would show up as a spurious empty row at the end.
    cleaned = re.sub(
        r"</?otsl>|<loc_[^>]+>|<end_of_utterance>", "", docling_text
    )

    table = []
    for line in cleaned.split("<nl>"):
        if not line.strip():
            continue

        cells = [cell.strip() for cell in re.findall(r"<fcel>([^<]+)", line)]

        # A row is numeric only if *all* of its cells are; otherwise the
        # whole row stays textual so the columns keep a single type.
        try:
            row = [float(cell) for cell in cells]
        except ValueError:
            row = cells
        else:
            # float() also accepts "nan"/"inf"; json.dumps would render
            # those as NaN/Infinity, which is not valid JSON. Keep such
            # rows as text instead.
            if not all(math.isfinite(value) for value in row):
                row = cells

        table.append(row)

    return json.dumps(table, indent=2)
def smoldocling_readimage(image, prompt_text):
    """Run SmolDocling on an image and return the parsed table as HTML.

    Args:
        image: Image handed to the processor (the UI is configured to
            supply a PIL image), or ``None`` when nothing was uploaded.
        prompt_text: Instruction text sent to the model with the image.

    Returns:
        An HTML ``<pre>`` element containing the JSON produced by
        ``parse_docling_to_json``, HTML-escaped; or a short ``<pre>``
        notice when no image was provided.
    """
    # Without an image the processor call below would fail, so answer
    # with a readable notice instead.
    if image is None:
        return "<pre>No image provided.</pre>"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    outputs = model.generate(**inputs, max_new_tokens=1024)

    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    # Special tokens are kept because the parser relies on the markup tags.
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]

    # Parse raw docling output to JSON
    json_output = parse_docling_to_json(result)

    # Cell text comes from the model and may contain "&" or ">"; escape it
    # so it is shown literally rather than interpreted as HTML.
    return f"<pre>{html.escape(json_output, quote=False)}</pre>"
# Gradio UI: one image input and one prompt textbox, rendered as HTML output.
demo = gr.Interface(
    fn=smoldocling_readimage,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(
            lines=1,
            placeholder="Enter prompt (e.g. Convert to docling)",
            label="Prompt",
        ),
    ],
    outputs="html",
    title="SmolDocling Web App",
    description="Upload a document image and convert it to structured docling format.",
)

# Start the server only when this file is executed as a script, so that
# importing the module (e.g. from a test) does not launch a blocking web app.
if __name__ == "__main__":
    demo.launch()
|