# import gradio as gr # from transformers import AutoProcessor, AutoModelForImageTextToText # from PIL import Image # import re # # Load SmolDocling model & processor once # processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview") # model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview") # def extract_fcel_values_from_image(image, prompt_text): # """Run SmolDocling on an image and return numeric values inside tags.""" # # Prepare prompt for the model # messages = [ # {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]} # ] # prompt = processor.apply_chat_template(messages, add_generation_prompt=True) # inputs = processor(text=prompt, images=[image], return_tensors="pt") # # Generate output # outputs = model.generate(**inputs, max_new_tokens=2048) # prompt_length = inputs.input_ids.shape[1] # generated = outputs[:, prompt_length:] # result = processor.batch_decode(generated, skip_special_tokens=False)[0] # clean_text = result.replace("", "").strip() # # Extract only values # values = re.findall(r"([\d.]+)", clean_text) # values = [float(v) for v in values] # convert to floats # return values, clean_text # def compare_images(image1, image2, prompt_text): # # Extract fcel values from both images # values1, raw1 = extract_fcel_values_from_image(image1, prompt_text) # values2, raw2 = extract_fcel_values_from_image(image2, prompt_text) # # Calculate accuracy # if len(values1) == len(values2) and values1 == values2: # accuracy = 100.0 # else: # matches = sum(1 for a, b in zip(values1, values2) if a == b) # total = max(len(values1), len(values2)) # accuracy = (matches / total) * 100 if total > 0 else 0 # return { # "Extracted Values 1": values1, # "Extracted Values 2": values2, # "Accuracy (%)": accuracy # } # # Gradio UI # demo = gr.Interface( # fn=compare_images, # inputs=[ # gr.Image(type="pil", label="Upload First Table Image"), # gr.Image(type="pil", label="Upload Second Table Image"), 
#         gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
#     ],
#     outputs="json",
#     title="Table Data Accuracy Checker (SmolDocling)",
#     description="Uploads two table images, extracts only values from OTSL output, and compares them for accuracy."
# )
# demo.launch()

# ---------------------------------------------------------------------------
# SmolDocling web app: run the model on an uploaded image and return every
# numeric value found in the generated text as a JSON list.
# ---------------------------------------------------------------------------
import re
from typing import List, Optional, Union

import gradio as gr
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

MODEL_ID = "ds4sd/SmolDocling-256M-preview"
# Prompt used when the user leaves the textbox empty.
# NOTE(review): taken from the SmolDocling model card examples - confirm.
DEFAULT_PROMPT = "Convert this page to docling."
MAX_NEW_TOKENS = 1024

# Load model & processor once at startup so every request reuses them.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)

Number = Union[int, float]

# Markup tags such as <fcel>, </otsl> or <loc_12>. The name must start with a
# letter so that plain comparisons like "a < 5 and b > 3" are left untouched.
_TAG_RE = re.compile(r"</?[A-Za-z][\w-]*>")

# An optional sign followed by a decimal ("1.5", ".5") or an integer ("12").
# The sign is only consumed when it is not glued to a preceding word character
# or dot, so "2020-2021" yields 2020 and 2021 rather than 2020 and -2021.
_NUMBER_RE = re.compile(r"(?:(?<![\w.])[-+])?(?:\d*\.\d+|\d+)")


def _convert_num(token: str) -> Number:
    """Return *token* as an int when it is integral, otherwise as a float."""
    try:
        return int(token)
    except ValueError:
        return float(token)


def extract_numbers_from_docling(docling_text: Optional[str]) -> List[Number]:
    """Extract every numeric value from model output text.

    Markup tags are replaced by a space before matching, so digits that are
    part of a tag name (for example ``<loc_12>``) are not reported and values
    from adjacent cells (``<fcel>12<fcel>34``) are not merged into one number.

    Thousands separators are not interpreted: ``"1,234"`` yields ``1`` and
    ``234``.

    Args:
        docling_text: Raw text produced by the model; may be empty or None.

    Returns:
        The numbers in order of appearance; integers as ``int``, decimals as
        ``float``. An empty list when there is no text or no number.
    """
    if not docling_text:
        return []
    content = _TAG_RE.sub(" ", docling_text)
    return [_convert_num(token) for token in _NUMBER_RE.findall(content)]


def smoldocling_readimage(image: Optional[Image.Image], prompt_text: Optional[str]) -> List[Number]:
    """Run the model on *image* with *prompt_text* and return the numbers found.

    Args:
        image: The uploaded image (the UI is configured with ``type="pil"``).
        prompt_text: Instruction for the model; a blank value falls back to
            ``DEFAULT_PROMPT``.

    Returns:
        The numeric values extracted from the generated text.

    Raises:
        gr.Error: If no image was uploaded.
    """
    if image is None:
        # Surface a readable message in the UI instead of a processor traceback.
        raise gr.Error("Please upload an image first.")

    prompt_text = (prompt_text or "").strip() or DEFAULT_PROMPT

    messages = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # generate() returns prompt + completion; keep only the newly generated part.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]

    # Special tokens are kept while decoding; all tags are stripped later by
    # extract_numbers_from_docling, so no per-tag cleanup is needed here.
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    return extract_numbers_from_docling(result.strip())


# Gradio UI
demo = gr.Interface(
    fn=smoldocling_readimage,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
    ],
    outputs=gr.JSON(),
    title="SmolDocling Web App - Extract Numbers",
    description="Upload a document image and extract numeric values as a list.",
)

demo.launch()