# ---------------------------------------------------------------------------
# Legacy implementation (table accuracy checker), kept commented out for
# reference. It is not executed.
# ---------------------------------------------------------------------------
# import gradio as gr
# from transformers import AutoProcessor, AutoModelForImageTextToText
# from PIL import Image
# import re
#
# # Load SmolDocling model & processor once
# processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
# model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
#
# def extract_fcel_values_from_image(image, prompt_text):
#     """Run SmolDocling on an image and return numeric values inside tags."""
#     # Prepare prompt for the model
#     messages = [
#         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
#     ]
#     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
#     inputs = processor(text=prompt, images=[image], return_tensors="pt")
#
#     # Generate output
#     outputs = model.generate(**inputs, max_new_tokens=2048)
#     prompt_length = inputs.input_ids.shape[1]
#     generated = outputs[:, prompt_length:]
#     result = processor.batch_decode(generated, skip_special_tokens=False)[0]
#     clean_text = result.replace("", "").strip()
#
#     # Extract only values
#     values = re.findall(r"([\d.]+)", clean_text)
#     values = [float(v) for v in values]  # convert to floats
#     return values, clean_text
#
# def compare_images(image1, image2, prompt_text):
#     # Extract fcel values from both images
#     values1, raw1 = extract_fcel_values_from_image(image1, prompt_text)
#     values2, raw2 = extract_fcel_values_from_image(image2, prompt_text)
#
#     # Calculate accuracy
#     if len(values1) == len(values2) and values1 == values2:
#         accuracy = 100.0
#     else:
#         matches = sum(1 for a, b in zip(values1, values2) if a == b)
#         total = max(len(values1), len(values2))
#         accuracy = (matches / total) * 100 if total > 0 else 0
#
#     return {
#         # "Extracted Values 1": values1,
#         # "Extracted Values 2": values2,
#         "Accuracy (%)": accuracy
#     }
#
# # Gradio UI
# demo = gr.Interface(
#     fn=compare_images,
#     inputs=[
#         gr.Image(type="pil", label="Upload First Table Image"),
#         gr.Image(type="pil", label="Upload Second Table Image"),
#         gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
#     ],
#     outputs="json",
#     title="Table Data Accuracy Checker (SmolDocling)",
#     description="Uploads two table images, extracts only values from OTSL output, and compares them for accuracy."
# )
#
# demo.launch()

# ---------------------------------------------------------------------------
# Current implementation: SmolDocling image -> docling web app.
# ---------------------------------------------------------------------------
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image

# Single source of truth for the checkpoint so processor and model cannot drift.
MODEL_ID = "ds4sd/SmolDocling-256M-preview"

# Upper bound on the number of tokens generated per request.
MAX_NEW_TOKENS = 1024

# Instruction used when the user submits an empty prompt.
DEFAULT_PROMPT = "Convert this page to docling."

# End-of-turn marker removed from the decoded text.
# NOTE(review): the previous code called ``result.replace("", "")``, which is a
# no-op; the marker literal appears to have been lost. ``<end_of_utterance>`` is
# presumed to be the intended token -- confirm against the processor's tokenizer.
END_OF_UTTERANCE = "<end_of_utterance>"

# Load model & processor once at startup
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)


def smoldocling_readimage(image: Image.Image, prompt_text: str) -> str:
    """Run SmolDocling on one image and return the decoded model output.

    Args:
        image: The uploaded image as a PIL image (the UI uses ``type="pil"``).
            ``None`` is rejected with a user-visible error.
        prompt_text: Instruction sent to the model alongside the image. A
            blank or missing prompt falls back to ``DEFAULT_PROMPT``.

    Returns:
        The text generated after the prompt, with the end-of-utterance marker
        removed and surrounding whitespace stripped. Other special tokens are
        kept because decoding uses ``skip_special_tokens=False``.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Submitting without an upload would otherwise fail deep inside the
        # processor with an opaque traceback.
        raise gr.Error("Please upload an image before submitting.")

    prompt_text = (prompt_text or "").strip() or DEFAULT_PROMPT

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # The generated sequence starts with the prompt tokens; slice them off so
    # only the newly generated continuation is decoded.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    return result.replace(END_OF_UTTERANCE, "").strip()


# Gradio UI
# NOTE(review): the model output is rendered through the "html" output
# component, so any tag-like markup in it is interpreted by the browser rather
# than shown verbatim -- confirm this is the intended presentation.
demo = gr.Interface(
    fn=smoldocling_readimage,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(
            lines=1,
            placeholder="Enter prompt (e.g. Convert to docling)",
            label="Prompt",
        ),
    ],
    outputs="html",
    title="SmolDocling Web App",
    description="Upload a document image and convert it to structured docling format.",
)

if __name__ == "__main__":
    # Only start the server when run as a script, so importing this module
    # (e.g. to reuse ``smoldocling_readimage``) does not launch the UI.
    demo.launch()