Pavan147 committed on
Commit
a62604d
·
verified ·
1 Parent(s): 99c8757

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -11
app.py CHANGED
@@ -1,33 +1,64 @@
1
  import gradio as gr
2
  from transformers import AutoProcessor, AutoModelForImageTextToText
3
  from PIL import Image
 
4
 
5
# Load model & processor once at startup
# The checkpoint id lives in one constant so the processor and the model are
# always loaded from the same repository.
MODEL_ID = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)
8
 
9
def smoldocling_readimage(image, prompt_text):
    """Run the model on one image with the given prompt and return its text.

    Args:
        image: Image passed to the processor together with the prompt.
        prompt_text: Instruction text placed in the user turn.

    Returns:
        The decoded generation with ``<end_of_utterance>`` removed and
        surrounding whitespace stripped.
    """
    # One user turn: an image placeholder followed by the prompt text.
    user_turn = {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
    }
    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)
    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")

    token_ids = model.generate(**model_inputs, max_new_tokens=1024)

    # Skip the first ``prompt_width`` positions so only the tokens after the
    # prompt are decoded.
    prompt_width = model_inputs.input_ids.shape[1]
    continuation = token_ids[:, prompt_width:]
    decoded = processor.batch_decode(continuation, skip_special_tokens=False)[0]
    return decoded.replace("<end_of_utterance>", "").strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  # Gradio UI
22
  demo = gr.Interface(
23
- fn=smoldocling_readimage,
24
  inputs=[
25
- gr.Image(type="pil", label="Upload Image"),
26
- gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Convert to docling)", label="Prompt"),
 
27
  ],
28
- outputs="text",
29
- title="SmolDocling Web App",
30
- description="Upload a document image and convert it to structured docling format."
31
  )
32
 
33
- demo.launch()
 
1
  import gradio as gr
2
  from transformers import AutoProcessor, AutoModelForImageTextToText
3
  from PIL import Image
4
+ import re
5
 
6
# Load SmolDocling model & processor once
# The checkpoint id lives in one constant so the processor and the model are
# always loaded from the same repository.
MODEL_ID = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)
9
 
10
def extract_fcel_values_from_image(image, prompt_text):
    """Run SmolDocling on an image and return numeric values inside <fcel> tags.

    Args:
        image: Image passed to the processor together with the prompt.
        prompt_text: Instruction text placed in the user turn.

    Returns:
        A ``(values, clean_text)`` tuple. ``values`` lists, in order of
        appearance, the floats parsed from the digit/dot run that directly
        follows each ``<fcel>`` tag; runs that are not valid floats are
        skipped. ``clean_text`` is the decoded generation with
        ``<end_of_utterance>`` removed and surrounding whitespace stripped.
    """
    # One user turn: an image placeholder followed by the prompt text.
    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    outputs = model.generate(**inputs, max_new_tokens=2048)

    # Skip the first ``prompt_length`` positions so only the tokens after the
    # prompt are decoded.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    # skip_special_tokens=False: presumably needed so the <fcel> markup
    # survives decoding - confirm against the tokenizer's special-token list.
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    clean_text = result.replace("<end_of_utterance>", "").strip()

    values = []
    for candidate in re.findall(r"<fcel>([\d.]+)", clean_text):
        try:
            values.append(float(candidate))
        except ValueError:
            # The pattern also accepts runs such as "1.2.3", "12.05.2023" or
            # "..." that float() rejects; skip them instead of failing the
            # whole request.
            continue

    return values, clean_text
31
+
32
def compare_images(image1, image2, prompt_text):
    """Extract <fcel> values from two images and score how well they agree.

    The score is positional: values are paired by index, a pair counts as a
    match when both floats are equal, and the match count is divided by the
    length of the longer list, so unpaired trailing values count as
    mismatches.

    Args:
        image1: First table image.
        image2: Second table image.
        prompt_text: Prompt forwarded unchanged to the extraction step for
            both images.

    Returns:
        A dict with the keys ``"Extracted Values 1"``, ``"Extracted Values 2"``
        and ``"Accuracy (%)"``. The accuracy is a float in ``[0, 100]`` and is
        ``0.0`` when nothing was extracted from either image, because there is
        nothing to confirm a match on.

    Raises:
        gr.Error: If either image is missing.
    """
    # An image slot left empty in the UI reaches this function as None; report
    # that to the user instead of failing inside the processor.
    if image1 is None or image2 is None:
        raise gr.Error("Please upload both table images before comparing.")

    # Only the numeric values are compared; the raw model text is not needed.
    values1, _ = extract_fcel_values_from_image(image1, prompt_text)
    values2, _ = extract_fcel_values_from_image(image2, prompt_text)

    total = max(len(values1), len(values2))
    if total == 0:
        # Two empty extractions must not be reported as a perfect match.
        accuracy = 0.0
    else:
        matches = sum(1 for a, b in zip(values1, values2) if a == b)
        accuracy = (matches / total) * 100

    return {
        "Extracted Values 1": values1,
        "Extracted Values 2": values2,
        "Accuracy (%)": accuracy,
    }
50
 
51
  # Gradio UI
52
  demo = gr.Interface(
53
+ fn=compare_images,
54
  inputs=[
55
+ gr.Image(type="pil", label="Upload First Table Image"),
56
+ gr.Image(type="pil", label="Upload Second Table Image"),
57
+ gr.Textbox(lines=1, placeholder="Enter prompt (e.g. Extract table as OTSL)", label="Prompt")
58
  ],
59
+ outputs="json",
60
+ title="Table Data Accuracy Checker (SmolDocling)",
61
+ description="Uploads two table images, extracts only <fcel> values from OTSL output, and compares them for accuracy."
62
  )
63
 
64
+ demo.launch()