mansari722 committed on
Commit
4f5d54d
·
verified ·
1 Parent(s): cc6df37

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -13
app.py CHANGED
@@ -3,28 +3,24 @@ from transformers import AutoProcessor, AutoModelForVision2Seq
3
  from PIL import Image
4
  import torch
5
 
6
- # Load the model and processor
7
  model_name = "ds4sd/SmolDocling-256M-preview"
8
  processor = AutoProcessor.from_pretrained(model_name)
9
- model = AutoModelForVision2Seq.from_pretrained(
10
- model_name, torch_dtype=torch.bfloat16
11
- ).to("cuda" if torch.cuda.is_available() else "cpu")
12
 
13
- # Define the inference function
14
  def process_image(image):
15
  inputs = processor(images=image, return_tensors="pt").to(model.device)
16
- outputs = model.generate(**inputs, max_new_tokens=1024)
17
- result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
18
  return result
19
 
20
- # Create the Gradio interface
21
  iface = gr.Interface(
22
  fn=process_image,
23
- inputs=gr.inputs.Image(type="pil"),
24
  outputs="text",
25
- title="SmolDocling Document Conversion",
26
- description="Upload an image of a document page to convert it to structured text."
27
  )
28
 
29
- if __name__ == "__main__":
30
- iface.launch()
 
3
  from PIL import Image
4
  import torch
5
 
6
+ # Load model & processor
7
  model_name = "ds4sd/SmolDocling-256M-preview"
8
  processor = AutoProcessor.from_pretrained(model_name)
9
+ model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda" if torch.cuda.is_available() else "cpu")
 
 
10
 
 
11
  def process_image(image):
12
  inputs = processor(images=image, return_tensors="pt").to(model.device)
13
+ output = model.generate(**inputs, max_new_tokens=1024)
14
+ result = processor.batch_decode(output, skip_special_tokens=True)[0]
15
  return result
16
 
17
+ # Create Gradio interface
18
  iface = gr.Interface(
19
  fn=process_image,
20
+ inputs=gr.Image(type="pil"), # ✅ FIXED
21
  outputs="text",
22
+ title="SmolDocling Document Processing",
23
+ description="Upload a document image to extract text."
24
  )
25
 
26
+ iface.launch()