Spaces:

Hammedalmodel
/

handwritten_to_text

Running on Zero

App Files Community

Odulana Hammed committed on Jan 18

Commit

e0dc9e6

verified ·

1 Parent(s): 0020f51

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -36

app.py CHANGED Viewed

@@ -1,49 +1,51 @@
-import gradio as gr
-from transformers import AutoProcessor, MllamaForConditionalGeneration
 from PIL import Image
 import torch
-import time
 import spaces
-# Load Vision-Instruct model
 ckpt = "alpindale/Llama-3.2-11B-Vision-Instruct"
-model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
 processor = AutoProcessor.from_pretrained(ckpt)
-# Define the function to extract text from the image
 @spaces.GPU
-def extract_text_from_image(image, max_new_tokens=250):
-    """
-    Extract handwritten text from the image using Meta-Llama Vision-Instruct.
-    """
-    try:
-        # Process the image
-        inputs = processor(images=image, return_tensors="pt").to("cuda")
-        # Generate the prediction
-        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
-        # Decode the generated text
-        extracted_text = processor.decode(outputs[0], skip_special_tokens=True)
-        return extracted_text
-    except Exception as e:
-        return f"An error occurred: {str(e)}"
-# Define Gradio interface for image upload and text extraction
-title = "Handwritten Text Extraction"
-description = """
-Upload an image with handwritten text, and this app will use Meta-Llama Vision-Instruct to extract the text.
-"""
 demo = gr.Interface(
-    fn=extract_text_from_image,
-    inputs=gr.Image(type="pil", label="Upload Handwritten Image"),
     outputs=gr.Textbox(label="Extracted Text"),
-    title=title,
-    description=description,
-    live=False  # Disable live updates since the extraction will happen after the user submits
 )
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

+from transformers import MllamaForConditionalGeneration, AutoProcessor
 from PIL import Image
 import torch
+import gradio as gr
 import spaces
+# Initialize model and processor
 ckpt = "alpindale/Llama-3.2-11B-Vision-Instruct"
+model = MllamaForConditionalGeneration.from_pretrained(
+    ckpt,
+    torch_dtype=torch.bfloat16
+).to("cuda")
 processor = AutoProcessor.from_pretrained(ckpt)
 @spaces.GPU
+def extract_text(image):
+    # Convert image to RGB
+    image = Image.open(image).convert("RGB")
+    # Create message structure
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Extract handwritten text from the image"},
+                {"type": "image"}
+            ]
+        }
+    ]
+    # Process input
+    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=texts, images=[image], return_tensors="pt").to("cuda")
+    # Generate output
+    outputs = model.generate(**inputs, max_new_tokens=250)
+    result = processor.decode(outputs[0], skip_special_tokens=True)
+    return result
+# Create Gradio interface
 demo = gr.Interface(
+    fn=extract_text,
+    inputs=gr.Image(type="filepath", label="Upload Image"),
     outputs=gr.Textbox(label="Extracted Text"),
+    title="Handwritten Text Extractor",
+    description="Upload an image containing handwritten text to extract its content.",
 )
+# Launch the app
+demo.launch(debug=True)