"""Gradio demo that runs an uploaded document image through LayoutLMv3.

The script loads the base LayoutLMv3 checkpoint once at import time, exposes
a single ``process_document`` callback, and serves a small Blocks UI around
it. The output is a placeholder (the shape of the model's output tensor);
replace it with task-specific post-processing as needed.
"""

import json

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

# Single source of truth for the checkpoint used by both processor and model.
MODEL_NAME = "microsoft/layoutlmv3-base"

# Kept at column 0 so the Markdown list is never read as an indented code block.
INSTRUCTIONS_MD = """
### Instructions
1. Upload a document image (PNG, JPG, or JPEG).
2. Click "Process Document" to analyze the image.
3. View the results in the output section.
4. This is a basic demo; customize the output processing for specific tasks (e.g., text extraction, layout analysis).
"""


def load_model():
    """Load the LayoutLMv3 processor and model.

    Returns:
        A ``(processor, model)`` tuple. The processor is created with
        ``apply_ocr=True``, so it is expected to run OCR on the input image
        itself (this needs an OCR backend installed alongside transformers).
    """
    processor = AutoProcessor.from_pretrained(MODEL_NAME, apply_ocr=True)
    model = AutoModel.from_pretrained(MODEL_NAME)
    return processor, model


# Loaded once at import so every request reuses the same weights.
processor, model = load_model()


def process_document(image):
    """Run the uploaded image through LayoutLMv3 and summarize the output.

    Args:
        image: The uploaded document. Gradio passes a ``PIL.Image.Image``
            because the input component uses ``type="pil"``; anything else
            (including ``None`` when nothing was uploaded) is rejected.

    Returns:
        A ``(image, text)`` tuple for the two output components. On success
        ``text`` is a JSON string holding the status, the shape of the model
        output tensor and a message. On invalid input the image slot is
        ``None``; on a processing failure the original image is echoed back
        together with the error text. The function never raises.
    """
    if not isinstance(image, Image.Image):
        return None, "Error: Invalid image format. Please upload a valid image."

    # This is the UI boundary: report any failure as text instead of letting
    # the exception escape into the Gradio event handler.
    try:
        # Normalize palette / grayscale / RGBA inputs to three channels so the
        # call also works when the function is used outside the Gradio UI.
        rgb_image = image.convert("RGB")

        # truncation=True caps the token sequence at the tokenizer's maximum
        # length; without it a text-dense page can produce more tokens than
        # the model accepts and the forward pass fails.
        encoding = processor(rgb_image, truncation=True, return_tensors="pt")

        # Inference only, so skip gradient tracking.
        with torch.no_grad():
            outputs = model(**encoding)

        # Task heads expose ``logits``; the base model only has hidden states.
        if hasattr(outputs, "logits"):
            features = outputs.logits
        else:
            features = outputs.last_hidden_state

        # Placeholder result; customize for the task at hand
        # (e.g. token classification, text extraction).
        result = {
            "status": "success",
            "model_output_shape": str(features.shape),
            "message": "Document processed successfully. Customize this section for specific outputs.",
        }
        return image, json.dumps(result, indent=2)
    except Exception as exc:
        return image, f"Error processing document: {exc}"


# Gradio interface: upload + button on the left, echoed image + results on the right.
with gr.Blocks(title="Document Analysis with LayoutLMv3") as demo:
    gr.Markdown("# Document Analysis with LayoutLMv3")
    gr.Markdown("Upload a document image (PNG, JPG, JPEG) to analyze its layout and extract text.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Document Image")
            submit_button = gr.Button("Process Document")
        with gr.Column():
            image_output = gr.Image(label="Uploaded Image")
            text_output = gr.Textbox(label="Analysis Results")

    submit_button.click(
        fn=process_document,
        inputs=image_input,
        outputs=[image_output, text_output],
    )

    gr.Markdown(INSTRUCTIONS_MD)

# Launch the Gradio app
demo.launch()