import streamlit as st from transformers import AutoProcessor, AutoModelForVision2Seq from PIL import Image import torch import io # Load model and processor once @st.cache_resource def load_model(): model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" processor = AutoProcessor.from_pretrained(model_id) model = AutoModelForVision2Seq.from_pretrained(model_id).to("cuda" if torch.cuda.is_available() else "cpu") return processor, model processor, model = load_model() # Streamlit UI st.title("Aadhaar Card Information Extractor") uploaded_file = st.file_uploader("Upload Aadhaar card image", type=["jpg", "png", "jpeg"]) if uploaded_file is not None: image = Image.open(uploaded_file).convert("RGB") st.image(image, caption="Uploaded Aadhaar Card", use_column_width=True) if st.button("Extract Info"): with st.spinner("Extracting..."): prompt = ( "You are an AI system for extracting information from Indian Aadhaar cards. " "From the image, extract and return a structured JSON with:\n" "- Name\n" "- Father's Name\n" "- Date of Birth\n" "- Gender\n" "- Aadhaar Number\n" "- Address (Street, Locality, District, State, PIN)\n" "- QR code data (if visible)\n" "- Bounding box of photograph as [x1, y1, x2, y2]\n" "Respond only with JSON." ) inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device) outputs = model.generate(**inputs, max_new_tokens=512) result = processor.batch_decode(outputs, skip_special_tokens=True)[0] st.code(result, language="json")