"""Streamlit app: rough object volume estimation from a single image.

The script segments the image with Segment Anything (SAM), estimates a
relative depth map with MiDaS, and combines the mask's bounding box with the
mean depth inside the mask to report approximate dimensions and a volume.
"""

import os

import cv2
import numpy as np
import pandas as pd
import requests
import streamlit as st
import timm  # unused in this file; presumably needed by the MiDaS hub model
import torch
from PIL import Image
from segment_anything import SamPredictor, sam_model_registry
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

# Public Hugging Face location of the SAM ViT-H checkpoint and its local copy.
SAM_CHECKPOINT_URL = "https://huggingface.co/HCMUE-Research/SAM-vit-h/resolve/main/sam_vit_h_4b8939.pth"
SAM_CHECKPOINT_PATH = "sam_vit_h_4b8939.pth"

# Assumed real-world extent of the scene covered by the image, in centimetres.
REAL_IMAGE_WIDTH_CM = 100
REAL_IMAGE_HEIGHT_CM = 75
ASSUMED_MAX_DEPTH_CM = 100

# Session-state slot holding the most recent RGB frame shown in the preview.
LAST_FRAME_KEY = "last_camera_frame"

# Streamlit configuration
st.set_page_config(page_title="Volume Estimator", layout="wide")
st.title("Volume Estimation using SAM Segmentation + MiDaS Depth")


def _download_checkpoint(url, destination):
    """Stream ``url`` to ``destination`` without leaving a partial file behind.

    The payload is written to ``destination + ".part"`` and only moved into
    place once the transfer finished, so an interrupted or failed download is
    never mistaken for a valid checkpoint on the next run.

    Raises:
        requests.RequestException: on connection problems or an HTTP error
            status.
    """
    partial_path = destination + ".part"
    try:
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail loudly instead of saving an HTML error page as the model.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial_path, destination)
    finally:
        # Only still present when something above failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


@st.cache_resource
def load_models():
    """Load the SAM predictor and the MiDaS depth model (cached by Streamlit).

    Returns:
        A tuple ``(predictor, midas, midas_transform)``: the ``SamPredictor``,
        the MiDaS ``DPT_Large`` model in eval mode, and the torchvision
        preprocessing pipeline applied to a PIL image before MiDaS.
    """
    # Download only if not already present.
    if not os.path.exists(SAM_CHECKPOINT_PATH):
        st.info("Downloading SAM model checkpoint...")
        _download_checkpoint(SAM_CHECKPOINT_URL, SAM_CHECKPOINT_PATH)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    sam = sam_model_registry["vit_h"](checkpoint=SAM_CHECKPOINT_PATH).to(device)
    predictor = SamPredictor(sam)

    # Load MiDaS on the same device as SAM.
    midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large").to(device)
    midas.eval()

    # NOTE(review): the official MiDaS DPT transform presumably also rounds the
    # resized sides to a multiple of 32 - confirm Resize(384) alone is adequate.
    midas_transform = Compose([
        Resize(384),
        ToTensor(),
        Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ])
    return predictor, midas, midas_transform


def _read_rgb_frame(capture):
    """Read one frame from ``capture`` as an RGB array, or ``None`` on failure."""
    ok, frame = capture.read()
    if not ok:
        return None
    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)


def _grab_single_frame():
    """Open camera 0, read a single RGB frame, and release the camera."""
    capture = cv2.VideoCapture(0)
    try:
        if not capture.isOpened():
            return None
        return _read_rgb_frame(capture)
    finally:
        capture.release()


def _stream_camera_preview():
    """Show a live preview from camera 0 until the stream ends or a rerun.

    Every displayed frame is also stored in ``st.session_state`` so that the
    rerun triggered by the "Capture Frame" button can pick it up. The camera
    is released in ``finally`` so it is not left open when the loop exits.
    """
    capture = cv2.VideoCapture(0)
    try:
        placeholder = st.empty()
        while capture.isOpened():
            frame_rgb = _read_rgb_frame(capture)
            if frame_rgb is None:
                break
            st.session_state[LAST_FRAME_KEY] = frame_rgb
            placeholder.image(frame_rgb, caption="Live Camera Feed", channels="RGB")
    finally:
        capture.release()


def _compute_volume_rows(masks, depth_map, pixel_to_cm_x, pixel_to_cm_y, max_depth_cm):
    """Build one table row per non-empty mask with its estimated dimensions.

    Args:
        masks: iterable of 2-D masks aligned with ``depth_map``.
        depth_map: 2-D depth prediction resized to the image resolution.
        pixel_to_cm_x: centimetres represented by one pixel horizontally.
        pixel_to_cm_y: centimetres represented by one pixel vertically.
        max_depth_cm: depth assigned to the largest value in ``depth_map``.

    Returns:
        A list of dicts ready to be turned into a ``pandas.DataFrame``.
    """
    # Loop-invariant: the normalisation range is a property of the whole map.
    depth_min = np.min(depth_map)
    depth_span = np.max(depth_map) - depth_min + 1e-6

    rows = []
    for number, mask in enumerate(masks, start=1):
        inside = np.asarray(mask) > 0.5
        depth_masked = depth_map[inside]
        if depth_masked.size == 0:
            continue

        _, _, box_w, box_h = cv2.boundingRect(inside.astype(np.uint8))
        width_cm = box_w * pixel_to_cm_x
        height_cm = box_h * pixel_to_cm_y

        # NOTE(review): MiDaS output is presumably relative (inverse) depth,
        # so this centimetre value is a rough heuristic - confirm the intent.
        normalized_depth = (depth_masked - depth_min) / depth_span
        depth_cm = np.mean(normalized_depth) * max_depth_cm
        volume_cm3 = round(depth_cm * width_cm * height_cm, 2)

        rows.append({
            "Object": f"Object #{number}",
            "Length (Depth)": f"{round(depth_cm, 2)} cm",
            "Breadth (Width)": f"{round(width_cm, 2)} cm",
            "Height": f"{round(height_cm, 2)} cm",
            "Volume": f"{volume_cm3} cm³",
        })
    return rows


predictor, midas_model, midas_transform = load_models()

# Input source selection
source_option = st.radio("Select input source", ("Upload Image", "Use Webcam"))
uploaded_file = None
image_pil = None

if source_option == "Upload Image":
    uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        image_pil = Image.open(uploaded_file).convert("RGB")
elif source_option == "Use Webcam":
    run_camera = st.checkbox("Start Camera")
    # The button must be created once, outside the preview loop: creating it
    # per frame registers the same widget repeatedly, and a click reruns the
    # script, so the frame has to be recovered from session state afterwards.
    capture_requested = st.button("Capture Frame")
    if capture_requested:
        captured_frame = None
        if run_camera and LAST_FRAME_KEY in st.session_state:
            captured_frame = st.session_state[LAST_FRAME_KEY]
            # Consume it so a later click never reuses a stale frame.
            del st.session_state[LAST_FRAME_KEY]
        if captured_frame is None:
            captured_frame = _grab_single_frame()
        if captured_frame is None:
            st.error("Could not read a frame from the camera.")
        else:
            image_pil = Image.fromarray(captured_frame)
    elif run_camera:
        _stream_camera_preview()

# Continue only if an image is available
if image_pil is not None:
    image_np = np.array(image_pil)
    img_h, img_w = image_np.shape[:2]
    st.image(image_pil, caption="Selected Image", use_container_width=True)

    # Real-world reference dimensions
    pixel_to_cm_x = REAL_IMAGE_WIDTH_CM / img_w
    pixel_to_cm_y = REAL_IMAGE_HEIGHT_CM / img_h

    # SAM Segmentation
    # NOTE(review): predict() is called without any point/box prompt, so at
    # most one unprompted mask comes back; if several objects are expected,
    # consider SamAutomaticMaskGenerator - confirm the intended behaviour.
    predictor.set_image(image_np)
    masks, _, _ = predictor.predict(multimask_output=False)

    # MiDaS Depth Estimation (input must live on the model's device)
    midas_device = next(midas_model.parameters()).device
    input_tensor = midas_transform(image_pil).unsqueeze(0).to(midas_device)
    with torch.no_grad():
        depth_prediction = midas_model(input_tensor).squeeze().cpu().numpy()
    depth_resized = cv2.resize(depth_prediction, (img_w, img_h))

    # Compute object volumes
    volume_data = _compute_volume_rows(
        masks,
        depth_resized,
        pixel_to_cm_x,
        pixel_to_cm_y,
        ASSUMED_MAX_DEPTH_CM,
    )

    # Display volume results
    if volume_data:
        df = pd.DataFrame(volume_data)
        st.markdown("### Object Dimensions and Volume")
        st.dataframe(df)
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button("Download Volume Table as CSV", csv, "object_volumes_with_units.csv", "text/csv")
    else:
        st.warning("No objects were segmented.")