import os

import cv2
import numpy as np
import pandas as pd
import requests
import streamlit as st
import torch
from PIL import Image
from segment_anything import SamPredictor, sam_model_registry
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

# Streamlit configuration
st.set_page_config(page_title="Volume Estimator", layout="wide")
st.title("Volume Estimation using SAM Segmentation + MiDaS Depth")


# Load SAM and MiDaS models (cached so they are only loaded once per session)
@st.cache_resource
def load_models():
    # Download the SAM checkpoint from Hugging Face on first run,
    # streaming it to disk instead of buffering the whole file in memory
    checkpoint_url = "https://huggingface.co/HCMUE-Research/SAM-vit-h/resolve/main/sam_vit_h_4b8939.pth"
    checkpoint_path = "sam_vit_h_4b8939.pth"
    if not os.path.exists(checkpoint_path):
        with requests.get(checkpoint_url, stream=True) as response:
            response.raise_for_status()
            with open(checkpoint_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)

    # Load SAM
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sam = sam_model_registry["vit_h"](checkpoint=checkpoint_path).to(device)
    predictor = SamPredictor(sam)

    # Load MiDaS (DPT-Large) for monocular depth estimation
    midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
    midas.to(device)
    midas.eval()
    # DPT expects input dimensions divisible by 32, so resize to a fixed
    # 384x384 rather than only constraining the shorter side
    midas_transform = Compose([
        Resize((384, 384)),
        ToTensor(),
        Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ])
    return predictor, midas, midas_transform, device


predictor, midas_model, midas_transform, device = load_models()

# Input source selection
source_option = st.radio("Select input source", ("Upload Image", "Use Webcam"))
image_pil = None

if source_option == "Upload Image":
    uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_file:
        image_pil = Image.open(uploaded_file).convert("RGB")
elif source_option == "Use Webcam":
    # st.camera_input is the idiomatic Streamlit capture widget; calling
    # st.button inside a cv2.VideoCapture while-loop raises a
    # DuplicateWidgetID error and the click is never registered
    camera_file = st.camera_input("Capture a frame")
    if camera_file:
        image_pil = Image.open(camera_file).convert("RGB")

# Continue only if an image is available
if image_pil:
    image_np = np.array(image_pil)
    img_h, img_w = image_np.shape[:2]
    st.image(image_pil, caption="Selected Image", use_container_width=True)

    # Real-world reference dimensions: the image is assumed to cover a
    # 100 cm x 75 cm scene, and object depth is capped at 100 cm; these
    # calibration constants drive the pixel-to-centimetre conversion
    real_image_width_cm = 100
    real_image_height_cm = 75
    assumed_max_depth_cm = 100
    pixel_to_cm_x = real_image_width_cm / img_w
    pixel_to_cm_y = real_image_height_cm / img_h

    # SAM segmentation. SamPredictor.predict needs a prompt, so the image
    # centre is used as a foreground point to segment the most central
    # object. (To segment every object in the scene,
    # SamAutomaticMaskGenerator would be the appropriate tool instead.)
    predictor.set_image(image_np)
    center_point = np.array([[img_w // 2, img_h // 2]])
    masks, _, _ = predictor.predict(
        point_coords=center_point,
        point_labels=np.array([1]),
        multimask_output=False,
    )

    # MiDaS depth estimation (relative inverse depth, not metric)
    input_tensor = midas_transform(image_pil).unsqueeze(0).to(device)
    with torch.no_grad():
        depth_prediction = midas_model(input_tensor).squeeze().cpu().numpy()
    depth_resized = cv2.resize(depth_prediction, (img_w, img_h))

    # Compute object volumes: each object is approximated as a box whose
    # footprint is the mask's bounding rectangle and whose depth is the
    # mean normalized MiDaS depth scaled to assumed_max_depth_cm
    volume_data = []
    for i, mask in enumerate(masks):
        x, y, w, h = cv2.boundingRect(mask.astype(np.uint8))
        width_cm = w * pixel_to_cm_x
        height_cm = h * pixel_to_cm_y

        depth_masked = depth_resized[mask > 0.5]
        if depth_masked.size == 0:
            continue
        normalized_depth = (depth_masked - np.min(depth_resized)) / (
            np.max(depth_resized) - np.min(depth_resized) + 1e-6
        )
        depth_cm = np.mean(normalized_depth) * assumed_max_depth_cm

        volume_cm3 = round(depth_cm * width_cm * height_cm, 2)
        volume_data.append({
            "Object": f"Object #{i + 1}",
            "Length (Depth)": f"{round(depth_cm, 2)} cm",
            "Breadth (Width)": f"{round(width_cm, 2)} cm",
            "Height": f"{round(height_cm, 2)} cm",
            "Volume": f"{volume_cm3} cm³",
        })

    # Display volume results
    if volume_data:
        df = pd.DataFrame(volume_data)
        st.markdown("### Object Dimensions and Volume")
        st.dataframe(df)
        csv = df.to_csv(index=False).encode("utf-8")
        st.download_button(
            "Download Volume Table as CSV",
            csv,
            "object_volumes_with_units.csv",
            "text/csv",
        )
    else:
        st.warning("No objects were segmented.")
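
# ---------------------------------------------------------------------------
# Usage note (a sketch of the assumed setup, not part of the app itself):
# the filename "volume_estimator.py" below is an assumption, and if the
# segment-anything package is unavailable on PyPI it can be installed from
# the facebookresearch/segment-anything GitHub repository instead.
#
#   pip install streamlit opencv-python torch torchvision timm \
#       segment-anything requests pandas pillow
#   streamlit run volume_estimator.py
#
# timm is needed because the intel-isl/MiDaS hub model depends on it, and
# the SAM vit_h checkpoint downloaded on first run is roughly 2.4 GB.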