|
import streamlit as st |
|
import cv2 |
|
import numpy as np |
|
import pandas as pd |
|
from PIL import Image |
|
import torch |
|
from torchvision.transforms import Compose, Resize, ToTensor, Normalize |
|
from segment_anything import SamPredictor, sam_model_registry |
|
import requests |
|
import os |
|
|
|
|
|
# Configure the browser-tab title and the wide page layout, then render the
# heading shown at the top of the app.
st.set_page_config(
    layout="wide",
    page_title="Volume Estimator",
)
st.title("Volume Estimation using SAM Segmentation + MiDaS Depth")
|
|
|
|
|
@st.cache_resource
def load_models():
    """Load the SAM predictor, the MiDaS depth network and its preprocessing.

    The SAM ViT-H checkpoint is downloaded next to the script (relative path
    ``sam_vit_h_4b8939.pth``) when it is not already present. The function is
    wrapped in ``st.cache_resource``, so Streamlit keeps the returned objects
    instead of rebuilding them on every script rerun.

    Returns:
        tuple: ``(predictor, midas, midas_transform)`` where ``predictor`` is a
        ``SamPredictor`` around the ViT-H SAM model, ``midas`` is the
        ``DPT_Large`` model from the ``intel-isl/MiDaS`` hub repo in eval
        mode, and ``midas_transform`` is a torchvision ``Compose`` that
        resizes, converts to a tensor and normalizes an image.

    Raises:
        requests.HTTPError: If the checkpoint server answers with an error
            status.
        requests.RequestException: On connection problems or timeouts while
            downloading the checkpoint.
    """
    checkpoint_url = "https://huggingface.co/HCMUE-Research/SAM-vit-h/resolve/main/sam_vit_h_4b8939.pth"
    checkpoint_path = "sam_vit_h_4b8939.pth"

    if not os.path.exists(checkpoint_path):
        # Download to a temporary name and rename only after success, so an
        # interrupted or failed download can never be mistaken for a valid
        # checkpoint by the existence check above on the next run.
        partial_path = checkpoint_path + ".part"
        try:
            # Stream the body: the checkpoint is far too large to hold in
            # memory as a single bytes object.
            with requests.get(checkpoint_url, stream=True, timeout=60) as response:
                # Fail loudly instead of saving an HTML error page as weights.
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1 << 20):
                        f.write(chunk)
            os.replace(partial_path, checkpoint_path)
        finally:
            # After a successful rename the partial file no longer exists;
            # on any failure this removes the incomplete download.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    sam = sam_model_registry["vit_h"](checkpoint=checkpoint_path).to(device)
    predictor = SamPredictor(sam)

    # NOTE: unlike SAM, the MiDaS model is not moved to `device` here; callers
    # feed it tensors without any device transfer.
    midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
    midas.eval()
    midas_transform = Compose([
        Resize(384),
        ToTensor(),
        Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ])
    return predictor, midas, midas_transform
|
|
|
# Obtain the model handles from load_models() and bind them to the
# module-level names the rest of the script uses.
model_bundle = load_models()
predictor, midas_model, midas_transform = model_bundle
|
|
|
|
|
# ---------------------------------------------------------------------------
# Input selection: leaves `image_pil` as an RGB PIL image, or None when the
# user has not provided/captured anything yet.
# ---------------------------------------------------------------------------
source_option = st.radio("Select input source", ("Upload Image", "Use Webcam"))

uploaded_file = None
image_pil = None

if source_option == "Upload Image":
    uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        image_pil = Image.open(uploaded_file).convert("RGB")

elif source_option == "Use Webcam":
    run_camera = st.checkbox("Start Camera")

    if run_camera:
        stframe = st.empty()
        # Create the button exactly once per script run. Instantiating it
        # inside the frame loop registers the same widget again on the second
        # iteration, which Streamlit rejects as a duplicate widget.
        capture_requested = st.button("Capture Frame")

        cap = cv2.VideoCapture(0)
        try:
            if not cap.isOpened():
                st.error("Could not open the webcam.")

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # OpenCV delivers BGR frames; convert for display and PIL.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                stframe.image(frame_rgb, caption="Live Camera Feed", channels="RGB")

                # A click reruns the script; on that run the button reports
                # True and the first frame read is kept as the captured image.
                if capture_requested:
                    image_pil = Image.fromarray(frame_rgb)
                    break
        finally:
            # Always free the device: also on read failure or when the loop
            # is interrupted by an exception.
            cap.release()
|
|
|
|
|
# ---------------------------------------------------------------------------
# Segmentation + depth -> per-object dimensions and volume table.
# Runs only once an image has been selected.
# ---------------------------------------------------------------------------
if image_pil is not None:
    image_np = np.array(image_pil)
    img_h, img_w = image_np.shape[:2]
    st.image(image_pil, caption="Selected Image", use_container_width=True)

    # Fixed scene-scale assumptions: the full frame is taken to span
    # 100 cm x 75 cm, and the normalized depth range is mapped onto 0-100 cm.
    real_image_width_cm = 100
    real_image_height_cm = 75
    assumed_max_depth_cm = 100

    pixel_to_cm_x = real_image_width_cm / img_w
    pixel_to_cm_y = real_image_height_cm / img_h

    # NOTE(review): predict() is called without point/box prompts and with
    # multimask_output=False, so at most one mask is expected here — confirm
    # this is the intended segmentation mode for multi-object scenes.
    predictor.set_image(image_np)
    masks, _, _ = predictor.predict(multimask_output=False)

    # Feed the input on whatever device the MiDaS weights live on, so the
    # forward pass works regardless of where the model was placed.
    midas_device = next(midas_model.parameters()).device
    input_tensor = midas_transform(image_pil).unsqueeze(0).to(midas_device)
    with torch.no_grad():
        depth_prediction = midas_model(input_tensor).squeeze().cpu().numpy()
    # Bring the depth map back to the original image resolution so it can be
    # indexed with the SAM masks.
    depth_resized = cv2.resize(depth_prediction, (img_w, img_h))

    # The normalization range depends only on the whole depth map, so compute
    # it once instead of once per mask. The epsilon guards a constant map.
    depth_min = np.min(depth_resized)
    depth_span = np.max(depth_resized) - depth_min + 1e-6

    volume_data = []
    for i, mask in enumerate(masks):
        depth_masked = depth_resized[mask > 0.5]
        if depth_masked.size == 0:
            # Empty mask: nothing to measure.
            continue

        # Width/height come from the mask's axis-aligned bounding box.
        _, _, width_px, height_px = cv2.boundingRect(mask.astype(np.uint8))
        width_cm = width_px * pixel_to_cm_x
        height_cm = height_px * pixel_to_cm_y

        # NOTE(review): MiDaS documentation describes its output as relative
        # inverse depth — confirm that the mean normalized value scaled by
        # assumed_max_depth_cm is the intended "length" of the object.
        normalized_depth = (depth_masked - depth_min) / depth_span
        depth_cm = np.mean(normalized_depth) * assumed_max_depth_cm

        # Box-shaped approximation: depth x width x height.
        volume_cm3 = round(depth_cm * width_cm * height_cm, 2)

        volume_data.append({
            "Object": f"Object #{i+1}",
            "Length (Depth)": f"{round(depth_cm, 2)} cm",
            "Breadth (Width)": f"{round(width_cm, 2)} cm",
            "Height": f"{round(height_cm, 2)} cm",
            "Volume": f"{volume_cm3} cm³",
        })

    if volume_data:
        df = pd.DataFrame(volume_data)
        st.markdown("### Object Dimensions and Volume")
        st.dataframe(df)

        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button("Download Volume Table as CSV", csv, "object_volumes_with_units.csv", "text/csv")
    else:
        st.warning("No objects were segmented.")
|
|