import cv2
import torch
import numpy as np
import time
from midas.model_loader import default_models, load_model
import os
import urllib.request
import spaces
MODEL_FILE_URL = {
    "midas_v21_small_256": "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
    "dpt_hybrid_384": "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt",
    "dpt_large_384": "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt",
    "dpt_swin2_large_384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt",
    "dpt_beit_large_512": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt",
}
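# The keys above are the model types accepted by MonocularDepthEstimator; the
# matching checkpoint is downloaded on first use if it is not already on disk.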
class MonocularDepthEstimator:
    def __init__(self,
                 model_type="midas_v21_small_256",
                 model_weights_path="models/",
                 optimize=False,
                 side_by_side=False,
                 height=None,
                 square=False,
                 grayscale=False):
        # Store parameters only; CUDA initialization is deferred until the
        # first prediction so the class can be constructed on CPU-only hosts
        self.model_type = model_type
        self.model_weights_path = model_weights_path
        self.is_optimize = optimize
        self.is_square = square
        self.is_grayscale = grayscale
        self.height = height
        self.side_by_side = side_by_side
        self.model = None
        self.transform = None
        self.net_w = None
        self.net_h = None
        print("Initializing parameters...")

        # Download the checkpoint if needed; creating the weights directory
        # first avoids a FileNotFoundError from urlretrieve
        model_file = os.path.join(model_weights_path, model_type + ".pt")
        if not os.path.exists(model_file):
            print("Model file not found. Downloading...")
            os.makedirs(model_weights_path, exist_ok=True)
            urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_file)
            print("Model file downloaded successfully.")
    def load_model_if_needed(self):
        # Lazy-load the model on first use so CUDA is only touched inside a
        # @spaces.GPU context
        if self.model is None:
            print("Loading MiDaS model...")
            self.model, self.transform, self.net_w, self.net_h = load_model(
                'cuda',
                os.path.join(self.model_weights_path, self.model_type + ".pt"),
                self.model_type,
                self.is_optimize,
                self.height,
                self.is_square,
            )
            print("Model loaded successfully")
            print("Net width and height: ", (self.net_w, self.net_h))
    @spaces.GPU
    def predict(self, image, target_size):
        # Load model if not loaded
        self.load_model_if_needed()

        # convert the preprocessed image to a batched tensor on the GPU
        img_tensor = torch.from_numpy(image).to('cuda').unsqueeze(0)
        if self.is_optimize:
            img_tensor = img_tensor.to(memory_format=torch.channels_last)
            img_tensor = img_tensor.half()

        # forward pass, then resize the prediction back to the original image
        # size; target_size is (width, height), interpolate expects (h, w)
        prediction = self.model(img_tensor)
        prediction = (
            torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
                size=target_size[::-1],
                mode="bicubic",
                align_corners=False,
            )
            .squeeze()
            .cpu()
            .numpy()
        )

        return prediction
    def process_prediction(self, depth_map):
        # MiDaS predicts relative inverse depth; min-max normalize it to [0, 255].
        # The epsilon guards against division by zero on a constant depth map
        depth_min = depth_map.min()
        depth_max = depth_map.max()
        normalized_depth = 255 * (depth_map - depth_min) / max(depth_max - depth_min, 1e-6)
        grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2)
        depth_colormap = cv2.applyColorMap(np.uint8(grayscale_depthmap), cv2.COLORMAP_INFERNO)
        return normalized_depth / 255, depth_colormap / 255
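    # A minimal alternative sketch for the normalization above, using OpenCV's
    # own min-max scaling; behaviorally equivalent for non-constant depth maps:
    #   gray = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
    #   colormap = cv2.applyColorMap(gray, cv2.COLORMAP_INFERNO)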
    @spaces.GPU
    def make_prediction(self, image):
        image = image.copy()
        try:
            print("Starting depth estimation...")
            with torch.no_grad():
                # OpenCV delivers BGR; flip the channel axis to get RGB in [0, 255]
                original_image_rgb = np.flip(image, 2)

                # resize and normalize the image to feed to the model
                self.load_model_if_needed()
                image_transformed = self.transform({"image": original_image_rgb / 255})["image"]

                # monocular depth prediction
                pred = self.predict(image_transformed, target_size=original_image_rgb.shape[1::-1])

                # process the model predictions
                depthmap, depth_colormap = self.process_prediction(pred)
                print("Depth estimation complete")
                return depthmap, depth_colormap
        except Exception as e:
            print(f"Error in make_prediction: {str(e)}")
            import traceback
            print(traceback.format_exc())
            raise
    @spaces.GPU
    def run(self, input_path):
        cap = cv2.VideoCapture(input_path)
        if not cap.isOpened():
            print("Error opening video file")
            return

        with torch.no_grad():
            while cap.isOpened():
                inference_start_time = time.time()
                ret, frame = cap.read()
                if ret:
                    _, depth_colormap = self.make_prediction(frame)
                    inference_end_time = time.time()
                    fps = round(1 / (inference_end_time - inference_start_time))
                    cv2.putText(depth_colormap, f'FPS: {fps}', (10, 30),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (10, 255, 100), 2)
                    cv2.imshow('MiDaS Depth Estimation - Press Escape to close window', depth_colormap)
                    if cv2.waitKey(1) == 27:  # Escape key
                        break
                else:
                    break

        cap.release()
        cv2.destroyAllWindows()
if __name__ == "__main__":
    depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")
    depth_estimator.run("assets/videos/testvideo2.mp4")
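    # A minimal single-image sketch: make_prediction also accepts one BGR frame
    # as returned by cv2.imread; the input path below is hypothetical
    image = cv2.imread("assets/images/testimage.png")
    if image is not None:
        depthmap, depth_colormap = depth_estimator.make_prediction(image)
        # process_prediction returns values in [0, 1], so rescale before saving
        cv2.imwrite("depth_colormap.png", (depth_colormap * 255).astype(np.uint8))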