Alessio Grancini
committed on

Update monocular_depth_estimator.py

monocular_depth_estimator.py  +122 -60
CHANGED
@@ -5,7 +5,6 @@ import time
 from midas.model_loader import default_models, load_model
 import os
 import urllib.request
-import spaces
 
 MODEL_FILE_URL = {
     "midas_v21_small_256" : "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
@@ -16,101 +15,164 @@ MODEL_FILE_URL = {
 }
 
 class MonocularDepthEstimator:
-    def __init__(self,
+    def __init__(self,
+        model_type="midas_v21_small_256",
+        model_weights_path="models/",
         optimize=False,
         side_by_side=False,
         height=None,
         square=False,
         grayscale=False):
+
+        # model type
+        # MiDaS 3.1:
+        # For highest quality: dpt_beit_large_512
+        # For moderately less quality, but better speed-performance trade-off: dpt_swin2_large_384
+        # For embedded devices: dpt_swin2_tiny_256, dpt_levit_224
+        # For inference on Intel CPUs, OpenVINO may be used for the small legacy model: openvino_midas_v21_small .xml, .bin
+
+        # MiDaS 3.0:
+        # Legacy transformer models dpt_large_384 and dpt_hybrid_384
+
+        # MiDaS 2.1:
+        # Legacy convolutional models midas_v21_384 and midas_v21_small_256
 
-        self.model_weights_path = model_weights_path
+        # params
+        print("Initializing parameters and model...")
         self.is_optimize = optimize
         self.is_square = square
         self.is_grayscale = grayscale
         self.height = height
         self.side_by_side = side_by_side
-        self.model = None
-        self.transform = None
-        self.net_w = None
-        self.net_h = None
 
+        # select device
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        print("Running inference on : %s" % self.device)
+
+        # loading model
         if not os.path.exists(model_weights_path+model_type+".pt"):
             print("Model file not found. Downloading...")
+            # Download the model file
             urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_weights_path+model_type+".pt")
             print("Model file downloaded successfully.")
 
-            self.model_type,
-            self.is_optimize,
-            self.height,
-            self.is_square
-        )
-        print("Model loaded successfully")
+        self.model, self.transform, self.net_w, self.net_h = load_model(self.device, model_weights_path+model_type+".pt",
+                                                                        model_type, optimize, height, square)
+        print("Net width and height: ", (self.net_w, self.net_h))
 
-        self.load_model_if_needed()
-        img_tensor = torch.from_numpy(image).to('cuda').unsqueeze(0)
+    def predict(self, image, model, target_size):
+
+        # convert img to tensor and load to gpu
+        img_tensor = torch.from_numpy(image).to(self.device).unsqueeze(0)
 
-        if self.is_optimize:
+        if self.is_optimize and self.device == torch.device("cuda"):
             img_tensor = img_tensor.to(memory_format=torch.channels_last)
             img_tensor = img_tensor.half()
 
-                align_corners=False,
-            )
-            .squeeze()
-            .cpu()
-            .numpy()
+        prediction = model.forward(img_tensor)
+        prediction = (
+            torch.nn.functional.interpolate(
+                prediction.unsqueeze(1),
+                size=target_size[::-1],
+                mode="bicubic",
+                align_corners=False,
             )
+            .squeeze()
+            .cpu()
+            .numpy()
+        )
 
         return prediction
 
     def process_prediction(self, depth_map):
+        """
+        Take an RGB image and depth map and place them side by side. This includes a proper normalization of the depth map
+        for better visibility.
+        Args:
+            original_img: the RGB image
+            depth_img: the depth map
+            is_grayscale: use a grayscale colormap?
+        Returns:
+            the image and depth map placed side by side
+        """
+
+        # normalizing depth image
         depth_min = depth_map.min()
         depth_max = depth_map.max()
         normalized_depth = 255 * (depth_map - depth_min) / (depth_max - depth_min)
+
+        # normalized_depth *= 3
+        # grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3
         grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2)
         depth_colormap = cv2.applyColorMap(np.uint8(grayscale_depthmap), cv2.COLORMAP_INFERNO)
 
         return normalized_depth/255, depth_colormap/255
 
-    @spaces.GPU
     def make_prediction(self, image):
-        self.load_model_if_needed()
+        image = image.copy()
+        with torch.no_grad():
+            original_image_rgb = np.flip(image, 2)  # in [0, 255] (flip required to get RGB)
+            # resizing the image to feed to the model
             image_tranformed = self.transform({"image": original_image_rgb/255})["image"]
+
+            # monocular depth prediction
+            pred = self.predict(image_tranformed, self.model, target_size=original_image_rgb.shape[1::-1])
+
+            # process the model predictions
             depthmap, depth_colormap = self.process_prediction(pred)
+        return depthmap, depth_colormap
+
+    def run(self, input_path):
+
+        # input video
+        cap = cv2.VideoCapture(input_path)
+
+        # Check if camera opened successfully
+        if not cap.isOpened():
+            print("Error opening video file")
+
+        with torch.no_grad():
+            while cap.isOpened():
+
+                # Capture frame-by-frame
+                inference_start_time = time.time()
+                ret, frame = cap.read()
+
+                if ret == True:
+                    _, depth_colormap = self.make_prediction(frame)
+                    inference_end_time = time.time()
+                    fps = round(1/(inference_end_time - inference_start_time))
+                    cv2.putText(depth_colormap, f'FPS: {fps}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (10, 255, 100), 2)
+                    cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', depth_colormap)
+
+                    # Press ESC on keyboard to exit
+                    if cv2.waitKey(1) == 27:  # Escape key
+                        break
+
+                else:
+                    break
+
+        # When everything done, release
+        # the video capture object
+        cap.release()
+
+        # Closes all the frames
+        cv2.destroyAllWindows()
+
+
+if __name__ == "__main__":
+    # params
+    INPUT_PATH = "assets/videos/testvideo2.mp4"
 
+    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
+    # set torch options
+    torch.backends.cudnn.enabled = True
+    torch.backends.cudnn.benchmark = True
+
+    depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")
+    depth_estimator.run(INPUT_PATH)
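For reference, a minimal single-image usage sketch of the updated class. This is not part of the commit: the input path is hypothetical, and the models/ directory is created explicitly because urllib.request.urlretrieve does not create missing directories before writing the downloaded weights.

    import os
    import cv2
    from monocular_depth_estimator import MonocularDepthEstimator

    # Default model_weights_path is "models/"; make sure it exists first.
    os.makedirs("models", exist_ok=True)

    # Lightest model listed in MODEL_FILE_URL; weights download on first use.
    estimator = MonocularDepthEstimator(model_type="midas_v21_small_256")

    # make_prediction expects a BGR frame, as returned by cv2.imread or cap.read().
    frame = cv2.imread("assets/images/sample.png")  # hypothetical path
    depth_map, depth_colormap = estimator.make_prediction(frame)

    # Both outputs are scaled to [0, 1]; convert to 8-bit to save the colored map.
    cv2.imwrite("depth_colormap.png", (depth_colormap * 255).astype("uint8"))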

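Because cv2.VideoCapture accepts a device index as well as a file path, the new run() loop can also be pointed at a live camera. A sketch, assuming a default webcam at index 0 and a desktop session (the loop calls cv2.imshow):

    from monocular_depth_estimator import MonocularDepthEstimator

    # 0 is the default webcam index for cv2.VideoCapture.
    depth_estimator = MonocularDepthEstimator()  # defaults to midas_v21_small_256
    depth_estimator.run(0)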