viranchi123 committed on
Commit ca46f55 · verified
1 Parent(s): c315ee6

Upload 75 files

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gradio/certificate.pem +31 -0
  3. README_Unified.md +128 -0
  4. app.py +299 -0
  5. app2.py +324 -0
  6. checkpoints/labels.txt +4 -0
  7. depth_anything_v2/__pycache__/dinov2.cpython-312.pyc +0 -0
  8. depth_anything_v2/__pycache__/dpt.cpython-312.pyc +0 -0
  9. depth_anything_v2/dinov2.py +415 -0
  10. depth_anything_v2/dinov2_layers/__init__.py +11 -0
  11. depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-312.pyc +0 -0
  12. depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-312.pyc +0 -0
  13. depth_anything_v2/dinov2_layers/__pycache__/block.cpython-312.pyc +0 -0
  14. depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-312.pyc +0 -0
  15. depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-312.pyc +0 -0
  16. depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-312.pyc +0 -0
  17. depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-312.pyc +0 -0
  18. depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-312.pyc +0 -0
  19. depth_anything_v2/dinov2_layers/attention.py +83 -0
  20. depth_anything_v2/dinov2_layers/block.py +252 -0
  21. depth_anything_v2/dinov2_layers/drop_path.py +35 -0
  22. depth_anything_v2/dinov2_layers/layer_scale.py +28 -0
  23. depth_anything_v2/dinov2_layers/mlp.py +41 -0
  24. depth_anything_v2/dinov2_layers/patch_embed.py +89 -0
  25. depth_anything_v2/dinov2_layers/swiglu_ffn.py +63 -0
  26. depth_anything_v2/dpt.py +221 -0
  27. depth_anything_v2/util/__pycache__/blocks.cpython-312.pyc +0 -0
  28. depth_anything_v2/util/__pycache__/transform.cpython-312.pyc +0 -0
  29. depth_anything_v2/util/blocks.py +148 -0
  30. depth_anything_v2/util/transform.py +158 -0
  31. environment.yml +0 -0
  32. environment_export.yml +182 -0
  33. environment_from_history.yml +30 -0
  34. environment_linux.yml +116 -0
  35. keras_model_3.h5 +3 -0
  36. labels.txt +4 -0
  37. main_app.py +540 -0
  38. metric_depth/README.md +114 -0
  39. metric_depth/assets/compare_zoedepth.png +3 -0
  40. metric_depth/dataset/hypersim.py +74 -0
  41. metric_depth/dataset/kitti.py +57 -0
  42. metric_depth/dataset/splits/hypersim/val.txt +0 -0
  43. metric_depth/dataset/splits/kitti/val.txt +0 -0
  44. metric_depth/dataset/splits/vkitti2/train.txt +0 -0
  45. metric_depth/dataset/transform.py +277 -0
  46. metric_depth/dataset/vkitti2.py +54 -0
  47. metric_depth/depth_anything_v2/dinov2.py +415 -0
  48. metric_depth/depth_anything_v2/dinov2_layers/__init__.py +11 -0
  49. metric_depth/depth_anything_v2/dinov2_layers/attention.py +83 -0
  50. metric_depth/depth_anything_v2/dinov2_layers/block.py +252 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ metric_depth/assets/compare_zoedepth.png filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README_Unified.md ADDED
@@ -0,0 +1,128 @@
+ # Medical AI Suite - Unified Interface
+
+ A comprehensive web application that combines wound classification and depth estimation capabilities in a single, modern interface.
+
+ ## 🚀 Quick Start
+
+ ### Option 1: Use the Launcher (Recommended)
+ ```bash
+ python launcher.py
+ ```
+ This will show you a menu to choose which application to run.
+
+ ### Option 2: Run the Unified Interface Directly
+ ```bash
+ python main_app.py
+ ```
+
+ ### Option 3: Run Individual Applications
+ ```bash
+ # Wound Classification only
+ python app2.py
+
+ # Depth Estimation only
+ python app.py
+ ```
+
+ ## 🏥 Features
+
+ ### Tab 1: Wound Classification
+ - **AI-powered wound type classification**
+ - **Grad-CAM visualization** - See which areas the model focuses on
+ - **Confidence scores** with color-coded bars
+ - **Real-time analysis** - Results update as you upload images
+
+ ### Tab 2: Depth Estimation & 3D Visualization
+ - **Depth map generation** using DepthAnythingV2 model
+ - **Interactive 3D point cloud visualization**
+ - **Adjustable parameters** (focal length, point density)
+ - **Multiple output formats** (grayscale, raw, PLY point cloud)
+ - **Image slider comparison** between original and depth map
+
+ ## 🎨 Interface Features
+
+ - **Modern dark theme** with gradient backgrounds
+ - **Tabbed navigation** between applications
+ - **Responsive design** that works on different screen sizes
+ - **Professional medical interface** styling
+ - **Real-time feedback** and progress indicators
+
+ ## 📁 File Structure
+
+ ```
+ ├── main_app.py # Unified interface (NEW)
+ ├── launcher.py # Application launcher (NEW)
+ ├── app.py # Original depth estimation app
+ ├── app2.py # Original wound classification app
+ ├── checkpoints/
+ │ ├── keras_model.h5 # Wound classification model
+ │ └── depth_anything_v2_vitl.pth # Depth estimation model
+ ├── labels.txt # Wound classification labels
+ └── depth_anything_v2/ # Depth model implementation
+ ```
+
+ ## 🔧 Requirements
+
+ The unified interface requires all the same dependencies as the individual applications:
+
+ - `gradio`
+ - `tensorflow`
+ - `torch`
+ - `opencv-python`
+ - `pillow`
+ - `numpy`
+ - `matplotlib`
+ - `plotly`
+ - `open3d`
+ - `gradio-imageslider`
+
+ ## 🌐 Access
+
+ Once launched, the interface will be available at:
+ - **Local**: http://localhost:7860
+ - **Public**: A public link will be provided when the server starts
+
+ ## 💡 Usage Tips
+
+ ### Wound Classification
+ 1. Upload a clear image of the wound
+ 2. The model will automatically classify the wound type
+ 3. View the Grad-CAM heatmap to see which areas influenced the decision
+ 4. Check confidence scores for all possible classifications
+
+ ### Depth Estimation
+ 1. Upload an image for depth analysis
+ 2. Adjust the number of 3D points (higher = more detailed but slower)
+ 3. Set focal length parameters if you know your camera specs
+ 4. Click "Compute Depth" to generate results
+ 5. Download depth maps and point clouds as needed
+ 6. Explore the interactive 3D visualization
+
+ ## 🛠️ Troubleshooting
+
+ ### Model Loading Issues
+ If models fail to load, the interface will show appropriate error messages and continue to function with limited capabilities.
+
+ ### Performance
+ - For large images, consider reducing the number of 3D points
+ - Depth estimation works best with good lighting and clear subjects
+ - Wound classification works best with well-lit, focused images
+
+ ### Browser Compatibility
+ The interface works best with modern browsers (Chrome, Firefox, Safari, Edge).
+
+ ## 🔄 Navigation
+
+ You can easily switch between the two main functionalities using the tabs at the top of the interface. Each tab maintains its own state, so you can work on both applications simultaneously.
+
+ ## 📞 Support
+
+ If you encounter any issues:
+ 1. Check that all required model files are present
+ 2. Ensure all dependencies are installed
+ 3. Try running individual applications first to isolate issues
+ 4. Check the console output for error messages
+
+ ---
+
+ **Enjoy using the Medical AI Suite! 🏥✨**
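
The requirements list in README_Unified.md above uses pip package names that differ from their import names. A minimal pre-flight sketch, assuming the usual aliases (`cv2` for opencv-python, `PIL` for pillow, `gradio_imageslider` for gradio-imageslider); it also reports the device that app.py's `DEVICE` selection below would pick:

```python
# Pre-flight check: confirm the README's dependencies import cleanly and
# report which torch device app.py would run on. Stdlib + the packages only.
import importlib

MODULES = [
    "gradio", "tensorflow", "torch", "cv2", "PIL",
    "numpy", "matplotlib", "plotly", "open3d", "gradio_imageslider",
]

def check_imports(names):
    missing = []
    for name in names:
        try:
            importlib.import_module(name)
        except ImportError:
            missing.append(name)
    return missing

if __name__ == "__main__":
    missing = check_imports(MODULES)
    if missing:
        print("Missing packages:", ", ".join(missing))
    else:
        import torch
        # Same selection order as app.py: cuda, then mps, then cpu.
        device = (
            "cuda" if torch.cuda.is_available()
            else "mps" if torch.backends.mps.is_available()
            else "cpu"
        )
        print(f"All dependencies found; the apps would run on: {device}")
```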
app.py ADDED
@@ -0,0 +1,299 @@
1
+ import glob
2
+ import gradio as gr
3
+ import matplotlib
4
+ import numpy as np
5
+ from PIL import Image
6
+ import torch
7
+ import tempfile
8
+ from gradio_imageslider import ImageSlider
9
+ import plotly.graph_objects as go
10
+ import plotly.express as px
11
+ import open3d as o3d
12
+ from depth_anything_v2.dpt import DepthAnythingV2
13
+
14
+ css = """
15
+ #img-display-container {
16
+ max-height: 100vh;
17
+ }
18
+ #img-display-input {
19
+ max-height: 80vh;
20
+ }
21
+ #img-display-output {
22
+ max-height: 80vh;
23
+ }
24
+ #download {
25
+ height: 62px;
26
+ }
27
+ h1 {
28
+ text-align: center;
29
+ font-size: 3rem;
30
+ font-weight: bold;
31
+ margin: 2rem 0;
32
+ color: #2c3e50;
33
+ }
34
+ """
35
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
36
+ model_configs = {
37
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
38
+ 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
39
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
40
+ 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
41
+ }
42
+ encoder = 'vitl'
43
+ model = DepthAnythingV2(**model_configs[encoder])
44
+ state_dict = torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location="cpu")
45
+ model.load_state_dict(state_dict)
46
+ model = model.to(DEVICE).eval()
47
+
48
+ title = "Depth Estimation, 3D Visualization"
49
+ description = """Official demo for **Depth Estimation, 3D Visualization**."""
50
+
51
+ def predict_depth(image):
52
+ return model.infer_image(image)
53
+
54
+ def calculate_max_points(image):
55
+ """Calculate maximum points based on image dimensions (3x pixel count)"""
56
+ if image is None:
57
+ return 10000 # Default value
58
+ h, w = image.shape[:2]
59
+ max_points = h * w * 3
60
+ # Ensure minimum and reasonable maximum values
61
+ return max(1000, min(max_points, 1000000))
62
+
63
+ def update_slider_on_image_upload(image):
64
+ """Update the points slider when an image is uploaded"""
65
+ max_points = calculate_max_points(image)
66
+ default_value = min(10000, max_points // 10) # 10% of max points as default
67
+ return gr.Slider(minimum=1000, maximum=max_points, value=default_value, step=1000,
68
+ label=f"Number of 3D points (max: {max_points:,})")
69
+
70
+ def create_3d_depth_visualization(image, depth_map, max_points=10000):
71
+ """Create an interactive 3D visualization of the depth map"""
72
+ h, w = depth_map.shape
73
+
74
+ # Downsample to avoid too many points for performance
75
+ step = max(1, int(np.sqrt(h * w / max_points)))
76
+
77
+ # Create coordinate grids
78
+ y_coords, x_coords = np.mgrid[0:h:step, 0:w:step]
79
+ depth_values = depth_map[::step, ::step]
80
+
81
+ # Flatten arrays
82
+ x_flat = x_coords.flatten()
83
+ y_flat = y_coords.flatten()
84
+ z_flat = depth_values.flatten()
85
+
86
+ # Get corresponding image colors
87
+ image_colors = image[::step, ::step, :]
88
+ colors_flat = image_colors.reshape(-1, 3)
89
+
90
+ # Create 3D scatter plot
91
+ fig = go.Figure(data=[go.Scatter3d(
92
+ x=x_flat,
93
+ y=y_flat,
94
+ z=z_flat,
95
+ mode='markers',
96
+ marker=dict(
97
+ size=2,
98
+ color=colors_flat,
99
+ opacity=0.8
100
+ ),
101
+ hovertemplate='<b>Position:</b> (%{x:.0f}, %{y:.0f})<br>' +
102
+ '<b>Depth:</b> %{z:.2f}<br>' +
103
+ '<extra></extra>'
104
+ )])
105
+
106
+ fig.update_layout(
107
+ title="3D Depth Visualization (Hover to see depth values)",
108
+ scene=dict(
109
+ xaxis_title="X (pixels)",
110
+ yaxis_title="Y (pixels)",
111
+ zaxis_title="Depth",
112
+ camera=dict(
113
+ eye=dict(x=1.5, y=1.5, z=1.5)
114
+ )
115
+ ),
116
+ width=600,
117
+ height=500
118
+ )
119
+
120
+ return fig
121
+
122
+ def create_point_cloud(image, depth_map, focal_length_x=470.4, focal_length_y=470.4, max_points=100000):
123
+ """Create a point cloud from depth map using camera intrinsics"""
124
+ h, w = depth_map.shape
125
+
126
+ # Downsample to avoid too many points for performance
127
+ step = max(1, int(np.sqrt(h * w / max_points)))
128
+
129
+ # Create mesh grid for camera coordinates
130
+ y_coords, x_coords = np.mgrid[0:h:step, 0:w:step]
131
+
132
+ # Convert to camera coordinates (normalized by focal length)
133
+ x_cam = (x_coords - w / 2) / focal_length_x
134
+ y_cam = (y_coords - h / 2) / focal_length_y
135
+
136
+ # Get depth values
137
+ depth_values = depth_map[::step, ::step]
138
+
139
+ # Calculate 3D points: (x_cam * depth, y_cam * depth, depth)
140
+ x_3d = x_cam * depth_values
141
+ y_3d = y_cam * depth_values
142
+ z_3d = depth_values
143
+
144
+ # Flatten arrays
145
+ points = np.stack([x_3d.flatten(), y_3d.flatten(), z_3d.flatten()], axis=1)
146
+
147
+ # Get corresponding image colors
148
+ image_colors = image[::step, ::step, :]
149
+ colors = image_colors.reshape(-1, 3) / 255.0
150
+
151
+ # Create Open3D point cloud
152
+ pcd = o3d.geometry.PointCloud()
153
+ pcd.points = o3d.utility.Vector3dVector(points)
154
+ pcd.colors = o3d.utility.Vector3dVector(colors)
155
+
156
+ return pcd
157
+
158
+ def create_enhanced_3d_visualization(image, depth_map, max_points=10000):
159
+ """Create an enhanced 3D visualization using proper camera projection"""
160
+ h, w = depth_map.shape
161
+
162
+ # Downsample to avoid too many points for performance
163
+ step = max(1, int(np.sqrt(h * w / max_points)))
164
+
165
+ # Create mesh grid for camera coordinates
166
+ y_coords, x_coords = np.mgrid[0:h:step, 0:w:step]
167
+
168
+ # Convert to camera coordinates (normalized by focal length)
169
+ focal_length = 470.4 # Default focal length
170
+ x_cam = (x_coords - w / 2) / focal_length
171
+ y_cam = (y_coords - h / 2) / focal_length
172
+
173
+ # Get depth values
174
+ depth_values = depth_map[::step, ::step]
175
+
176
+ # Calculate 3D points: (x_cam * depth, y_cam * depth, depth)
177
+ x_3d = x_cam * depth_values
178
+ y_3d = y_cam * depth_values
179
+ z_3d = depth_values
180
+
181
+ # Flatten arrays
182
+ x_flat = x_3d.flatten()
183
+ y_flat = y_3d.flatten()
184
+ z_flat = z_3d.flatten()
185
+
186
+ # Get corresponding image colors
187
+ image_colors = image[::step, ::step, :]
188
+ colors_flat = image_colors.reshape(-1, 3)
189
+
190
+ # Create 3D scatter plot with proper camera projection
191
+ fig = go.Figure(data=[go.Scatter3d(
192
+ x=x_flat,
193
+ y=y_flat,
194
+ z=z_flat,
195
+ mode='markers',
196
+ marker=dict(
197
+ size=1.5,
198
+ color=colors_flat,
199
+ opacity=0.9
200
+ ),
201
+ hovertemplate='<b>3D Position:</b> (%{x:.3f}, %{y:.3f}, %{z:.3f})<br>' +
202
+ '<b>Depth:</b> %{z:.2f}<br>' +
203
+ '<extra></extra>'
204
+ )])
205
+
206
+ fig.update_layout(
207
+ title="3D Point Cloud Visualization (Camera Projection)",
208
+ scene=dict(
209
+ xaxis_title="X (meters)",
210
+ yaxis_title="Y (meters)",
211
+ zaxis_title="Z (meters)",
212
+ camera=dict(
213
+ eye=dict(x=2.0, y=2.0, z=2.0),
214
+ center=dict(x=0, y=0, z=0),
215
+ up=dict(x=0, y=0, z=1)
216
+ ),
217
+ aspectmode='data'
218
+ ),
219
+ width=700,
220
+ height=600
221
+ )
222
+
223
+ return fig
224
+
225
+ with gr.Blocks(css=css) as demo:
226
+ gr.HTML(f"<h1>{title}</h1>")
227
+ gr.Markdown(description)
228
+ gr.Markdown("### Depth Prediction demo")
229
+
230
+ with gr.Row():
231
+ input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input')
232
+ depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output')
233
+
234
+ with gr.Row():
235
+ submit = gr.Button(value="Compute Depth", variant="primary")
236
+ points_slider = gr.Slider(minimum=1000, maximum=10000, value=10000, step=1000,
237
+ label="Number of 3D points (upload image to update max)")
238
+
239
+ with gr.Row():
240
+ focal_length_x = gr.Slider(minimum=100, maximum=1000, value=470.4, step=10,
241
+ label="Focal Length X (pixels)")
242
+ focal_length_y = gr.Slider(minimum=100, maximum=1000, value=470.4, step=10,
243
+ label="Focal Length Y (pixels)")
244
+
245
+ with gr.Row():
246
+ gray_depth_file = gr.File(label="Grayscale depth map", elem_id="download")
247
+ raw_file = gr.File(label="16-bit raw output (can be considered as disparity)", elem_id="download")
248
+ point_cloud_file = gr.File(label="Point Cloud (.ply)", elem_id="download")
249
+
250
+ # 3D Visualization
251
+ gr.Markdown("### 3D Point Cloud Visualization")
252
+ gr.Markdown("Enhanced 3D visualization using proper camera projection. Hover over points to see 3D coordinates.")
253
+ depth_3d_plot = gr.Plot(label="3D Point Cloud")
254
+
255
+ cmap = matplotlib.colormaps.get_cmap('Spectral_r')
256
+
257
+ def on_submit(image, num_points, focal_x, focal_y):
258
+ original_image = image.copy()
259
+
260
+ h, w = image.shape[:2]
261
+
262
+ depth = predict_depth(image[:, :, ::-1])
263
+
264
+ raw_depth = Image.fromarray(depth.astype('uint16'))
265
+ tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
266
+ raw_depth.save(tmp_raw_depth.name)
267
+
268
+ depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
269
+ depth = depth.astype(np.uint8)
270
+ colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)
271
+
272
+ gray_depth = Image.fromarray(depth)
273
+ tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
274
+ gray_depth.save(tmp_gray_depth.name)
275
+
276
+ # Create point cloud
277
+ pcd = create_point_cloud(original_image, depth, focal_x, focal_y, max_points=num_points)
278
+ tmp_pointcloud = tempfile.NamedTemporaryFile(suffix='.ply', delete=False)
279
+ o3d.io.write_point_cloud(tmp_pointcloud.name, pcd)
280
+
281
+ # Create enhanced 3D visualization
282
+ depth_3d = create_enhanced_3d_visualization(original_image, depth, max_points=num_points)
283
+
284
+ return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name, tmp_pointcloud.name, depth_3d]
285
+
286
+ # Update slider when image is uploaded
287
+ input_image.change(
288
+ fn=update_slider_on_image_upload,
289
+ inputs=[input_image],
290
+ outputs=[points_slider]
291
+ )
292
+
293
+ submit.click(on_submit, inputs=[input_image, points_slider, focal_length_x, focal_length_y],
294
+ outputs=[depth_image_slider, gray_depth_file, raw_file, point_cloud_file, depth_3d_plot])
295
+
296
+
297
+
298
+ if __name__ == '__main__':
299
+ demo.queue().launch()
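
The point-cloud step in `create_point_cloud` above is a standard pinhole back-projection: a pixel (u, v) with depth z maps to ((u - cx) / fx * z, (v - cy) / fy * z, z), with the principal point taken at the image centre. A self-contained sketch of just that math on a synthetic depth map (no model or checkpoint needed; 470.4 px mirrors the app's default focal length):

```python
import numpy as np

def backproject(depth, fx=470.4, fy=470.4):
    """Back-project a depth map of shape (H, W) to an (N, 3) array of camera-space points."""
    h, w = depth.shape
    v, u = np.mgrid[0:h, 0:w]            # pixel row (v) and column (u) grids
    x = (u - w / 2) / fx * depth         # X = (u - cx) / fx * Z
    y = (v - h / 2) / fy * depth         # Y = (v - cy) / fy * Z
    return np.stack([x.ravel(), y.ravel(), depth.ravel()], axis=1)

if __name__ == "__main__":
    # Synthetic depth: a plane tilted along the x axis, 1 to 2 units deep.
    depth = np.tile(np.linspace(1.0, 2.0, 64), (48, 1))
    pts = backproject(depth)
    print(pts.shape)                      # (3072, 3)
    print(pts[:, 2].min(), pts[:, 2].max())  # 1.0 2.0
```

The app attaches per-pixel RGB to these points before writing the PLY with Open3D; the geometry itself is only the three lines inside `backproject`.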
app2.py ADDED
@@ -0,0 +1,324 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow.keras.models import load_model
5
+ from tensorflow.keras.preprocessing import image as keras_image
6
+ from tensorflow.keras import backend as K
7
+ import matplotlib.pyplot as plt
8
+ from PIL import Image
9
+ import io
10
+ import cv2
11
+
12
+ # --- Load model and labels ---
13
+ model = load_model("checkpoints/keras_model.h5")
14
+ with open("labels.txt", "r") as f:
15
+ class_labels = [line.strip() for line in f]
16
+
17
+ # --- Preprocess input ---
18
+ def preprocess_input(img):
19
+ img = img.resize((224, 224))
20
+ arr = keras_image.img_to_array(img)
21
+ arr = arr / 255.0
22
+ return np.expand_dims(arr, axis=0)
23
+
24
+ # --- Enhanced Grad-CAM implementation for Keras ---
25
+ def get_gradcam_heatmap(img_array, model, class_index, last_conv_layer_name="conv5_block3_out"):
26
+ try:
27
+ # Try to find the specified layer
28
+ target_layer = model.get_layer(last_conv_layer_name)
29
+ except:
30
+ # Fallback: find any convolutional layer
31
+ for layer in model.layers:
32
+ if 'conv' in layer.name.lower():
33
+ target_layer = layer
34
+ break
35
+ else:
36
+ return None
37
+
38
+ grad_model = tf.keras.models.Model(
39
+ [model.inputs], [target_layer.output, model.output]
40
+ )
41
+
42
+ with tf.GradientTape() as tape:
43
+ conv_outputs, predictions = grad_model(img_array)
44
+ loss = predictions[:, class_index]
45
+
46
+ grads = tape.gradient(loss, conv_outputs)[0]
47
+ pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))
48
+ conv_outputs = conv_outputs[0]
49
+
50
+ heatmap = tf.reduce_sum(tf.multiply(pooled_grads, conv_outputs), axis=-1)
51
+ heatmap = np.maximum(heatmap, 0)
52
+ heatmap = heatmap / np.max(heatmap + K.epsilon())
53
+ return heatmap.numpy()
54
+
55
+ # --- Enhanced Overlay heatmap on image ---
56
+ def overlay_gradcam(original_img, heatmap):
57
+ if heatmap is None:
58
+ return original_img
59
+
60
+ # Resize heatmap
61
+ heatmap = cv2.resize(heatmap, original_img.size)
62
+
63
+ # Normalize safely
64
+ heatmap = np.maximum(heatmap, 0)
65
+ if np.max(heatmap) != 0:
66
+ heatmap /= np.max(heatmap)
67
+ heatmap = np.uint8(255 * heatmap)
68
+
69
+ # Apply JET colormap for better medical visualization
70
+ heatmap_color = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
71
+
72
+ # Convert PIL to array
73
+ original_array = np.array(original_img.convert("RGB"))
74
+
75
+ # Enhanced blend with better contrast
76
+ superimposed_img = cv2.addWeighted(original_array, 0.6, heatmap_color, 0.4, 0)
77
+
78
+ return Image.fromarray(superimposed_img)
79
+
80
+ # --- Enhanced Prediction Function ---
81
+ def classify_and_explain(img):
82
+ if img is None:
83
+ return None, {}, "No image provided"
84
+
85
+ img_array = preprocess_input(img)
86
+ predictions = model.predict(img_array, verbose=0)[0]
87
+ pred_idx = int(np.argmax(predictions))
88
+ pred_class = class_labels[pred_idx]
89
+ confidence_dict = {class_labels[i]: float(predictions[i]) for i in range(len(class_labels))}
90
+
91
+ # Enhanced Grad-CAM
92
+ try:
93
+ heatmap = get_gradcam_heatmap(img_array, model, pred_idx)
94
+ gradcam_img = overlay_gradcam(img.resize((224, 224)), heatmap)
95
+ except Exception as e:
96
+ print(f"Grad-CAM error: {e}")
97
+ gradcam_img = img.resize((224, 224)) # fallback image
98
+
99
+ return gradcam_img, confidence_dict
100
+
101
+ # --- Custom CSS for Dark Mode Medical Interface ---
102
+ css = """
103
+ .gradio-container {
104
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
105
+ background: #1a1a1a;
106
+ min-height: 100vh;
107
+ padding: 20px;
108
+ color: #ffffff;
109
+ }
110
+
111
+ .main-header {
112
+ text-align: center;
113
+ color: white;
114
+ margin-bottom: 2rem;
115
+ padding: 2rem 0;
116
+ }
117
+
118
+ .main-header h1 {
119
+ font-size: 2.5rem;
120
+ margin-bottom: 0.5rem;
121
+ text-shadow: 2px 2px 4px rgba(0,0,0,0.5);
122
+ color: #ffffff;
123
+ }
124
+
125
+ .confidence-bar {
126
+ background: linear-gradient(90deg, #3498db 0%, #2ecc71 100%);
127
+ height: 25px;
128
+ border-radius: 12px;
129
+ margin: 8px 0;
130
+ transition: all 0.3s ease;
131
+ box-shadow: 0 2px 4px rgba(0,0,0,0.3);
132
+ }
133
+
134
+ .confidence-container {
135
+ margin: 15px 0;
136
+ padding: 20px;
137
+ border-radius: 12px;
138
+ background: rgba(255,255,255,0.1);
139
+ backdrop-filter: blur(10px);
140
+ box-shadow: 0 8px 32px rgba(0,0,0,0.3);
141
+ border: 1px solid rgba(255,255,255,0.1);
142
+ }
143
+
144
+ .input-section, .output-section {
145
+ background: rgba(255,255,255,0.05);
146
+ padding: 25px;
147
+ border-radius: 15px;
148
+ margin: 15px;
149
+ backdrop-filter: blur(10px);
150
+ box-shadow: 0 8px 32px rgba(0,0,0,0.3);
151
+ border: 1px solid rgba(255,255,255,0.1);
152
+ }
153
+
154
+ .section-title {
155
+ color: #ffffff;
156
+ font-size: 1.3rem;
157
+ font-weight: 600;
158
+ margin-bottom: 15px;
159
+ border-bottom: 2px solid #3498db;
160
+ padding-bottom: 8px;
161
+ }
162
+
163
+ .gradio-button {
164
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
165
+ border: none;
166
+ color: white;
167
+ padding: 12px 24px;
168
+ border-radius: 25px;
169
+ font-weight: 600;
170
+ transition: all 0.3s ease;
171
+ box-shadow: 0 4px 15px rgba(0,0,0,0.3);
172
+ }
173
+
174
+ .gradio-button:hover {
175
+ transform: translateY(-2px);
176
+ box-shadow: 0 6px 20px rgba(0,0,0,0.4);
177
+ }
178
+
179
+ .gradio-image {
180
+ border-radius: 12px;
181
+ box-shadow: 0 4px 15px rgba(0,0,0,0.3);
182
+ border: 1px solid rgba(255,255,255,0.1);
183
+ }
184
+
185
+ .gradio-textbox, .gradio-number {
186
+ border-radius: 8px;
187
+ border: 2px solid #333333;
188
+ padding: 12px;
189
+ font-size: 1rem;
190
+ background: rgba(255,255,255,0.05);
191
+ color: #ffffff;
192
+ }
193
+
194
+ .gradio-textbox:focus, .gradio-number:focus {
195
+ border-color: #3498db;
196
+ box-shadow: 0 0 0 0.2rem rgba(52,152,219,0.25);
197
+ }
198
+
199
+ .gradio-label {
200
+ color: #ffffff !important;
201
+ }
202
+
203
+ .heatmap-container {
204
+ background: rgba(255,255,255,0.05);
205
+ padding: 15px;
206
+ border-radius: 12px;
207
+ border: 1px solid rgba(255,255,255,0.1);
208
+ margin: 10px 0;
209
+ }
210
+
211
+ .prediction-container {
212
+ background: rgba(52,152,219,0.1);
213
+ padding: 20px;
214
+ border-radius: 12px;
215
+ border-left: 5px solid #3498db;
216
+ margin: 15px 0;
217
+ }
218
+ """
219
+
220
+ # --- Function to create confidence bars HTML ---
221
+ def create_confidence_bars(confidence_dict):
222
+ html_content = "<div class='confidence-container'>"
223
+ for class_name, confidence in confidence_dict.items():
224
+ percentage = confidence * 100
225
+ # Color coding based on confidence
226
+ if percentage > 70:
227
+ color = "#28a745" # Green for high confidence
228
+ elif percentage > 40:
229
+ color = "#ffc107" # Yellow for medium confidence
230
+ else:
231
+ color = "#dc3545" # Red for low confidence
232
+
233
+ html_content += f"""
234
+ <div style='margin: 12px 0;'>
235
+ <div style='display: flex; justify-content: space-between; margin-bottom: 8px;'>
236
+ <span style='font-weight: bold; color: {color};'>{class_name}</span>
237
+ <span style='font-weight: bold; color: {color};'>{percentage:.1f}%</span>
238
+ </div>
239
+ <div class='confidence-bar' style='width: {percentage}%; background: {color};'></div>
240
+ </div>
241
+ """
242
+ html_content += "</div>"
243
+ return html_content
244
+
245
+ # --- Enhanced Prediction Function with Dark Mode Interface ---
246
+ def enhanced_classify_and_explain(img):
247
+ if img is None:
248
+ return None, "No image provided", 0, ""
249
+
250
+ gradcam_img, confidence_dict = classify_and_explain(img)
251
+
252
+ # Get predicted class and confidence
253
+ pred_class = max(confidence_dict, key=confidence_dict.get)
254
+ confidence = confidence_dict[pred_class]
255
+
256
+ # Create confidence bars HTML
257
+ confidence_bars_html = create_confidence_bars(confidence_dict)
258
+
259
+ return gradcam_img, pred_class, confidence, confidence_bars_html
260
+
261
+ # --- Enhanced Gradio Interface ---
262
+ with gr.Blocks(css=css, title="Wound Classification") as demo:
263
+ gr.HTML("""
264
+ <div class="main-header">
265
+ <h1>Wound Classification</h1>
266
+ </div>
267
+ """)
268
+
269
+ with gr.Row():
270
+ with gr.Column(scale=1):
271
+ gr.HTML("<div class='section-title'>Input Image</div>")
272
+ input_image = gr.Image(
273
+ label="Upload wound image",
274
+ type="pil",
275
+ height=350,
276
+ container=True
277
+ )
278
+
279
+ with gr.Column(scale=1):
280
+ gr.HTML("<div class='section-title'>Analysis Results</div>")
281
+
282
+ # Prediction results
283
+ prediction_output = gr.Textbox(
284
+ label="Predicted Wound Type",
285
+ interactive=False,
286
+ container=True
287
+ )
288
+
289
+ confidence_output = gr.Number(
290
+ label="Confidence Score",
291
+ interactive=False,
292
+ container=True
293
+ )
294
+
295
+ # Confidence bars for all classes
296
+ confidence_bars = gr.HTML(
297
+ label="Confidence Scores by Class",
298
+ container=True
299
+ )
300
+
301
+ with gr.Row():
302
+ with gr.Column():
303
+ gr.HTML("<div class='section-title'>Model Focus Visualization</div>")
304
+ cam_output = gr.Image(
305
+ label="Grad-CAM Heatmap - Shows which areas the model focused on",
306
+ height=350,
307
+ container=True
308
+ )
309
+
310
+ # Event handlers
311
+ input_image.change(
312
+ fn=enhanced_classify_and_explain,
313
+ inputs=[input_image],
314
+ outputs=[cam_output, prediction_output, confidence_output, confidence_bars]
315
+ )
316
+
317
+ # --- Launch the enhanced interface ---
318
+ if __name__ == "__main__":
319
+ demo.launch(
320
+ server_name="0.0.0.0",
321
+ server_port=7860,
322
+ share=True,
323
+ show_error=True
324
+ )
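
app2.py ties preprocessing, prediction and Grad-CAM to the Gradio callbacks. For reference, a minimal command-line sketch of just the classification path, assuming `checkpoints/keras_model.h5` and `labels.txt` are present as the code above expects (paths taken from the code, not re-verified here):

```python
import sys
import numpy as np
from PIL import Image
from tensorflow.keras.models import load_model

def classify(image_path, model_path="checkpoints/keras_model.h5", labels_path="labels.txt"):
    model = load_model(model_path)
    with open(labels_path) as f:
        # Lines look like "0 Burns"; keep the whole stripped line, matching app2.py.
        labels = [line.strip() for line in f]

    # Same preprocessing as app2.py: 224x224 RGB, scaled to [0, 1], batch dimension added.
    img = Image.open(image_path).convert("RGB").resize((224, 224))
    arr = np.expand_dims(np.asarray(img, dtype=np.float32) / 255.0, axis=0)

    probs = model.predict(arr, verbose=0)[0]
    best = int(np.argmax(probs))
    return labels[best], float(probs[best])

if __name__ == "__main__":
    label, confidence = classify(sys.argv[1])
    print(f"{label}: {confidence:.1%}")
```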
checkpoints/labels.txt ADDED
@@ -0,0 +1,4 @@
+ 0 Burns
+ 1 Surgical Wound
+ 2 Traumatic Wound
+ 3 Diabetic Foot Ulcer
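
Each line of labels.txt is an `index name` pair; app2.py keeps the whole stripped line as the display label. If only the class name is wanted, the two fields split cleanly. A small sketch, using the `checkpoints/labels.txt` path shown above:

```python
# Parse "index name" pairs into an {index: name} mapping.
with open("checkpoints/labels.txt") as f:
    entries = [line.strip().split(" ", 1) for line in f if line.strip()]

labels = {int(idx): name for idx, name in entries}
print(labels)
# {0: 'Burns', 1: 'Surgical Wound', 2: 'Traumatic Wound', 3: 'Diabetic Foot Ulcer'}
```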
depth_anything_v2/__pycache__/dinov2.cpython-312.pyc ADDED
Binary file (18.7 kB).
 
depth_anything_v2/__pycache__/dpt.cpython-312.pyc ADDED
Binary file (10.6 kB).
 
depth_anything_v2/dinov2.py ADDED
@@ -0,0 +1,415 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
27
+ if not depth_first and include_root:
28
+ fn(module=module, name=name)
29
+ for child_name, child_module in module.named_children():
30
+ child_name = ".".join((name, child_name)) if name else child_name
31
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
32
+ if depth_first and include_root:
33
+ fn(module=module, name=name)
34
+ return module
35
+
36
+
37
+ class BlockChunk(nn.ModuleList):
38
+ def forward(self, x):
39
+ for b in self:
40
+ x = b(x)
41
+ return x
42
+
43
+
44
+ class DinoVisionTransformer(nn.Module):
45
+ def __init__(
46
+ self,
47
+ img_size=224,
48
+ patch_size=16,
49
+ in_chans=3,
50
+ embed_dim=768,
51
+ depth=12,
52
+ num_heads=12,
53
+ mlp_ratio=4.0,
54
+ qkv_bias=True,
55
+ ffn_bias=True,
56
+ proj_bias=True,
57
+ drop_path_rate=0.0,
58
+ drop_path_uniform=False,
59
+ init_values=None, # for layerscale: None or 0 => no layerscale
60
+ embed_layer=PatchEmbed,
61
+ act_layer=nn.GELU,
62
+ block_fn=Block,
63
+ ffn_layer="mlp",
64
+ block_chunks=1,
65
+ num_register_tokens=0,
66
+ interpolate_antialias=False,
67
+ interpolate_offset=0.1,
68
+ ):
69
+ """
70
+ Args:
71
+ img_size (int, tuple): input image size
72
+ patch_size (int, tuple): patch size
73
+ in_chans (int): number of input channels
74
+ embed_dim (int): embedding dimension
75
+ depth (int): depth of transformer
76
+ num_heads (int): number of attention heads
77
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
78
+ qkv_bias (bool): enable bias for qkv if True
79
+ proj_bias (bool): enable bias for proj in attn if True
80
+ ffn_bias (bool): enable bias for ffn if True
81
+ drop_path_rate (float): stochastic depth rate
82
+ drop_path_uniform (bool): apply uniform drop rate across blocks
83
+ weight_init (str): weight init scheme
84
+ init_values (float): layer-scale init values
85
+ embed_layer (nn.Module): patch embedding layer
86
+ act_layer (nn.Module): MLP activation layer
87
+ block_fn (nn.Module): transformer block class
88
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
89
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
90
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
91
+ interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
92
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
93
+ """
94
+ super().__init__()
95
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
96
+
97
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
98
+ self.num_tokens = 1
99
+ self.n_blocks = depth
100
+ self.num_heads = num_heads
101
+ self.patch_size = patch_size
102
+ self.num_register_tokens = num_register_tokens
103
+ self.interpolate_antialias = interpolate_antialias
104
+ self.interpolate_offset = interpolate_offset
105
+
106
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
107
+ num_patches = self.patch_embed.num_patches
108
+
109
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
110
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
111
+ assert num_register_tokens >= 0
112
+ self.register_tokens = (
113
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
114
+ )
115
+
116
+ if drop_path_uniform is True:
117
+ dpr = [drop_path_rate] * depth
118
+ else:
119
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
120
+
121
+ if ffn_layer == "mlp":
122
+ logger.info("using MLP layer as FFN")
123
+ ffn_layer = Mlp
124
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
125
+ logger.info("using SwiGLU layer as FFN")
126
+ ffn_layer = SwiGLUFFNFused
127
+ elif ffn_layer == "identity":
128
+ logger.info("using Identity layer as FFN")
129
+
130
+ def f(*args, **kwargs):
131
+ return nn.Identity()
132
+
133
+ ffn_layer = f
134
+ else:
135
+ raise NotImplementedError
136
+
137
+ blocks_list = [
138
+ block_fn(
139
+ dim=embed_dim,
140
+ num_heads=num_heads,
141
+ mlp_ratio=mlp_ratio,
142
+ qkv_bias=qkv_bias,
143
+ proj_bias=proj_bias,
144
+ ffn_bias=ffn_bias,
145
+ drop_path=dpr[i],
146
+ norm_layer=norm_layer,
147
+ act_layer=act_layer,
148
+ ffn_layer=ffn_layer,
149
+ init_values=init_values,
150
+ )
151
+ for i in range(depth)
152
+ ]
153
+ if block_chunks > 0:
154
+ self.chunked_blocks = True
155
+ chunked_blocks = []
156
+ chunksize = depth // block_chunks
157
+ for i in range(0, depth, chunksize):
158
+ # this is to keep the block index consistent if we chunk the block list
159
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
160
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
161
+ else:
162
+ self.chunked_blocks = False
163
+ self.blocks = nn.ModuleList(blocks_list)
164
+
165
+ self.norm = norm_layer(embed_dim)
166
+ self.head = nn.Identity()
167
+
168
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
169
+
170
+ self.init_weights()
171
+
172
+ def init_weights(self):
173
+ trunc_normal_(self.pos_embed, std=0.02)
174
+ nn.init.normal_(self.cls_token, std=1e-6)
175
+ if self.register_tokens is not None:
176
+ nn.init.normal_(self.register_tokens, std=1e-6)
177
+ named_apply(init_weights_vit_timm, self)
178
+
179
+ def interpolate_pos_encoding(self, x, w, h):
180
+ previous_dtype = x.dtype
181
+ npatch = x.shape[1] - 1
182
+ N = self.pos_embed.shape[1] - 1
183
+ if npatch == N and w == h:
184
+ return self.pos_embed
185
+ pos_embed = self.pos_embed.float()
186
+ class_pos_embed = pos_embed[:, 0]
187
+ patch_pos_embed = pos_embed[:, 1:]
188
+ dim = x.shape[-1]
189
+ w0 = w // self.patch_size
190
+ h0 = h // self.patch_size
191
+ # we add a small number to avoid floating point error in the interpolation
192
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
193
+ # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
194
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
195
+ # w0, h0 = w0 + 0.1, h0 + 0.1
196
+
197
+ sqrt_N = math.sqrt(N)
198
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
199
+ patch_pos_embed = nn.functional.interpolate(
200
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
201
+ scale_factor=(sx, sy),
202
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
203
+ mode="bicubic",
204
+ antialias=self.interpolate_antialias
205
+ )
206
+
207
+ assert int(w0) == patch_pos_embed.shape[-2]
208
+ assert int(h0) == patch_pos_embed.shape[-1]
209
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
210
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
211
+
212
+ def prepare_tokens_with_masks(self, x, masks=None):
213
+ B, nc, w, h = x.shape
214
+ x = self.patch_embed(x)
215
+ if masks is not None:
216
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
217
+
218
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
219
+ x = x + self.interpolate_pos_encoding(x, w, h)
220
+
221
+ if self.register_tokens is not None:
222
+ x = torch.cat(
223
+ (
224
+ x[:, :1],
225
+ self.register_tokens.expand(x.shape[0], -1, -1),
226
+ x[:, 1:],
227
+ ),
228
+ dim=1,
229
+ )
230
+
231
+ return x
232
+
233
+ def forward_features_list(self, x_list, masks_list):
234
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
235
+ for blk in self.blocks:
236
+ x = blk(x)
237
+
238
+ all_x = x
239
+ output = []
240
+ for x, masks in zip(all_x, masks_list):
241
+ x_norm = self.norm(x)
242
+ output.append(
243
+ {
244
+ "x_norm_clstoken": x_norm[:, 0],
245
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
246
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
247
+ "x_prenorm": x,
248
+ "masks": masks,
249
+ }
250
+ )
251
+ return output
252
+
253
+ def forward_features(self, x, masks=None):
254
+ if isinstance(x, list):
255
+ return self.forward_features_list(x, masks)
256
+
257
+ x = self.prepare_tokens_with_masks(x, masks)
258
+
259
+ for blk in self.blocks:
260
+ x = blk(x)
261
+
262
+ x_norm = self.norm(x)
263
+ return {
264
+ "x_norm_clstoken": x_norm[:, 0],
265
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
266
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
267
+ "x_prenorm": x,
268
+ "masks": masks,
269
+ }
270
+
271
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
272
+ x = self.prepare_tokens_with_masks(x)
273
+ # If n is an int, take the n last blocks. If it's a list, take them
274
+ output, total_block_len = [], len(self.blocks)
275
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
276
+ for i, blk in enumerate(self.blocks):
277
+ x = blk(x)
278
+ if i in blocks_to_take:
279
+ output.append(x)
280
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
281
+ return output
282
+
283
+ def _get_intermediate_layers_chunked(self, x, n=1):
284
+ x = self.prepare_tokens_with_masks(x)
285
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
286
+ # If n is an int, take the n last blocks. If it's a list, take them
287
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
288
+ for block_chunk in self.blocks:
289
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
290
+ x = blk(x)
291
+ if i in blocks_to_take:
292
+ output.append(x)
293
+ i += 1
294
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
295
+ return output
296
+
297
+ def get_intermediate_layers(
298
+ self,
299
+ x: torch.Tensor,
300
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
301
+ reshape: bool = False,
302
+ return_class_token: bool = False,
303
+ norm=True
304
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
305
+ if self.chunked_blocks:
306
+ outputs = self._get_intermediate_layers_chunked(x, n)
307
+ else:
308
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
309
+ if norm:
310
+ outputs = [self.norm(out) for out in outputs]
311
+ class_tokens = [out[:, 0] for out in outputs]
312
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
313
+ if reshape:
314
+ B, _, w, h = x.shape
315
+ outputs = [
316
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
317
+ for out in outputs
318
+ ]
319
+ if return_class_token:
320
+ return tuple(zip(outputs, class_tokens))
321
+ return tuple(outputs)
322
+
323
+ def forward(self, *args, is_training=False, **kwargs):
324
+ ret = self.forward_features(*args, **kwargs)
325
+ if is_training:
326
+ return ret
327
+ else:
328
+ return self.head(ret["x_norm_clstoken"])
329
+
330
+
331
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
332
+ """ViT weight initialization, original timm impl (for reproducibility)"""
333
+ if isinstance(module, nn.Linear):
334
+ trunc_normal_(module.weight, std=0.02)
335
+ if module.bias is not None:
336
+ nn.init.zeros_(module.bias)
337
+
338
+
339
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
340
+ model = DinoVisionTransformer(
341
+ patch_size=patch_size,
342
+ embed_dim=384,
343
+ depth=12,
344
+ num_heads=6,
345
+ mlp_ratio=4,
346
+ block_fn=partial(Block, attn_class=MemEffAttention),
347
+ num_register_tokens=num_register_tokens,
348
+ **kwargs,
349
+ )
350
+ return model
351
+
352
+
353
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
354
+ model = DinoVisionTransformer(
355
+ patch_size=patch_size,
356
+ embed_dim=768,
357
+ depth=12,
358
+ num_heads=12,
359
+ mlp_ratio=4,
360
+ block_fn=partial(Block, attn_class=MemEffAttention),
361
+ num_register_tokens=num_register_tokens,
362
+ **kwargs,
363
+ )
364
+ return model
365
+
366
+
367
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
368
+ model = DinoVisionTransformer(
369
+ patch_size=patch_size,
370
+ embed_dim=1024,
371
+ depth=24,
372
+ num_heads=16,
373
+ mlp_ratio=4,
374
+ block_fn=partial(Block, attn_class=MemEffAttention),
375
+ num_register_tokens=num_register_tokens,
376
+ **kwargs,
377
+ )
378
+ return model
379
+
380
+
381
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
382
+ """
383
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
384
+ """
385
+ model = DinoVisionTransformer(
386
+ patch_size=patch_size,
387
+ embed_dim=1536,
388
+ depth=40,
389
+ num_heads=24,
390
+ mlp_ratio=4,
391
+ block_fn=partial(Block, attn_class=MemEffAttention),
392
+ num_register_tokens=num_register_tokens,
393
+ **kwargs,
394
+ )
395
+ return model
396
+
397
+
398
+ def DINOv2(model_name):
399
+ model_zoo = {
400
+ "vits": vit_small,
401
+ "vitb": vit_base,
402
+ "vitl": vit_large,
403
+ "vitg": vit_giant2
404
+ }
405
+
406
+ return model_zoo[model_name](
407
+ img_size=518,
408
+ patch_size=14,
409
+ init_values=1.0,
410
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
411
+ block_chunks=0,
412
+ num_register_tokens=0,
413
+ interpolate_antialias=False,
414
+ interpolate_offset=0.1
415
+ )
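
The DPT decoder in `depth_anything_v2/dpt.py` (listed above but outside this 50-file view) builds on intermediate ViT features from this backbone. A quick shape-level sketch of that interface, assuming the `depth_anything_v2` package from this commit is importable; with `block_chunks=0` and a 14-pixel patch size, a 518×518 input yields a 37×37 token grid:

```python
import torch
from depth_anything_v2.dinov2 import DINOv2

# ViT-S/14 via the factory above (img_size=518, no register tokens).
# Weights are randomly initialised here; enough to check shapes.
backbone = DINOv2("vits").eval()

x = torch.randn(1, 3, 518, 518)   # height/width must be multiples of 14
with torch.no_grad():
    feats = backbone.get_intermediate_layers(x, n=4, reshape=True)

for f in feats:
    print(f.shape)   # torch.Size([1, 384, 37, 37]) for each of the last 4 blocks
```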
depth_anything_v2/dinov2_layers/__init__.py ADDED
@@ -0,0 +1,11 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from .mlp import Mlp
+ from .patch_embed import PatchEmbed
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+ from .block import NestedTensorBlock
+ from .attention import MemEffAttention
depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (441 Bytes).
 
depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-312.pyc ADDED
Binary file (3.95 kB).
 
depth_anything_v2/dinov2_layers/__pycache__/block.cpython-312.pyc ADDED
Binary file (13.1 kB).
 
depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-312.pyc ADDED
Binary file (1.65 kB).
 
depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-312.pyc ADDED
Binary file (1.42 kB).
 
depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-312.pyc ADDED
Binary file (1.85 kB).
 
depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-312.pyc ADDED
Binary file (4.06 kB).
 
depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-312.pyc ADDED
Binary file (2.84 kB).
 
depth_anything_v2/dinov2_layers/attention.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+
13
+ from torch import Tensor
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
+ try:
21
+ from xformers.ops import memory_efficient_attention, unbind, fmha
22
+
23
+ XFORMERS_AVAILABLE = True
24
+ except ImportError:
25
+ logger.warning("xFormers not available")
26
+ XFORMERS_AVAILABLE = False
27
+
28
+
29
+ class Attention(nn.Module):
30
+ def __init__(
31
+ self,
32
+ dim: int,
33
+ num_heads: int = 8,
34
+ qkv_bias: bool = False,
35
+ proj_bias: bool = True,
36
+ attn_drop: float = 0.0,
37
+ proj_drop: float = 0.0,
38
+ ) -> None:
39
+ super().__init__()
40
+ self.num_heads = num_heads
41
+ head_dim = dim // num_heads
42
+ self.scale = head_dim**-0.5
43
+
44
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
45
+ self.attn_drop = nn.Dropout(attn_drop)
46
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
47
+ self.proj_drop = nn.Dropout(proj_drop)
48
+
49
+ def forward(self, x: Tensor) -> Tensor:
50
+ B, N, C = x.shape
51
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
52
+
53
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
54
+ attn = q @ k.transpose(-2, -1)
55
+
56
+ attn = attn.softmax(dim=-1)
57
+ attn = self.attn_drop(attn)
58
+
59
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
60
+ x = self.proj(x)
61
+ x = self.proj_drop(x)
62
+ return x
63
+
64
+
65
+ class MemEffAttention(Attention):
66
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
67
+ if not XFORMERS_AVAILABLE:
68
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
69
+ return super().forward(x)
70
+
71
+ B, N, C = x.shape
72
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
73
+
74
+ q, k, v = unbind(qkv, 2)
75
+
76
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
77
+ x = x.reshape([B, N, C])
78
+
79
+ x = self.proj(x)
80
+ x = self.proj_drop(x)
81
+ return x
82
+
83
+
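
`MemEffAttention` only swaps in the xFormers kernel; without xFormers it asserts that no `attn_bias` is passed and falls back to the plain `Attention.forward` above. A small shape check, assuming the package path used elsewhere in this commit:

```python
import torch
from depth_anything_v2.dinov2_layers.attention import Attention, MemEffAttention

tokens = torch.randn(2, 197, 384)   # (batch, sequence length, embed_dim)

attn = Attention(dim=384, num_heads=6, qkv_bias=True).eval()
mem_attn = MemEffAttention(dim=384, num_heads=6, qkv_bias=True).eval()

with torch.no_grad():
    out = attn(tokens)
    out_mem = mem_attn(tokens)      # xFormers kernel if installed, else the code path above

print(out.shape, out_mem.shape)     # torch.Size([2, 197, 384]) in both cases
```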
depth_anything_v2/dinov2_layers/block.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ import logging
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+
14
+ import torch
15
+ from torch import nn, Tensor
16
+
17
+ from .attention import Attention, MemEffAttention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ try:
27
+ from xformers.ops import fmha
28
+ from xformers.ops import scaled_index_add, index_select_cat
29
+
30
+ XFORMERS_AVAILABLE = True
31
+ except ImportError:
32
+ logger.warning("xFormers not available")
33
+ XFORMERS_AVAILABLE = False
34
+
35
+
36
+ class Block(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int,
41
+ mlp_ratio: float = 4.0,
42
+ qkv_bias: bool = False,
43
+ proj_bias: bool = True,
44
+ ffn_bias: bool = True,
45
+ drop: float = 0.0,
46
+ attn_drop: float = 0.0,
47
+ init_values=None,
48
+ drop_path: float = 0.0,
49
+ act_layer: Callable[..., nn.Module] = nn.GELU,
50
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
51
+ attn_class: Callable[..., nn.Module] = Attention,
52
+ ffn_layer: Callable[..., nn.Module] = Mlp,
53
+ ) -> None:
54
+ super().__init__()
55
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
56
+ self.norm1 = norm_layer(dim)
57
+ self.attn = attn_class(
58
+ dim,
59
+ num_heads=num_heads,
60
+ qkv_bias=qkv_bias,
61
+ proj_bias=proj_bias,
62
+ attn_drop=attn_drop,
63
+ proj_drop=drop,
64
+ )
65
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
66
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
67
+
68
+ self.norm2 = norm_layer(dim)
69
+ mlp_hidden_dim = int(dim * mlp_ratio)
70
+ self.mlp = ffn_layer(
71
+ in_features=dim,
72
+ hidden_features=mlp_hidden_dim,
73
+ act_layer=act_layer,
74
+ drop=drop,
75
+ bias=ffn_bias,
76
+ )
77
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
78
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
79
+
80
+ self.sample_drop_ratio = drop_path
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ def attn_residual_func(x: Tensor) -> Tensor:
84
+ return self.ls1(self.attn(self.norm1(x)))
85
+
86
+ def ffn_residual_func(x: Tensor) -> Tensor:
87
+ return self.ls2(self.mlp(self.norm2(x)))
88
+
89
+ if self.training and self.sample_drop_ratio > 0.1:
90
+ # the overhead is compensated only for a drop path rate larger than 0.1
91
+ x = drop_add_residual_stochastic_depth(
92
+ x,
93
+ residual_func=attn_residual_func,
94
+ sample_drop_ratio=self.sample_drop_ratio,
95
+ )
96
+ x = drop_add_residual_stochastic_depth(
97
+ x,
98
+ residual_func=ffn_residual_func,
99
+ sample_drop_ratio=self.sample_drop_ratio,
100
+ )
101
+ elif self.training and self.sample_drop_ratio > 0.0:
102
+ x = x + self.drop_path1(attn_residual_func(x))
103
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
104
+ else:
105
+ x = x + attn_residual_func(x)
106
+ x = x + ffn_residual_func(x)
107
+ return x
108
+
109
+
110
+ def drop_add_residual_stochastic_depth(
111
+ x: Tensor,
112
+ residual_func: Callable[[Tensor], Tensor],
113
+ sample_drop_ratio: float = 0.0,
114
+ ) -> Tensor:
115
+ # 1) extract subset using permutation
116
+ b, n, d = x.shape
117
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
118
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
119
+ x_subset = x[brange]
120
+
121
+ # 2) apply residual_func to get residual
122
+ residual = residual_func(x_subset)
123
+
124
+ x_flat = x.flatten(1)
125
+ residual = residual.flatten(1)
126
+
127
+ residual_scale_factor = b / sample_subset_size
128
+
129
+ # 3) add the residual
130
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
131
+ return x_plus_residual.view_as(x)
132
+
133
+
134
+ def get_branges_scales(x, sample_drop_ratio=0.0):
135
+ b, n, d = x.shape
136
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
137
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
138
+ residual_scale_factor = b / sample_subset_size
139
+ return brange, residual_scale_factor
140
+
141
+
142
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
143
+ if scaling_vector is None:
144
+ x_flat = x.flatten(1)
145
+ residual = residual.flatten(1)
146
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
147
+ else:
148
+ x_plus_residual = scaled_index_add(
149
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
150
+ )
151
+ return x_plus_residual
152
+
153
+
154
+ attn_bias_cache: Dict[Tuple, Any] = {}
155
+
156
+
157
+ def get_attn_bias_and_cat(x_list, branges=None):
158
+ """
159
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
160
+ """
161
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
162
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
163
+ if all_shapes not in attn_bias_cache.keys():
164
+ seqlens = []
165
+ for b, x in zip(batch_sizes, x_list):
166
+ for _ in range(b):
167
+ seqlens.append(x.shape[1])
168
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
169
+ attn_bias._batch_sizes = batch_sizes
170
+ attn_bias_cache[all_shapes] = attn_bias
171
+
172
+ if branges is not None:
173
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
174
+ else:
175
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
176
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
177
+
178
+ return attn_bias_cache[all_shapes], cat_tensors
179
+
180
+
181
+ def drop_add_residual_stochastic_depth_list(
182
+ x_list: List[Tensor],
183
+ residual_func: Callable[[Tensor, Any], Tensor],
184
+ sample_drop_ratio: float = 0.0,
185
+ scaling_vector=None,
186
+ ) -> Tensor:
187
+ # 1) generate random set of indices for dropping samples in the batch
188
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
189
+ branges = [s[0] for s in branges_scales]
190
+ residual_scale_factors = [s[1] for s in branges_scales]
191
+
192
+ # 2) get attention bias and index+concat the tensors
193
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
194
+
195
+ # 3) apply residual_func to get residual, and split the result
196
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
197
+
198
+ outputs = []
199
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
200
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
201
+ return outputs
202
+
203
+
204
+ class NestedTensorBlock(Block):
205
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
206
+ """
207
+ x_list contains a list of tensors to nest together and run
208
+ """
209
+ assert isinstance(self.attn, MemEffAttention)
210
+
211
+ if self.training and self.sample_drop_ratio > 0.0:
212
+
213
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
214
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
215
+
216
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
217
+ return self.mlp(self.norm2(x))
218
+
219
+ x_list = drop_add_residual_stochastic_depth_list(
220
+ x_list,
221
+ residual_func=attn_residual_func,
222
+ sample_drop_ratio=self.sample_drop_ratio,
223
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
224
+ )
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=ffn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
230
+ )
231
+ return x_list
232
+ else:
233
+
234
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
235
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
236
+
237
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
238
+ return self.ls2(self.mlp(self.norm2(x)))
239
+
240
+ attn_bias, x = get_attn_bias_and_cat(x_list)
241
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
242
+ x = x + ffn_residual_func(x)
243
+ return attn_bias.split(x)
244
+
245
+ def forward(self, x_or_x_list):
246
+ if isinstance(x_or_x_list, Tensor):
247
+ return super().forward(x_or_x_list)
248
+ elif isinstance(x_or_x_list, list):
249
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
250
+ return self.forward_nested(x_or_x_list)
251
+ else:
252
+ raise AssertionError
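
The helpers above implement stochastic depth at the batch level: `drop_add_residual_stochastic_depth` runs the residual branch on a random subset of the batch and rescales the result by `b / sample_subset_size` so the expected update matches the full-batch residual. A minimal standalone sketch of that idea (illustrative only, with a toy residual function standing in for the attention/FFN branches):

```python
import torch

def toy_drop_add_residual(x, residual_func, sample_drop_ratio=0.5):
    # Pick a random subset of the batch, run the branch only on that subset,
    # and rescale so the expected update equals the full-batch residual.
    b = x.shape[0]
    keep = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = torch.randperm(b, device=x.device)[:keep]
    residual = residual_func(x[brange])
    scale = b / keep
    out = torch.index_add(x.flatten(1), 0, brange,
                          residual.flatten(1), alpha=scale)
    return out.view_as(x)

x = torch.randn(8, 4, 16)
out = toy_drop_add_residual(x, lambda t: 0.1 * t)
print(out.shape)  # torch.Size([8, 4, 16])
```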
depth_anything_v2/dinov2_layers/drop_path.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10
+
11
+
12
+ from torch import nn
13
+
14
+
15
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
16
+ if drop_prob == 0.0 or not training:
17
+ return x
18
+ keep_prob = 1 - drop_prob
19
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
20
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
21
+ if keep_prob > 0.0:
22
+ random_tensor.div_(keep_prob)
23
+ output = x * random_tensor
24
+ return output
25
+
26
+
27
+ class DropPath(nn.Module):
28
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
29
+
30
+ def __init__(self, drop_prob=None):
31
+ super(DropPath, self).__init__()
32
+ self.drop_prob = drop_prob
33
+
34
+ def forward(self, x):
35
+ return drop_path(x, self.drop_prob, self.training)
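
A quick way to see what `DropPath` does (an illustrative check, assuming the package layout of this upload): in eval mode it is the identity, while in training mode whole samples are zeroed and the survivors are scaled by `1 / keep_prob`, so the expected activation is unchanged.

```python
import torch
from depth_anything_v2.dinov2_layers.drop_path import DropPath

dp = DropPath(drop_prob=0.5)
x = torch.ones(6, 3, 8)

dp.eval()
assert torch.equal(dp(x), x)          # identity at inference time

dp.train()
y = dp(x)
# Each sample is either all zeros or scaled by 1 / keep_prob = 2.0
print(y.flatten(1).sum(dim=1))
```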
depth_anything_v2/dinov2_layers/layer_scale.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
8
+
9
+ from typing import Union
10
+
11
+ import torch
12
+ from torch import Tensor
13
+ from torch import nn
14
+
15
+
16
+ class LayerScale(nn.Module):
17
+ def __init__(
18
+ self,
19
+ dim: int,
20
+ init_values: Union[float, Tensor] = 1e-5,
21
+ inplace: bool = False,
22
+ ) -> None:
23
+ super().__init__()
24
+ self.inplace = inplace
25
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
26
+
27
+ def forward(self, x: Tensor) -> Tensor:
28
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
depth_anything_v2/dinov2_layers/mlp.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10
+
11
+
12
+ from typing import Callable, Optional
13
+
14
+ from torch import Tensor, nn
15
+
16
+
17
+ class Mlp(nn.Module):
18
+ def __init__(
19
+ self,
20
+ in_features: int,
21
+ hidden_features: Optional[int] = None,
22
+ out_features: Optional[int] = None,
23
+ act_layer: Callable[..., nn.Module] = nn.GELU,
24
+ drop: float = 0.0,
25
+ bias: bool = True,
26
+ ) -> None:
27
+ super().__init__()
28
+ out_features = out_features or in_features
29
+ hidden_features = hidden_features or in_features
30
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
31
+ self.act = act_layer()
32
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
33
+ self.drop = nn.Dropout(drop)
34
+
35
+ def forward(self, x: Tensor) -> Tensor:
36
+ x = self.fc1(x)
37
+ x = self.act(x)
38
+ x = self.drop(x)
39
+ x = self.fc2(x)
40
+ x = self.drop(x)
41
+ return x
depth_anything_v2/dinov2_layers/patch_embed.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ from typing import Callable, Optional, Tuple, Union
12
+
13
+ from torch import Tensor
14
+ import torch.nn as nn
15
+
16
+
17
+ def make_2tuple(x):
18
+ if isinstance(x, tuple):
19
+ assert len(x) == 2
20
+ return x
21
+
22
+ assert isinstance(x, int)
23
+ return (x, x)
24
+
25
+
26
+ class PatchEmbed(nn.Module):
27
+ """
28
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
29
+
30
+ Args:
31
+ img_size: Image size.
32
+ patch_size: Patch token size.
33
+ in_chans: Number of input image channels.
34
+ embed_dim: Number of linear projection output channels.
35
+ norm_layer: Normalization layer.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ img_size: Union[int, Tuple[int, int]] = 224,
41
+ patch_size: Union[int, Tuple[int, int]] = 16,
42
+ in_chans: int = 3,
43
+ embed_dim: int = 768,
44
+ norm_layer: Optional[Callable] = None,
45
+ flatten_embedding: bool = True,
46
+ ) -> None:
47
+ super().__init__()
48
+
49
+ image_HW = make_2tuple(img_size)
50
+ patch_HW = make_2tuple(patch_size)
51
+ patch_grid_size = (
52
+ image_HW[0] // patch_HW[0],
53
+ image_HW[1] // patch_HW[1],
54
+ )
55
+
56
+ self.img_size = image_HW
57
+ self.patch_size = patch_HW
58
+ self.patches_resolution = patch_grid_size
59
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
60
+
61
+ self.in_chans = in_chans
62
+ self.embed_dim = embed_dim
63
+
64
+ self.flatten_embedding = flatten_embedding
65
+
66
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
67
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
68
+
69
+ def forward(self, x: Tensor) -> Tensor:
70
+ _, _, H, W = x.shape
71
+ patch_H, patch_W = self.patch_size
72
+
73
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
74
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
75
+
76
+ x = self.proj(x) # B C H W
77
+ H, W = x.size(2), x.size(3)
78
+ x = x.flatten(2).transpose(1, 2) # B HW C
79
+ x = self.norm(x)
80
+ if not self.flatten_embedding:
81
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
82
+ return x
83
+
84
+ def flops(self) -> float:
85
+ Ho, Wo = self.patches_resolution
86
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
87
+ if self.norm is not None:
88
+ flops += Ho * Wo * self.embed_dim
89
+ return flops
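
To make the `(B, C, H, W) -> (B, N, D)` contract concrete, a small shape check against `PatchEmbed` (illustrative; the import path assumes the layout of this upload):

```python
import torch
from depth_anything_v2.dinov2_layers.patch_embed import PatchEmbed

pe = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
x = torch.randn(2, 3, 224, 224)
tokens = pe(x)
print(tokens.shape)    # torch.Size([2, 196, 768]): (224 / 16) ** 2 = 196 patches
print(pe.num_patches)  # 196
```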
depth_anything_v2/dinov2_layers/swiglu_ffn.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Callable, Optional
8
+
9
+ from torch import Tensor, nn
10
+ import torch.nn.functional as F
11
+
12
+
13
+ class SwiGLUFFN(nn.Module):
14
+ def __init__(
15
+ self,
16
+ in_features: int,
17
+ hidden_features: Optional[int] = None,
18
+ out_features: Optional[int] = None,
19
+ act_layer: Callable[..., nn.Module] = None,
20
+ drop: float = 0.0,
21
+ bias: bool = True,
22
+ ) -> None:
23
+ super().__init__()
24
+ out_features = out_features or in_features
25
+ hidden_features = hidden_features or in_features
26
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
27
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
28
+
29
+ def forward(self, x: Tensor) -> Tensor:
30
+ x12 = self.w12(x)
31
+ x1, x2 = x12.chunk(2, dim=-1)
32
+ hidden = F.silu(x1) * x2
33
+ return self.w3(hidden)
34
+
35
+
36
+ try:
37
+ from xformers.ops import SwiGLU
38
+
39
+ XFORMERS_AVAILABLE = True
40
+ except ImportError:
41
+ SwiGLU = SwiGLUFFN
42
+ XFORMERS_AVAILABLE = False
43
+
44
+
45
+ class SwiGLUFFNFused(SwiGLU):
46
+ def __init__(
47
+ self,
48
+ in_features: int,
49
+ hidden_features: Optional[int] = None,
50
+ out_features: Optional[int] = None,
51
+ act_layer: Callable[..., nn.Module] = None,
52
+ drop: float = 0.0,
53
+ bias: bool = True,
54
+ ) -> None:
55
+ out_features = out_features or in_features
56
+ hidden_features = hidden_features or in_features
57
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
58
+ super().__init__(
59
+ in_features=in_features,
60
+ hidden_features=hidden_features,
61
+ out_features=out_features,
62
+ bias=bias,
63
+ )
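
`SwiGLUFFNFused` shrinks the requested hidden width to roughly two thirds (keeping the gated FFN's parameter count comparable to a plain MLP with the same `mlp_ratio`) and rounds it up to a multiple of 8. A short sketch of that arithmetic and of the forward shape, using the pure-PyTorch `SwiGLUFFN` fallback so it runs without xFormers:

```python
import torch
from depth_anything_v2.dinov2_layers.swiglu_ffn import SwiGLUFFN

# Hidden-width rule used by SwiGLUFFNFused: 4 * 1024 = 4096 -> * 2/3 -> 2730
# -> rounded up to the next multiple of 8 = 2736
print((int(4 * 1024 * 2 / 3) + 7) // 8 * 8)  # 2736

ffn = SwiGLUFFN(in_features=1024, hidden_features=2736)
x = torch.randn(2, 7, 1024)
print(ffn(x).shape)  # torch.Size([2, 7, 1024])
```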
depth_anything_v2/dpt.py ADDED
@@ -0,0 +1,221 @@
1
+ import cv2
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torchvision.transforms import Compose
6
+
7
+ from .dinov2 import DINOv2
8
+ from .util.blocks import FeatureFusionBlock, _make_scratch
9
+ from .util.transform import Resize, NormalizeImage, PrepareForNet
10
+
11
+
12
+ def _make_fusion_block(features, use_bn, size=None):
13
+ return FeatureFusionBlock(
14
+ features,
15
+ nn.ReLU(False),
16
+ deconv=False,
17
+ bn=use_bn,
18
+ expand=False,
19
+ align_corners=True,
20
+ size=size,
21
+ )
22
+
23
+
24
+ class ConvBlock(nn.Module):
25
+ def __init__(self, in_feature, out_feature):
26
+ super().__init__()
27
+
28
+ self.conv_block = nn.Sequential(
29
+ nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1),
30
+ nn.BatchNorm2d(out_feature),
31
+ nn.ReLU(True)
32
+ )
33
+
34
+ def forward(self, x):
35
+ return self.conv_block(x)
36
+
37
+
38
+ class DPTHead(nn.Module):
39
+ def __init__(
40
+ self,
41
+ in_channels,
42
+ features=256,
43
+ use_bn=False,
44
+ out_channels=[256, 512, 1024, 1024],
45
+ use_clstoken=False
46
+ ):
47
+ super(DPTHead, self).__init__()
48
+
49
+ self.use_clstoken = use_clstoken
50
+
51
+ self.projects = nn.ModuleList([
52
+ nn.Conv2d(
53
+ in_channels=in_channels,
54
+ out_channels=out_channel,
55
+ kernel_size=1,
56
+ stride=1,
57
+ padding=0,
58
+ ) for out_channel in out_channels
59
+ ])
60
+
61
+ self.resize_layers = nn.ModuleList([
62
+ nn.ConvTranspose2d(
63
+ in_channels=out_channels[0],
64
+ out_channels=out_channels[0],
65
+ kernel_size=4,
66
+ stride=4,
67
+ padding=0),
68
+ nn.ConvTranspose2d(
69
+ in_channels=out_channels[1],
70
+ out_channels=out_channels[1],
71
+ kernel_size=2,
72
+ stride=2,
73
+ padding=0),
74
+ nn.Identity(),
75
+ nn.Conv2d(
76
+ in_channels=out_channels[3],
77
+ out_channels=out_channels[3],
78
+ kernel_size=3,
79
+ stride=2,
80
+ padding=1)
81
+ ])
82
+
83
+ if use_clstoken:
84
+ self.readout_projects = nn.ModuleList()
85
+ for _ in range(len(self.projects)):
86
+ self.readout_projects.append(
87
+ nn.Sequential(
88
+ nn.Linear(2 * in_channels, in_channels),
89
+ nn.GELU()))
90
+
91
+ self.scratch = _make_scratch(
92
+ out_channels,
93
+ features,
94
+ groups=1,
95
+ expand=False,
96
+ )
97
+
98
+ self.scratch.stem_transpose = None
99
+
100
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
101
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
102
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
103
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
104
+
105
+ head_features_1 = features
106
+ head_features_2 = 32
107
+
108
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
109
+ self.scratch.output_conv2 = nn.Sequential(
110
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
111
+ nn.ReLU(True),
112
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
113
+ nn.ReLU(True),
114
+ nn.Identity(),
115
+ )
116
+
117
+ def forward(self, out_features, patch_h, patch_w):
118
+ out = []
119
+ for i, x in enumerate(out_features):
120
+ if self.use_clstoken:
121
+ x, cls_token = x[0], x[1]
122
+ readout = cls_token.unsqueeze(1).expand_as(x)
123
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
124
+ else:
125
+ x = x[0]
126
+
127
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
128
+
129
+ x = self.projects[i](x)
130
+ x = self.resize_layers[i](x)
131
+
132
+ out.append(x)
133
+
134
+ layer_1, layer_2, layer_3, layer_4 = out
135
+
136
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
137
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
138
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
139
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
140
+
141
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
142
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
143
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
144
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
145
+
146
+ out = self.scratch.output_conv1(path_1)
147
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
148
+ out = self.scratch.output_conv2(out)
149
+
150
+ return out
151
+
152
+
153
+ class DepthAnythingV2(nn.Module):
154
+ def __init__(
155
+ self,
156
+ encoder='vitl',
157
+ features=256,
158
+ out_channels=[256, 512, 1024, 1024],
159
+ use_bn=False,
160
+ use_clstoken=False
161
+ ):
162
+ super(DepthAnythingV2, self).__init__()
163
+
164
+ self.intermediate_layer_idx = {
165
+ 'vits': [2, 5, 8, 11],
166
+ 'vitb': [2, 5, 8, 11],
167
+ 'vitl': [4, 11, 17, 23],
168
+ 'vitg': [9, 19, 29, 39]
169
+ }
170
+
171
+ self.encoder = encoder
172
+ self.pretrained = DINOv2(model_name=encoder)
173
+
174
+ self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)
175
+
176
+ def forward(self, x):
177
+ patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14
178
+
179
+ features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder], return_class_token=True)
180
+
181
+ depth = self.depth_head(features, patch_h, patch_w)
182
+ depth = F.relu(depth)
183
+
184
+ return depth.squeeze(1)
185
+
186
+ @torch.no_grad()
187
+ def infer_image(self, raw_image, input_size=518):
188
+ image, (h, w) = self.image2tensor(raw_image, input_size)
189
+
190
+ depth = self.forward(image)
191
+
192
+ depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]
193
+
194
+ return depth.cpu().numpy()
195
+
196
+ def image2tensor(self, raw_image, input_size=518):
197
+ transform = Compose([
198
+ Resize(
199
+ width=input_size,
200
+ height=input_size,
201
+ resize_target=False,
202
+ keep_aspect_ratio=True,
203
+ ensure_multiple_of=14,
204
+ resize_method='lower_bound',
205
+ image_interpolation_method=cv2.INTER_CUBIC,
206
+ ),
207
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
208
+ PrepareForNet(),
209
+ ])
210
+
211
+ h, w = raw_image.shape[:2]
212
+
213
+ image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
214
+
215
+ image = transform({'image': image})['image']
216
+ image = torch.from_numpy(image).unsqueeze(0)
217
+
218
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
219
+ image = image.to(DEVICE)
220
+
221
+ return image, (h, w)
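
Putting the pieces together, `DepthAnythingV2.infer_image` takes a BGR `numpy` image (as read by `cv2.imread`) and returns an HxW float depth map. A hedged usage sketch, mirroring the loading code in `main_app.py` further down; the checkpoint path and image file name are assumptions:

```python
import cv2
import torch
from depth_anything_v2.dpt import DepthAnythingV2

DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

# 'vitl' configuration as used in main_app.py; the .pth path is an assumption.
model = DepthAnythingV2(encoder='vitl', features=256,
                        out_channels=[256, 512, 1024, 1024])
model.load_state_dict(torch.load('checkpoints/depth_anything_v2_vitl.pth',
                                 map_location='cpu'))
model = model.to(DEVICE).eval()

raw_image = cv2.imread('example.jpg')   # BGR, as expected by infer_image
depth = model.infer_image(raw_image)    # float32 HxW relative depth map
print(depth.shape, float(depth.min()), float(depth.max()))
```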
depth_anything_v2/util/__pycache__/blocks.cpython-312.pyc ADDED
Binary file (5.55 kB).
 
depth_anything_v2/util/__pycache__/transform.cpython-312.pyc ADDED
Binary file (7.45 kB).
 
depth_anything_v2/util/blocks.py ADDED
@@ -0,0 +1,148 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
5
+ scratch = nn.Module()
6
+
7
+ out_shape1 = out_shape
8
+ out_shape2 = out_shape
9
+ out_shape3 = out_shape
10
+ if len(in_shape) >= 4:
11
+ out_shape4 = out_shape
12
+
13
+ if expand:
14
+ out_shape1 = out_shape
15
+ out_shape2 = out_shape * 2
16
+ out_shape3 = out_shape * 4
17
+ if len(in_shape) >= 4:
18
+ out_shape4 = out_shape * 8
19
+
20
+ scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
21
+ scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
22
+ scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
23
+ if len(in_shape) >= 4:
24
+ scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
25
+
26
+ return scratch
27
+
28
+
29
+ class ResidualConvUnit(nn.Module):
30
+ """Residual convolution module.
31
+ """
32
+
33
+ def __init__(self, features, activation, bn):
34
+ """Init.
35
+
36
+ Args:
37
+ features (int): number of features
38
+ """
39
+ super().__init__()
40
+
41
+ self.bn = bn
42
+
43
+ self.groups=1
44
+
45
+ self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
46
+
47
+ self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
48
+
49
+ if self.bn == True:
50
+ self.bn1 = nn.BatchNorm2d(features)
51
+ self.bn2 = nn.BatchNorm2d(features)
52
+
53
+ self.activation = activation
54
+
55
+ self.skip_add = nn.quantized.FloatFunctional()
56
+
57
+ def forward(self, x):
58
+ """Forward pass.
59
+
60
+ Args:
61
+ x (tensor): input
62
+
63
+ Returns:
64
+ tensor: output
65
+ """
66
+
67
+ out = self.activation(x)
68
+ out = self.conv1(out)
69
+ if self.bn == True:
70
+ out = self.bn1(out)
71
+
72
+ out = self.activation(out)
73
+ out = self.conv2(out)
74
+ if self.bn == True:
75
+ out = self.bn2(out)
76
+
77
+ if self.groups > 1:
78
+ out = self.conv_merge(out)
79
+
80
+ return self.skip_add.add(out, x)
81
+
82
+
83
+ class FeatureFusionBlock(nn.Module):
84
+ """Feature fusion block.
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ features,
90
+ activation,
91
+ deconv=False,
92
+ bn=False,
93
+ expand=False,
94
+ align_corners=True,
95
+ size=None
96
+ ):
97
+ """Init.
98
+
99
+ Args:
100
+ features (int): number of features
101
+ """
102
+ super(FeatureFusionBlock, self).__init__()
103
+
104
+ self.deconv = deconv
105
+ self.align_corners = align_corners
106
+
107
+ self.groups=1
108
+
109
+ self.expand = expand
110
+ out_features = features
111
+ if self.expand == True:
112
+ out_features = features // 2
113
+
114
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
115
+
116
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
117
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
118
+
119
+ self.skip_add = nn.quantized.FloatFunctional()
120
+
121
+ self.size=size
122
+
123
+ def forward(self, *xs, size=None):
124
+ """Forward pass.
125
+
126
+ Returns:
127
+ tensor: output
128
+ """
129
+ output = xs[0]
130
+
131
+ if len(xs) == 2:
132
+ res = self.resConfUnit1(xs[1])
133
+ output = self.skip_add.add(output, res)
134
+
135
+ output = self.resConfUnit2(output)
136
+
137
+ if (size is None) and (self.size is None):
138
+ modifier = {"scale_factor": 2}
139
+ elif size is None:
140
+ modifier = {"size": self.size}
141
+ else:
142
+ modifier = {"size": size}
143
+
144
+ output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
145
+
146
+ output = self.out_conv(output)
147
+
148
+ return output
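
A small shape sketch of how these blocks are wired by `DPTHead` above: `_make_scratch` maps each backbone stage to a common channel width, and a `FeatureFusionBlock` adds two same-resolution feature maps and upsamples the result (by 2x when no explicit `size` is passed). Illustrative only:

```python
import torch
import torch.nn as nn
from depth_anything_v2.util.blocks import FeatureFusionBlock, _make_scratch

scratch = _make_scratch([256, 512, 1024, 1024], 256, groups=1, expand=False)
print(scratch.layer1_rn)  # Conv2d(256, 256, kernel_size=(3, 3), ...)

fuse = FeatureFusionBlock(256, nn.ReLU(False), deconv=False, bn=False,
                          expand=False, align_corners=True)
deep = torch.randn(1, 256, 32, 32)  # coarser decoder path (already resized)
skip = torch.randn(1, 256, 32, 32)  # projected backbone feature at the same size
out = fuse(deep, skip)
print(out.shape)  # torch.Size([1, 256, 64, 64]) -- fused, then upsampled x2
```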
depth_anything_v2/util/transform.py ADDED
@@ -0,0 +1,158 @@
1
+ import numpy as np
2
+ import cv2
3
+
4
+
5
+ class Resize(object):
6
+ """Resize sample to given size (width, height).
7
+ """
8
+
9
+ def __init__(
10
+ self,
11
+ width,
12
+ height,
13
+ resize_target=True,
14
+ keep_aspect_ratio=False,
15
+ ensure_multiple_of=1,
16
+ resize_method="lower_bound",
17
+ image_interpolation_method=cv2.INTER_AREA,
18
+ ):
19
+ """Init.
20
+
21
+ Args:
22
+ width (int): desired output width
23
+ height (int): desired output height
24
+ resize_target (bool, optional):
25
+ True: Resize the full sample (image, mask, target).
26
+ False: Resize image only.
27
+ Defaults to True.
28
+ keep_aspect_ratio (bool, optional):
29
+ True: Keep the aspect ratio of the input sample.
30
+ Output sample might not have the given width and height, and
31
+ resize behaviour depends on the parameter 'resize_method'.
32
+ Defaults to False.
33
+ ensure_multiple_of (int, optional):
34
+ Output width and height is constrained to be multiple of this parameter.
35
+ Defaults to 1.
36
+ resize_method (str, optional):
37
+ "lower_bound": Output will be at least as large as the given size.
38
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
39
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
40
+ Defaults to "lower_bound".
41
+ """
42
+ self.__width = width
43
+ self.__height = height
44
+
45
+ self.__resize_target = resize_target
46
+ self.__keep_aspect_ratio = keep_aspect_ratio
47
+ self.__multiple_of = ensure_multiple_of
48
+ self.__resize_method = resize_method
49
+ self.__image_interpolation_method = image_interpolation_method
50
+
51
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
52
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
53
+
54
+ if max_val is not None and y > max_val:
55
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
56
+
57
+ if y < min_val:
58
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
59
+
60
+ return y
61
+
62
+ def get_size(self, width, height):
63
+ # determine new height and width
64
+ scale_height = self.__height / height
65
+ scale_width = self.__width / width
66
+
67
+ if self.__keep_aspect_ratio:
68
+ if self.__resize_method == "lower_bound":
69
+ # scale such that output size is lower bound
70
+ if scale_width > scale_height:
71
+ # fit width
72
+ scale_height = scale_width
73
+ else:
74
+ # fit height
75
+ scale_width = scale_height
76
+ elif self.__resize_method == "upper_bound":
77
+ # scale such that output size is upper bound
78
+ if scale_width < scale_height:
79
+ # fit width
80
+ scale_height = scale_width
81
+ else:
82
+ # fit height
83
+ scale_width = scale_height
84
+ elif self.__resize_method == "minimal":
85
+ # scale as little as possible
86
+ if abs(1 - scale_width) < abs(1 - scale_height):
87
+ # fit width
88
+ scale_height = scale_width
89
+ else:
90
+ # fit height
91
+ scale_width = scale_height
92
+ else:
93
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
94
+
95
+ if self.__resize_method == "lower_bound":
96
+ new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
97
+ new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
98
+ elif self.__resize_method == "upper_bound":
99
+ new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
100
+ new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
101
+ elif self.__resize_method == "minimal":
102
+ new_height = self.constrain_to_multiple_of(scale_height * height)
103
+ new_width = self.constrain_to_multiple_of(scale_width * width)
104
+ else:
105
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
106
+
107
+ return (new_width, new_height)
108
+
109
+ def __call__(self, sample):
110
+ width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])
111
+
112
+ # resize sample
113
+ sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method)
114
+
115
+ if self.__resize_target:
116
+ if "depth" in sample:
117
+ sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)
118
+
119
+ if "mask" in sample:
120
+ sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST)
121
+
122
+ return sample
123
+
124
+
125
+ class NormalizeImage(object):
126
+ """Normlize image by given mean and std.
127
+ """
128
+
129
+ def __init__(self, mean, std):
130
+ self.__mean = mean
131
+ self.__std = std
132
+
133
+ def __call__(self, sample):
134
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
135
+
136
+ return sample
137
+
138
+
139
+ class PrepareForNet(object):
140
+ """Prepare sample for usage as network input.
141
+ """
142
+
143
+ def __init__(self):
144
+ pass
145
+
146
+ def __call__(self, sample):
147
+ image = np.transpose(sample["image"], (2, 0, 1))
148
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
149
+
150
+ if "depth" in sample:
151
+ depth = sample["depth"].astype(np.float32)
152
+ sample["depth"] = np.ascontiguousarray(depth)
153
+
154
+ if "mask" in sample:
155
+ sample["mask"] = sample["mask"].astype(np.float32)
156
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
157
+
158
+ return sample
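
These three transforms are composed exactly as in `DepthAnythingV2.image2tensor`: keep the aspect ratio, make both sides at least `input_size` and a multiple of 14 (the ViT patch size), normalize with ImageNet statistics, and move channels first. A small sketch with a random stand-in image (already scaled to [0, 1]):

```python
import cv2
import numpy as np
from torchvision.transforms import Compose
from depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet

transform = Compose([
    Resize(width=518, height=518, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=14, resize_method='lower_bound',
           image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

rgb = np.random.rand(480, 640, 3).astype(np.float32)  # stand-in for image / 255.0
out = transform({'image': rgb})['image']
print(out.shape)  # (3, 518, 686): short side scaled up to 518, both sides rounded to a multiple of 14
```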
environment.yml ADDED
Binary file (6.93 kB).
 
environment_export.yml ADDED
@@ -0,0 +1,182 @@
1
+ name: depth_copy
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - _libgcc_mutex=0.1=conda_forge
7
+ - _openmp_mutex=4.5=2_gnu
8
+ - bzip2=1.0.8=h4bc722e_7
9
+ - ca-certificates=2025.6.15=hbd8a1cb_0
10
+ - ld_impl_linux-64=2.44=h1423503_0
11
+ - libexpat=2.7.0=h5888daf_0
12
+ - libffi=3.4.6=h2dba641_1
13
+ - libgcc=15.1.0=h767d61c_3
14
+ - libgcc-ng=15.1.0=h69a702a_3
15
+ - libgomp=15.1.0=h767d61c_3
16
+ - liblzma=5.8.1=hb9d3cd8_2
17
+ - libnsl=2.0.1=hb9d3cd8_1
18
+ - libsqlite=3.50.2=h6cd9bfd_0
19
+ - libuuid=2.38.1=h0b41bf4_0
20
+ - libxcrypt=4.4.36=hd590300_1
21
+ - libzlib=1.3.1=hb9d3cd8_2
22
+ - ncurses=6.5=h2d0b736_3
23
+ - openssl=3.5.1=h7b32b05_0
24
+ - pip=25.1.1=pyh8b19718_0
25
+ - python=3.12.11=h9e4cc4f_0_cpython
26
+ - readline=8.2=h8c095d6_2
27
+ - setuptools=80.9.0=pyhff2d567_0
28
+ - tk=8.6.13=noxft_hd72426e_102
29
+ - wheel=0.45.1=pyhd8ed1ab_1
30
+ - pip:
31
+ - absl-py==2.3.1
32
+ - addict==2.4.0
33
+ - aiofiles==24.1.0
34
+ - annotated-types==0.7.0
35
+ - anyio==4.9.0
36
+ - asttokens==3.0.0
37
+ - astunparse==1.6.3
38
+ - attrs==25.3.0
39
+ - blinker==1.9.0
40
+ - certifi==2025.6.15
41
+ - charset-normalizer==3.4.2
42
+ - click==8.2.1
43
+ - colorama==0.4.6
44
+ - comm==0.2.2
45
+ - configargparse==1.7.1
46
+ - contourpy==1.3.2
47
+ - cycler==0.12.1
48
+ - dash==3.1.1
49
+ - decorator==5.2.1
50
+ - executing==2.2.0
51
+ - fastapi==0.115.14
52
+ - fastjsonschema==2.21.1
53
+ - ffmpy==0.6.0
54
+ - filelock==3.18.0
55
+ - flask==3.1.1
56
+ - flatbuffers==25.2.10
57
+ - fonttools==4.58.4
58
+ - fsspec==2025.5.1
59
+ - gast==0.6.0
60
+ - google-pasta==0.2.0
61
+ - gradio==5.35.0
62
+ - gradio-client==1.10.4
63
+ - gradio-imageslider==0.0.20
64
+ - groovy==0.1.2
65
+ - grpcio==1.73.1
66
+ - h11==0.16.0
67
+ - h5py==3.14.0
68
+ - hf-xet==1.1.5
69
+ - httpcore==1.0.9
70
+ - httpx==0.28.1
71
+ - huggingface-hub==0.33.2
72
+ - idna==3.10
73
+ - importlib-metadata==8.7.0
74
+ - ipython==9.4.0
75
+ - ipython-pygments-lexers==1.1.1
76
+ - ipywidgets==8.1.7
77
+ - itsdangerous==2.2.0
78
+ - jedi==0.19.2
79
+ - jinja2==3.1.6
80
+ - joblib==1.5.1
81
+ - jsonschema==4.24.0
82
+ - jsonschema-specifications==2025.4.1
83
+ - jupyter-core==5.8.1
84
+ - jupyterlab-widgets==3.0.15
85
+ - keras==3.10.0
86
+ - kiwisolver==1.4.8
87
+ - libclang==18.1.1
88
+ - markdown==3.8.2
89
+ - markdown-it-py==3.0.0
90
+ - markupsafe==3.0.2
91
+ - matplotlib==3.10.3
92
+ - matplotlib-inline==0.1.7
93
+ - mdurl==0.1.2
94
+ - ml-dtypes==0.5.1
95
+ - mpmath==1.3.0
96
+ - namex==0.1.0
97
+ - narwhals==1.45.0
98
+ - nbformat==5.10.4
99
+ - nest-asyncio==1.6.0
100
+ - networkx==3.5
101
+ - numpy==2.1.3
102
+ - nvidia-cublas-cu12==12.6.4.1
103
+ - nvidia-cuda-cupti-cu12==12.6.80
104
+ - nvidia-cuda-nvrtc-cu12==12.6.77
105
+ - nvidia-cuda-runtime-cu12==12.6.77
106
+ - nvidia-cudnn-cu12==9.5.1.17
107
+ - nvidia-cufft-cu12==11.3.0.4
108
+ - nvidia-cufile-cu12==1.11.1.6
109
+ - nvidia-curand-cu12==10.3.7.77
110
+ - nvidia-cusolver-cu12==11.7.1.2
111
+ - nvidia-cusparse-cu12==12.5.4.2
112
+ - nvidia-cusparselt-cu12==0.6.3
113
+ - nvidia-nccl-cu12==2.26.2
114
+ - nvidia-nvjitlink-cu12==12.6.85
115
+ - nvidia-nvtx-cu12==12.6.77
116
+ - open3d==0.19.0
117
+ - opencv-python==4.11.0.86
118
+ - opt-einsum==3.4.0
119
+ - optree==0.16.0
120
+ - orjson==3.10.18
121
+ - packaging==25.0
122
+ - pandas==2.3.0
123
+ - parso==0.8.4
124
+ - pexpect==4.9.0
125
+ - pillow==11.3.0
126
+ - platformdirs==4.3.8
127
+ - plotly==6.2.0
128
+ - prompt-toolkit==3.0.51
129
+ - protobuf==5.29.5
130
+ - ptyprocess==0.7.0
131
+ - pure-eval==0.2.3
132
+ - pydantic==2.11.7
133
+ - pydantic-core==2.33.2
134
+ - pydub==0.25.1
135
+ - pygments==2.19.2
136
+ - pyparsing==3.2.3
137
+ - pyquaternion==0.9.9
138
+ - python-dateutil==2.9.0.post0
139
+ - python-multipart==0.0.20
140
+ - pytz==2025.2
141
+ - pyyaml==6.0.2
142
+ - referencing==0.36.2
143
+ - requests==2.32.4
144
+ - retrying==1.4.0
145
+ - rich==14.0.0
146
+ - rpds-py==0.26.0
147
+ - ruff==0.12.1
148
+ - safehttpx==0.1.6
149
+ - scikit-learn==1.7.0
150
+ - scipy==1.16.0
151
+ - semantic-version==2.10.0
152
+ - shellingham==1.5.4
153
+ - six==1.17.0
154
+ - sniffio==1.3.1
155
+ - stack-data==0.6.3
156
+ - starlette==0.46.2
157
+ - sympy==1.14.0
158
+ - tensorboard==2.19.0
159
+ - tensorboard-data-server==0.7.2
160
+ - tensorflow==2.19.0
161
+ - termcolor==3.1.0
162
+ - threadpoolctl==3.6.0
163
+ - tomlkit==0.13.3
164
+ - torch==2.7.1
165
+ - torchaudio==2.7.1
166
+ - torchvision==0.22.1
167
+ - tqdm==4.67.1
168
+ - traitlets==5.14.3
169
+ - triton==3.3.1
170
+ - typer==0.16.0
171
+ - typing-extensions==4.14.0
172
+ - typing-inspection==0.4.1
173
+ - tzdata==2025.2
174
+ - urllib3==2.5.0
175
+ - uvicorn==0.35.0
176
+ - wcwidth==0.2.13
177
+ - websockets==15.0.1
178
+ - werkzeug==3.1.3
179
+ - widgetsnbextension==4.0.14
180
+ - wrapt==1.17.2
181
+ - zipp==3.23.0
182
+ prefix: /home/uphen/anaconda3/envs/depth_copy
environment_from_history.yml ADDED
@@ -0,0 +1,30 @@
1
+ name: depth_copy
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - conda-forge/linux-64::_libgcc_mutex==0.1=conda_forge
6
+ - conda-forge/noarch::ca-certificates==2025.6.15=hbd8a1cb_0
7
+ - conda-forge/linux-64::ld_impl_linux-64==2.44=h1423503_0
8
+ - conda-forge/linux-64::libgomp==15.1.0=h767d61c_3
9
+ - conda-forge/noarch::tzdata==2025b=h78e105d_0
10
+ - conda-forge/linux-64::_openmp_mutex==4.5=2_gnu
11
+ - conda-forge/linux-64::libgcc==15.1.0=h767d61c_3
12
+ - conda-forge/linux-64::libexpat==2.7.0=h5888daf_0
13
+ - conda-forge/linux-64::libffi==3.4.6=h2dba641_1
14
+ - conda-forge/linux-64::libgcc-ng==15.1.0=h69a702a_3
15
+ - conda-forge/linux-64::liblzma==5.8.1=hb9d3cd8_2
16
+ - conda-forge/linux-64::libnsl==2.0.1=hb9d3cd8_1
17
+ - conda-forge/linux-64::libzlib==1.3.1=hb9d3cd8_2
18
+ - conda-forge/linux-64::ncurses==6.5=h2d0b736_3
19
+ - conda-forge/linux-64::openssl==3.5.1=h7b32b05_0
20
+ - conda-forge/linux-64::bzip2==1.0.8=h4bc722e_7
21
+ - conda-forge/linux-64::libsqlite==3.50.2=h6cd9bfd_0
22
+ - conda-forge/linux-64::libuuid==2.38.1=h0b41bf4_0
23
+ - conda-forge/linux-64::libxcrypt==4.4.36=hd590300_1
24
+ - conda-forge/linux-64::readline==8.2=h8c095d6_2
25
+ - conda-forge/linux-64::tk==8.6.13=noxft_hd72426e_102
26
+ - conda-forge/linux-64::python==3.12.11=h9e4cc4f_0_cpython
27
+ - conda-forge/noarch::setuptools==80.9.0=pyhff2d567_0
28
+ - conda-forge/noarch::wheel==0.45.1=pyhd8ed1ab_1
29
+ - conda-forge/noarch::pip==25.1.1=pyh8b19718_0
30
+ prefix: /home/uphen/anaconda3/envs/depth_copy
environment_linux.yml ADDED
@@ -0,0 +1,116 @@
1
+ name: depth
2
+ channels:
3
+ - conda-forge
4
+ - pytorch
5
+ - defaults
6
+ dependencies:
7
+ - python=3.12
8
+ - pip
9
+ - pip:
10
+ - aiofiles==24.1.0
11
+ - annotated-types==0.7.0
12
+ - anyio==4.9.0
13
+ - asttokens==3.0.0
14
+ - attrs==25.3.0
15
+ - blinker==1.9.0
16
+ - certifi==2025.6.15
17
+ - charset-normalizer==3.4.2
18
+ - click==8.2.1
19
+ - colorama==0.4.6
20
+ - comm==0.2.2
21
+ - configargparse==1.7.1
22
+ - contourpy==1.3.2
23
+ - cycler==0.12.1
24
+ - dash==3.1.1
25
+ - decorator==5.2.1
26
+ - executing==2.2.0
27
+ - fastapi==0.115.14
28
+ - fastjsonschema==2.21.1
29
+ - ffmpy==0.6.0
30
+ - filelock==3.18.0
31
+ - flask==3.1.1
32
+ - fonttools==4.58.4
33
+ - fsspec==2025.5.1
34
+ - gradio==5.35.0
35
+ - gradio-client==1.10.4
36
+ - gradio-imageslider==0.0.20
37
+ - groovy==0.1.2
38
+ - h11==0.16.0
39
+ - httpcore==1.0.9
40
+ - httpx==0.28.1
41
+ - huggingface-hub==0.33.2
42
+ - idna==3.10
43
+ - importlib-metadata==8.7.0
44
+ - ipython==9.4.0
45
+ - ipython-pygments-lexers==1.1.1
46
+ - ipywidgets==8.1.7
47
+ - itsdangerous==2.2.0
48
+ - jedi==0.19.2
49
+ - jinja2==3.1.6
50
+ - jsonschema==4.24.0
51
+ - jsonschema-specifications==2025.4.1
52
+ - jupyter-core==5.8.1
53
+ - jupyterlab-widgets==3.0.15
54
+ - kiwisolver==1.4.8
55
+ - markdown-it-py==3.0.0
56
+ - markupsafe==3.0.2
57
+ - matplotlib==3.10.3
58
+ - matplotlib-inline==0.1.7
59
+ - mdurl==0.1.2
60
+ - mpmath==1.3.0
61
+ - narwhals==1.45.0
62
+ - nbformat==5.10.4
63
+ - nest-asyncio==1.6.0
64
+ - networkx==3.5
65
+ - numpy==2.3.1
66
+ - open3d==0.19.0
67
+ - opencv-python==4.11.0.86
68
+ - orjson==3.10.18
69
+ - packaging==25.0
70
+ - pandas==2.3.0
71
+ - parso==0.8.4
72
+ - pillow==11.3.0
73
+ - platformdirs==4.3.8
74
+ - plotly==6.2.0
75
+ - prompt-toolkit==3.0.51
76
+ - pure-eval==0.2.3
77
+ - pydantic==2.11.7
78
+ - pydantic-core==2.33.2
79
+ - pydub==0.25.1
80
+ - pygments==2.19.2
81
+ - pyparsing==3.2.3
82
+ - python-dateutil==2.9.0.post0
83
+ - python-multipart==0.0.20
84
+ - pytz==2025.2
85
+ - pyyaml==6.0.2
86
+ - referencing==0.36.2
87
+ - requests==2.32.4
88
+ - retrying==1.4.0
89
+ - rich==14.0.0
90
+ - rpds-py==0.26.0
91
+ - ruff==0.12.1
92
+ - safehttpx==0.1.6
93
+ - semantic-version==2.10.0
94
+ - shellingham==1.5.4
95
+ - six==1.17.0
96
+ - sniffio==1.3.1
97
+ - stack-data==0.6.3
98
+ - starlette==0.46.2
99
+ - sympy==1.14.0
100
+ - tomlkit==0.13.3
101
+ - torch==2.7.1
102
+ - torchaudio==2.7.1
103
+ - torchvision==0.22.1
104
+ - tqdm==4.67.1
105
+ - traitlets==5.14.3
106
+ - typer==0.16.0
107
+ - typing-extensions==4.14.0
108
+ - typing-inspection==0.4.1
109
+ - tzdata==2025.2
110
+ - urllib3==2.5.0
111
+ - uvicorn==0.35.0
112
+ - wcwidth==0.2.13
113
+ - websockets==15.0.1
114
+ - werkzeug==3.1.3
115
+ - widgetsnbextension==4.0.14
116
+ - zipp==3.23.0
keras_model_3.h5 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4669439786977b2098b89ad88387538d8085bd129e1e6b9de7f7821aa8baa3fa
3
+ size 2453440
labels.txt ADDED
@@ -0,0 +1,4 @@
1
+ 0 Burns
2
+ 1 Surgical Wou...
3
+ 2 Traumatic Wo...
4
+ 3 Diabetic Foo...
main_app.py ADDED
@@ -0,0 +1,540 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow.keras.models import load_model
5
+ from tensorflow.keras.preprocessing import image as keras_image
6
+ from tensorflow.keras import backend as K
7
+ import matplotlib.pyplot as plt
8
+ from PIL import Image
9
+ import io
10
+ import cv2
11
+ import glob
12
+ import matplotlib
13
+ import torch
14
+ import tempfile
15
+ from gradio_imageslider import ImageSlider
16
+ import plotly.graph_objects as go
17
+ import plotly.express as px
18
+ import open3d as o3d
19
+ from depth_anything_v2.dpt import DepthAnythingV2
20
+
21
+ # --- Load models ---
22
+ # Wound classification model
23
+ try:
24
+ wound_model = load_model("checkpoints/keras_model.h5")
25
+ with open("labels.txt", "r") as f:
26
+ class_labels = [line.strip() for line in f]
27
+ except Exception:  # fall back gracefully if the classifier or labels file is missing
28
+ wound_model = None
29
+ class_labels = ["No model found"]
30
+
31
+ # Depth estimation model
32
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
33
+ model_configs = {
34
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
35
+ 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
36
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
37
+ 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
38
+ }
39
+ encoder = 'vitl'
40
+ try:
41
+ depth_model = DepthAnythingV2(**model_configs[encoder])
42
+ state_dict = torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location="cpu")
43
+ depth_model.load_state_dict(state_dict)
44
+ depth_model = depth_model.to(DEVICE).eval()
45
+ except Exception:  # fall back gracefully if the depth checkpoint is missing
46
+ depth_model = None
47
+
48
+ # --- Wound Classification Functions ---
49
+ def preprocess_input(img):
50
+ img = img.resize((224, 224))
51
+ arr = keras_image.img_to_array(img)
52
+ arr = arr / 255.0
53
+ return np.expand_dims(arr, axis=0)
54
+
55
+ def get_gradcam_heatmap(img_array, model, class_index, last_conv_layer_name="conv5_block3_out"):
56
+ try:
57
+ target_layer = model.get_layer(last_conv_layer_name)
58
+ except Exception:  # named layer not found; fall back to the first conv layer below
59
+ for layer in model.layers:
60
+ if 'conv' in layer.name.lower():
61
+ target_layer = layer
62
+ break
63
+ else:
64
+ return None
65
+
66
+ grad_model = tf.keras.models.Model(
67
+ [model.inputs], [target_layer.output, model.output]
68
+ )
69
+
70
+ with tf.GradientTape() as tape:
71
+ conv_outputs, predictions = grad_model(img_array)
72
+ loss = predictions[:, class_index]
73
+
74
+ grads = tape.gradient(loss, conv_outputs)
75
+ if grads is None:
76
+ return None
77
+
78
+ grads = grads[0]
79
+ pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))
80
+ conv_outputs = conv_outputs[0]
81
+
82
+ heatmap = tf.reduce_sum(tf.multiply(pooled_grads, conv_outputs), axis=-1)
83
+ heatmap = np.maximum(heatmap, 0)
84
+ heatmap = heatmap / np.max(heatmap + K.epsilon())
85
+ return heatmap.numpy()
86
+
87
+ def overlay_gradcam(original_img, heatmap):
88
+ if heatmap is None:
89
+ return original_img
90
+
91
+ heatmap = cv2.resize(heatmap, original_img.size)
92
+ heatmap = np.maximum(heatmap, 0)
93
+ if np.max(heatmap) != 0:
94
+ heatmap /= np.max(heatmap)
95
+ heatmap = np.uint8(255 * heatmap)
96
+
97
+ heatmap_color = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
98
+ original_array = np.array(original_img.convert("RGB"))
99
+ superimposed_img = cv2.addWeighted(original_array, 0.6, heatmap_color, 0.4, 0)
100
+
101
+ return Image.fromarray(superimposed_img)
102
+
103
+ def classify_and_explain(img):
104
+ if img is None or wound_model is None:
105
+ return None, "No image provided or model not available"
106
+
107
+ img_array = preprocess_input(img)
108
+ predictions = wound_model.predict(img_array, verbose=0)[0]
109
+ pred_idx = int(np.argmax(predictions))
110
+ pred_class = class_labels[pred_idx]
111
+ confidence_dict = {class_labels[i]: float(predictions[i]) for i in range(len(class_labels))}
112
+
113
+ try:
114
+ heatmap = get_gradcam_heatmap(img_array, wound_model, pred_idx)
115
+ gradcam_img = overlay_gradcam(img.resize((224, 224)), heatmap)
116
+ except Exception as e:
117
+ print(f"Grad-CAM error: {e}")
118
+ gradcam_img = img.resize((224, 224))
119
+
120
+ return gradcam_img, confidence_dict
121
+
122
+ def create_confidence_bars(confidence_dict):
123
+ html_content = "<div class='confidence-container'>"
124
+ for class_name, confidence in confidence_dict.items():
125
+ percentage = confidence * 100
126
+ if percentage > 70:
127
+ css_class = "confidence-high"
128
+ elif percentage > 40:
129
+ css_class = "confidence-medium"
130
+ else:
131
+ css_class = "confidence-low"
132
+
133
+ html_content += f"""
134
+ <div style='margin: 12px 0;'>
135
+ <div style='display: flex; justify-content: space-between; margin-bottom: 8px;'>
136
+ <span style='font-weight: bold;'>{class_name}</span>
137
+ <span style='font-weight: bold;'>{percentage:.1f}%</span>
138
+ </div>
139
+ <div class='confidence-bar {css_class}' style='width: {percentage}%;'></div>
140
+ </div>
141
+ """
142
+ html_content += "</div>"
143
+ return html_content
144
+
145
+ def enhanced_classify_and_explain(img):
146
+ if img is None:
147
+ return None, "No image provided", 0, ""
148
+
149
+ gradcam_img, confidence_dict = classify_and_explain(img)
150
+
151
+ if isinstance(confidence_dict, str): # Error case
152
+ return None, confidence_dict, 0, ""
153
+
154
+ pred_class = max(confidence_dict, key=confidence_dict.get)
155
+ confidence = confidence_dict[pred_class]
156
+ confidence_bars_html = create_confidence_bars(confidence_dict)
157
+
158
+ return gradcam_img, pred_class, confidence, confidence_bars_html
159
+
160
+ # --- Depth Estimation Functions ---
161
+ def predict_depth(image):
162
+ if depth_model is None:
163
+ return None
164
+ return depth_model.infer_image(image)
165
+
166
+ def calculate_max_points(image):
167
+ if image is None:
168
+ return 10000
169
+ h, w = image.shape[:2]
170
+ max_points = h * w * 3
171
+ return max(1000, min(max_points, 1000000))
172
+
173
+ def update_slider_on_image_upload(image):
174
+ max_points = calculate_max_points(image)
175
+ default_value = min(10000, max_points // 10)
176
+ return gr.Slider(minimum=1000, maximum=max_points, value=default_value, step=1000,
177
+ label=f"Number of 3D points (max: {max_points:,})")
178
+
179
+ def create_point_cloud(image, depth_map, focal_length_x=470.4, focal_length_y=470.4, max_points=100000):
180
+ h, w = depth_map.shape
181
+ step = max(1, int(np.sqrt(h * w / max_points)))
182
+
183
+ y_coords, x_coords = np.mgrid[0:h:step, 0:w:step]
184
+ x_cam = (x_coords - w / 2) / focal_length_x
185
+ y_cam = (y_coords - h / 2) / focal_length_y
186
+
187
+ depth_values = depth_map[::step, ::step]
188
+ x_3d = x_cam * depth_values
189
+ y_3d = y_cam * depth_values
190
+ z_3d = depth_values
191
+
192
+ points = np.stack([x_3d.flatten(), y_3d.flatten(), z_3d.flatten()], axis=1)
193
+ image_colors = image[::step, ::step, :]
194
+ colors = image_colors.reshape(-1, 3) / 255.0
195
+
196
+ pcd = o3d.geometry.PointCloud()
197
+ pcd.points = o3d.utility.Vector3dVector(points)
198
+ pcd.colors = o3d.utility.Vector3dVector(colors)
199
+
200
+ return pcd
201
+
202
+ def create_enhanced_3d_visualization(image, depth_map, max_points=10000):
203
+ h, w = depth_map.shape
204
+ step = max(1, int(np.sqrt(h * w / max_points)))
205
+
206
+ y_coords, x_coords = np.mgrid[0:h:step, 0:w:step]
207
+ focal_length = 470.4
208
+ x_cam = (x_coords - w / 2) / focal_length
209
+ y_cam = (y_coords - h / 2) / focal_length
210
+
211
+ depth_values = depth_map[::step, ::step]
212
+ x_3d = x_cam * depth_values
213
+ y_3d = y_cam * depth_values
214
+ z_3d = depth_values
215
+
216
+ x_flat = x_3d.flatten()
217
+ y_flat = y_3d.flatten()
218
+ z_flat = z_3d.flatten()
219
+
220
+ image_colors = image[::step, ::step, :]
221
+ colors_flat = image_colors.reshape(-1, 3)
222
+
223
+ fig = go.Figure(data=[go.Scatter3d(
224
+ x=x_flat,
225
+ y=y_flat,
226
+ z=z_flat,
227
+ mode='markers',
228
+ marker=dict(
229
+ size=1.5,
230
+ color=colors_flat,
231
+ opacity=0.9
232
+ ),
233
+ hovertemplate='<b>3D Position:</b> (%{x:.3f}, %{y:.3f}, %{z:.3f})<br>' +
234
+ '<b>Depth:</b> %{z:.2f}<br>' +
235
+ '<extra></extra>'
236
+ )])
237
+
238
+ fig.update_layout(
239
+ title="3D Point Cloud Visualization (Camera Projection)",
240
+ scene=dict(
241
+ xaxis_title="X (meters)",
242
+ yaxis_title="Y (meters)",
243
+ zaxis_title="Z (meters)",
244
+ camera=dict(
245
+ eye=dict(x=2.0, y=2.0, z=2.0),
246
+ center=dict(x=0, y=0, z=0),
247
+ up=dict(x=0, y=0, z=1)
248
+ ),
249
+ aspectmode='data'
250
+ ),
251
+ width=700,
252
+ height=600
253
+ )
254
+
255
+ return fig
256
+
257
+ def on_depth_submit(image, num_points, focal_x, focal_y):
258
+ if image is None or depth_model is None:
259
+ return None, None, None, None, None
260
+
261
+ original_image = image.copy()
262
+ h, w = image.shape[:2]
263
+ depth = predict_depth(image[:, :, ::-1])
264
+
265
+ if depth is None:
266
+ return None, None, None, None, None
267
+
268
+ raw_depth = Image.fromarray(depth.astype('uint16'))
269
+ tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
270
+ raw_depth.save(tmp_raw_depth.name)
271
+
272
+ depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
273
+ depth = depth.astype(np.uint8)
274
+ cmap = matplotlib.colormaps.get_cmap('Spectral_r')
275
+ colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)
276
+
277
+ gray_depth = Image.fromarray(depth)
278
+ tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
279
+ gray_depth.save(tmp_gray_depth.name)
280
+
281
+ pcd = create_point_cloud(original_image, depth, focal_x, focal_y, max_points=num_points)
282
+ tmp_pointcloud = tempfile.NamedTemporaryFile(suffix='.ply', delete=False)
283
+ o3d.io.write_point_cloud(tmp_pointcloud.name, pcd)
284
+
285
+ depth_3d = create_enhanced_3d_visualization(original_image, depth, max_points=num_points)
286
+
287
+ return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name, tmp_pointcloud.name, depth_3d]
288
+
289
+ # --- Custom CSS for Unified Interface ---
290
+ css = """
291
+ /* Minimal dark theme styling */
292
+ .main-header {
293
+ text-align: center;
294
+ margin-bottom: 2rem;
295
+ padding: 2rem 0;
296
+ }
297
+
298
+ .main-header h1 {
299
+ font-size: 2.5rem;
300
+ margin-bottom: 0.5rem;
301
+ font-weight: 600;
302
+ }
303
+
304
+ .main-header p {
305
+ font-size: 1.1rem;
306
+ opacity: 0.8;
307
+ }
308
+
309
+ .section-title {
310
+ font-size: 1.2rem;
311
+ font-weight: 600;
312
+ margin-bottom: 15px;
313
+ padding-bottom: 8px;
314
+ border-bottom: 1px solid var(--border-color-primary);
315
+ }
316
+
317
+ .confidence-container {
318
+ margin: 15px 0;
319
+ padding: 15px;
320
+ border-radius: 8px;
321
+ background: var(--background-secondary);
322
+ border: 1px solid var(--border-color-primary);
323
+ }
324
+
325
+ .confidence-bar {
326
+ height: 20px;
327
+ border-radius: 4px;
328
+ margin: 6px 0;
329
+ background: var(--primary-500);
330
+ transition: width 0.3s ease;
331
+ }
332
+
333
+ /* Simple confidence bar colors */
334
+ .confidence-high {
335
+ background: var(--success-500);
336
+ }
337
+
338
+ .confidence-medium {
339
+ background: var(--warning-500);
340
+ }
341
+
342
+ .confidence-low {
343
+ background: var(--error-500);
344
+ }
345
+
346
+ /* Minimal spacing and layout */
347
+ .gradio-container {
348
+ max-width: 100%;
349
+ margin: 0;
350
+ padding: 20px;
351
+ width: 100%;
352
+ }
353
+
354
+ /* Clean image styling */
355
+ .gradio-image {
356
+ border-radius: 8px;
357
+ border: 1px solid var(--border-color-primary);
358
+ }
359
+
360
+ /* Simple button styling */
361
+ .gradio-button {
362
+ border-radius: 6px;
363
+ font-weight: 500;
364
+ }
365
+
366
+ /* Clean form elements */
367
+ .gradio-textbox, .gradio-number, .gradio-slider {
368
+ border-radius: 6px;
369
+ border: 1px solid var(--border-color-primary);
370
+ }
371
+
372
+ /* Tab styling */
373
+ .gradio-tabs {
374
+ border-radius: 8px;
375
+ overflow: hidden;
376
+ }
377
+
378
+ /* File upload styling */
379
+ .gradio-file {
380
+ border-radius: 6px;
381
+ border: 1px solid var(--border-color-primary);
382
+ }
383
+
384
+ /* Plot styling */
385
+ .gradio-plot {
386
+ border-radius: 8px;
387
+ border: 1px solid var(--border-color-primary);
388
+ }
389
+
390
+ /* Full width and height layout */
391
+ body, html {
392
+ margin: 0;
393
+ padding: 0;
394
+ width: 100%;
395
+ height: 100%;
396
+ }
397
+
398
+ #root {
399
+ width: 100%;
400
+ height: 100%;
401
+ }
402
+
403
+ /* Ensure Gradio uses full width */
404
+ .gradio-container {
405
+ min-height: 100vh;
406
+ }
407
+
408
+ /* Responsive adjustments */
409
+ @media (max-width: 768px) {
410
+ .main-header h1 {
411
+ font-size: 2rem;
412
+ }
413
+
414
+ .gradio-container {
415
+ padding: 10px;
416
+ }
417
+ }
418
+ """
419
+
420
+ # --- Create Unified Interface ---
421
+ with gr.Blocks(css=css, title="Medical AI Suite") as demo:
422
+ gr.HTML("""
423
+ <div class="main-header">
424
+ <h1>Medical AI Suite</h1>
425
+ <p>Advanced AI-powered medical image analysis and 3D visualization</p>
426
+ </div>
427
+ """)
428
+
429
+ with gr.Tabs() as tabs:
430
+ # Tab 1: Wound Classification
431
+ with gr.TabItem("Wound Classification", id=0):
432
+ gr.HTML("<div class='section-title'>Wound Classification with Grad-CAM Visualization</div>")
433
+
434
+ with gr.Row():
435
+ with gr.Column(scale=1):
436
+ gr.HTML("<div class='section-title'>Input Image</div>")
437
+ wound_input_image = gr.Image(
438
+ label="Upload wound image",
439
+ type="pil",
440
+ height=350,
441
+ container=True
442
+ )
443
+
444
+ with gr.Column(scale=1):
445
+ gr.HTML("<div class='section-title'>Analysis Results</div>")
446
+ wound_prediction_output = gr.Textbox(
447
+ label="Predicted Wound Type",
448
+ interactive=False,
449
+ container=True
450
+ )
451
+ wound_confidence_output = gr.Number(
452
+ label="Confidence Score",
453
+ interactive=False,
454
+ container=True
455
+ )
456
+ wound_confidence_bars = gr.HTML(
457
+ label="Confidence Scores by Class",
458
+ container=True
459
+ )
460
+
461
+ with gr.Row():
462
+ with gr.Column():
463
+ gr.HTML("<div class='section-title'>Model Focus Visualization</div>")
464
+ wound_cam_output = gr.Image(
465
+ label="Grad-CAM Heatmap - Shows which areas the model focused on",
466
+ height=350,
467
+ container=True
468
+ )
469
+
470
+ # Event handlers for wound classification
471
+ wound_input_image.change(
472
+ fn=enhanced_classify_and_explain,
473
+ inputs=[wound_input_image],
474
+ outputs=[wound_cam_output, wound_prediction_output, wound_confidence_output, wound_confidence_bars]
475
+ )
476
+
477
+ # Tab 2: Depth Estimation
478
+ with gr.TabItem("Depth Estimation & 3D Visualization", id=1):
479
+ gr.HTML("<div class='section-title'>Depth Estimation and 3D Point Cloud Generation</div>")
480
+
481
+ with gr.Row():
482
+ depth_input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input')
483
+ depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output')
484
+
485
+ with gr.Row():
486
+ depth_submit = gr.Button(value="Compute Depth", variant="primary")
487
+ depth_points_slider = gr.Slider(minimum=1000, maximum=10000, value=10000, step=1000,
488
+ label="Number of 3D points (upload image to update max)")
489
+
490
+ with gr.Row():
491
+ depth_focal_length_x = gr.Slider(minimum=100, maximum=1000, value=470.4, step=10,
492
+ label="Focal Length X (pixels)")
493
+ depth_focal_length_y = gr.Slider(minimum=100, maximum=1000, value=470.4, step=10,
494
+ label="Focal Length Y (pixels)")
495
+
496
+ with gr.Row():
497
+ depth_gray_depth_file = gr.File(label="Grayscale depth map", elem_id="download")
498
+ depth_raw_file = gr.File(label="16-bit raw output (can be considered as disparity)", elem_id="download")
499
+ depth_point_cloud_file = gr.File(label="Point Cloud (.ply)", elem_id="download")
500
+
501
+ gr.Markdown("### 3D Point Cloud Visualization")
502
+ gr.Markdown("Enhanced 3D visualization using proper camera projection. Hover over points to see 3D coordinates.")
503
+ depth_3d_plot = gr.Plot(label="3D Point Cloud")
504
+
505
+ # Event handlers for depth estimation
506
+ depth_input_image.change(
507
+ fn=update_slider_on_image_upload,
508
+ inputs=[depth_input_image],
509
+ outputs=[depth_points_slider]
510
+ )
511
+
512
+ depth_submit.click(
513
+ on_depth_submit,
514
+ inputs=[depth_input_image, depth_points_slider, depth_focal_length_x, depth_focal_length_y],
515
+ outputs=[depth_image_slider, depth_gray_depth_file, depth_raw_file, depth_point_cloud_file, depth_3d_plot]
516
+ )
517
+
518
+ # Cross-tab image sharing functionality
519
+ # When image is uploaded in wound classification, also update depth estimation
520
+ wound_input_image.change(
521
+ fn=lambda img: img,
522
+ inputs=[wound_input_image],
523
+ outputs=[depth_input_image]
524
+ )
525
+
526
+ # When image is uploaded in depth estimation, also update wound classification
527
+ depth_input_image.change(
528
+ fn=lambda img: img,
529
+ inputs=[depth_input_image],
530
+ outputs=[wound_input_image]
531
+ )
532
+
533
+ # --- Launch the unified interface ---
534
+ if __name__ == "__main__":
535
+ demo.queue().launch(
536
+ server_name="0.0.0.0",
537
+ server_port=7860,
538
+ share=True,
539
+ show_error=True
540
+ )
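The cross-tab sharing above is nothing more than identity `.change` handlers between the two image components. A minimal, standalone sketch of that pattern (component names and labels here are illustrative, not the ones used in this app):

```python
import gradio as gr

def mirror(img):
    # Identity handler: copy whatever was uploaded into the other tab's component.
    return img

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Classification"):
            cls_img = gr.Image(type="pil", label="Classification input")
        with gr.TabItem("Depth"):
            depth_img = gr.Image(type="pil", label="Depth input")

    # One direction shown here for clarity; the app above wires both directions
    # with the same identity function.
    cls_img.change(fn=mirror, inputs=[cls_img], outputs=[depth_img])

demo.queue().launch()
```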
metric_depth/README.md ADDED
@@ -0,0 +1,114 @@
1
+ # Depth Anything V2 for Metric Depth Estimation
2
+
3
+ ![teaser](./assets/compare_zoedepth.png)
4
+
5
+ Here we provide a simple codebase for fine-tuning our Depth Anything V2 pre-trained encoder for metric depth estimation. Built on this powerful encoder, we use a simple DPT head to regress depth. We fine-tune the pre-trained encoder on the synthetic Hypersim / Virtual KITTI datasets for indoor / outdoor metric depth estimation, respectively.
6
+
7
+
8
+ # Pre-trained Models
9
+
10
+ We provide **six metric depth models**: three scales each for indoor and outdoor scenes.
11
+
12
+ | Base Model | Params | Indoor (Hypersim) | Outdoor (Virtual KITTI 2) |
13
+ |:-|-:|:-:|:-:|
14
+ | Depth-Anything-V2-Small | 24.8M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-Hypersim-Small/resolve/main/depth_anything_v2_metric_hypersim_vits.pth?download=true) | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-VKITTI-Small/resolve/main/depth_anything_v2_metric_vkitti_vits.pth?download=true) |
15
+ | Depth-Anything-V2-Base | 97.5M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-Hypersim-Base/resolve/main/depth_anything_v2_metric_hypersim_vitb.pth?download=true) | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-VKITTI-Base/resolve/main/depth_anything_v2_metric_vkitti_vitb.pth?download=true) |
16
+ | Depth-Anything-V2-Large | 335.3M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-Hypersim-Large/resolve/main/depth_anything_v2_metric_hypersim_vitl.pth?download=true) | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Metric-VKITTI-Large/resolve/main/depth_anything_v2_metric_vkitti_vitl.pth?download=true) |
17
+
18
+ *We recommend first trying our larger models (if the computational cost is affordable) and the indoor version.*
19
+
20
+ ## Usage
21
+
22
+ ### Preparation
23
+
24
+ ```bash
25
+ git clone https://github.com/DepthAnything/Depth-Anything-V2
26
+ cd Depth-Anything-V2/metric_depth
27
+ pip install -r requirements.txt
28
+ ```
29
+
30
+ Download the checkpoints listed [here](#pre-trained-models) and put them under the `checkpoints` directory.
31
+
32
+ ### Use our models
33
+ ```python
34
+ import cv2
35
+ import torch
36
+
37
+ from depth_anything_v2.dpt import DepthAnythingV2
38
+
39
+ model_configs = {
40
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
41
+ 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
42
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
43
+ }
44
+
45
+ encoder = 'vitl' # or 'vits', 'vitb'
46
+ dataset = 'hypersim' # 'hypersim' for indoor model, 'vkitti' for outdoor model
47
+ max_depth = 20 # 20 for indoor model, 80 for outdoor model
48
+
49
+ model = DepthAnythingV2(**{**model_configs[encoder], 'max_depth': max_depth})
50
+ model.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_metric_{dataset}_{encoder}.pth', map_location='cpu'))
51
+ model.eval()
52
+
53
+ raw_img = cv2.imread('your/image/path')
54
+ depth = model.infer_image(raw_img) # HxW depth map in meters in numpy
55
+ ```
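
`infer_image` returns a float depth map in meters; a common follow-up, sketched below, is to store it as a lossless 16-bit PNG (the millimeter scaling is a convention chosen for this example, not something the repository prescribes):

```python
import cv2
import numpy as np

# Stand-in for the output of model.infer_image(raw_img): an HxW float32 map in meters.
depth = np.random.rand(480, 640).astype(np.float32) * 5.0

depth_mm = np.clip(depth * 1000.0, 0, 65535).astype(np.uint16)  # meters -> millimeters
cv2.imwrite('depth_16bit.png', depth_mm)                         # lossless 16-bit PNG

# Reload and recover meters
depth_back = cv2.imread('depth_16bit.png', cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0
```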
56
+
57
+ ### Running the script on images
58
+
59
+ Here, we take the `vitl` encoder as an example. You can also use `vitb` or `vits` encoders.
60
+
61
+ ```bash
62
+ # indoor scenes
63
+ python run.py \
64
+ --encoder vitl \
65
+ --load-from checkpoints/depth_anything_v2_metric_hypersim_vitl.pth \
66
+ --max-depth 20 \
67
+ --img-path <path> --outdir <outdir> [--input-size <size>] [--save-numpy]
68
+
69
+ # outdoor scenes
70
+ python run.py \
71
+ --encoder vitl \
72
+ --load-from checkpoints/depth_anything_v2_metric_vkitti_vitl.pth \
73
+ --max-depth 80 \
74
+ --img-path <path> --outdir <outdir> [--input-size <size>] [--save-numpy]
75
+ ```
76
+
77
+ ### Project 2D images to point clouds
78
+
79
+ ```bash
80
+ python depth_to_pointcloud.py \
81
+ --encoder vitl \
82
+ --load-from checkpoints/depth_anything_v2_metric_hypersim_vitl.pth \
83
+ --max-depth 20 \
84
+ --img-path <path> --outdir <outdir>
85
+ ```
86
+
87
+ ### Reproduce training
88
+
89
+ Please first prepare the [Hypersim](https://github.com/apple/ml-hypersim) and [Virtual KITTI 2](https://europe.naverlabs.com/research/computer-vision/proxy-virtual-worlds-vkitti-2/) datasets. Then:
90
+
91
+ ```bash
92
+ bash dist_train.sh
93
+ ```
94
+
95
+
96
+ ## Citation
97
+
98
+ If you find this project useful, please consider citing:
99
+
100
+ ```bibtex
101
+ @article{depth_anything_v2,
102
+ title={Depth Anything V2},
103
+ author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
104
+ journal={arXiv:2406.09414},
105
+ year={2024}
106
+ }
107
+
108
+ @inproceedings{depth_anything_v1,
109
+ title={Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data},
110
+ author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
111
+ booktitle={CVPR},
112
+ year={2024}
113
+ }
114
+ ```
metric_depth/assets/compare_zoedepth.png ADDED

Git LFS Details

  • SHA256: 8044e39ef6cb4aaabea9a81333fa1ff2d3e07448e7f9f43f77f471aba72a12e0
  • Pointer size: 132 Bytes
  • Size of remote file: 9.19 MB
metric_depth/dataset/hypersim.py ADDED
@@ -0,0 +1,74 @@
1
+ import cv2
2
+ import h5py
3
+ import numpy as np
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+ from torchvision.transforms import Compose
7
+
8
+ from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop
9
+
10
+
11
+ def hypersim_distance_to_depth(npyDistance):
12
+ intWidth, intHeight, fltFocal = 1024, 768, 886.81
13
+
14
+ npyImageplaneX = np.linspace((-0.5 * intWidth) + 0.5, (0.5 * intWidth) - 0.5, intWidth).reshape(
15
+ 1, intWidth).repeat(intHeight, 0).astype(np.float32)[:, :, None]
16
+ npyImageplaneY = np.linspace((-0.5 * intHeight) + 0.5, (0.5 * intHeight) - 0.5,
17
+ intHeight).reshape(intHeight, 1).repeat(intWidth, 1).astype(np.float32)[:, :, None]
18
+ npyImageplaneZ = np.full([intHeight, intWidth, 1], fltFocal, np.float32)
19
+ npyImageplane = np.concatenate(
20
+ [npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2)
21
+
22
+ npyDepth = npyDistance / np.linalg.norm(npyImageplane, 2, 2) * fltFocal
23
+ return npyDepth
24
+
25
+
26
+ class Hypersim(Dataset):
27
+ def __init__(self, filelist_path, mode, size=(518, 518)):
28
+
29
+ self.mode = mode
30
+ self.size = size
31
+
32
+ with open(filelist_path, 'r') as f:
33
+ self.filelist = f.read().splitlines()
34
+
35
+ net_w, net_h = size
36
+ self.transform = Compose([
37
+ Resize(
38
+ width=net_w,
39
+ height=net_h,
40
+ resize_target=True if mode == 'train' else False,
41
+ keep_aspect_ratio=True,
42
+ ensure_multiple_of=14,
43
+ resize_method='lower_bound',
44
+ image_interpolation_method=cv2.INTER_CUBIC,
45
+ ),
46
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
47
+ PrepareForNet(),
48
+ ] + ([Crop(size[0])] if self.mode == 'train' else []))
49
+
50
+ def __getitem__(self, item):
51
+ img_path = self.filelist[item].split(' ')[0]
52
+ depth_path = self.filelist[item].split(' ')[1]
53
+
54
+ image = cv2.imread(img_path)
55
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
56
+
57
+ depth_fd = h5py.File(depth_path, "r")
58
+ distance_meters = np.array(depth_fd['dataset'])
59
+ depth = hypersim_distance_to_depth(distance_meters)
60
+
61
+ sample = self.transform({'image': image, 'depth': depth})
62
+
63
+ sample['image'] = torch.from_numpy(sample['image'])
64
+ sample['depth'] = torch.from_numpy(sample['depth'])
65
+
66
+ sample['valid_mask'] = (torch.isnan(sample['depth']) == 0)
67
+ sample['depth'][sample['valid_mask'] == 0] = 0
68
+
69
+ sample['image_path'] = self.filelist[item].split(' ')[0]
70
+
71
+ return sample
72
+
73
+ def __len__(self):
74
+ return len(self.filelist)
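
A minimal sketch of consuming this dataset with a `DataLoader` (the split path is the one shipped in this upload; each line of it is expected to hold `<image_path> <hdf5_depth_path>`):

```python
from torch.utils.data import DataLoader
from dataset.hypersim import Hypersim

dataset = Hypersim('dataset/splits/hypersim/val.txt', mode='val', size=(518, 518))
loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=2)

for sample in loader:
    image = sample['image']        # resized + normalized tensor, shape (1, 3, h, w)
    depth = sample['depth']        # planar depth in meters; kept at native resolution in 'val' mode
    valid = sample['valid_mask']   # bool mask of the originally finite depth values
    break
```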
metric_depth/dataset/kitti.py ADDED
@@ -0,0 +1,57 @@
1
+ import cv2
2
+ import torch
3
+ from torch.utils.data import Dataset
4
+ from torchvision.transforms import Compose
5
+
6
+ from dataset.transform import Resize, NormalizeImage, PrepareForNet
7
+
8
+
9
+ class KITTI(Dataset):
10
+ def __init__(self, filelist_path, mode, size=(518, 518)):
11
+ if mode != 'val':
12
+ raise NotImplementedError
13
+
14
+ self.mode = mode
15
+ self.size = size
16
+
17
+ with open(filelist_path, 'r') as f:
18
+ self.filelist = f.read().splitlines()
19
+
20
+ net_w, net_h = size
21
+ self.transform = Compose([
22
+ Resize(
23
+ width=net_w,
24
+ height=net_h,
25
+ resize_target=True if mode == 'train' else False,
26
+ keep_aspect_ratio=True,
27
+ ensure_multiple_of=14,
28
+ resize_method='lower_bound',
29
+ image_interpolation_method=cv2.INTER_CUBIC,
30
+ ),
31
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
32
+ PrepareForNet(),
33
+ ])
34
+
35
+ def __getitem__(self, item):
36
+ img_path = self.filelist[item].split(' ')[0]
37
+ depth_path = self.filelist[item].split(' ')[1]
38
+
39
+ image = cv2.imread(img_path)
40
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
41
+
42
+ depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype('float32')
43
+
44
+ sample = self.transform({'image': image, 'depth': depth})
45
+
46
+ sample['image'] = torch.from_numpy(sample['image'])
47
+ sample['depth'] = torch.from_numpy(sample['depth'])
48
+ sample['depth'] = sample['depth'] / 256.0 # convert to meters
49
+
50
+ sample['valid_mask'] = sample['depth'] > 0
51
+
52
+ sample['image_path'] = self.filelist[item].split(' ')[0]
53
+
54
+ return sample
55
+
56
+ def __len__(self):
57
+ return len(self.filelist)
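
The `/ 256.0` above follows the KITTI depth-benchmark convention; reading a ground-truth PNG directly looks like this (the path is a placeholder):

```python
import cv2
import numpy as np

# KITTI ground-truth depth maps are uint16 PNGs: 0 marks missing pixels,
# and valid pixels store depth * 256, so dividing by 256 recovers meters.
raw = cv2.imread('path/to/groundtruth_depth.png', cv2.IMREAD_UNCHANGED)
depth_m = raw.astype(np.float32) / 256.0
valid = raw > 0
print(depth_m[valid].min(), depth_m[valid].max())
```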
metric_depth/dataset/splits/hypersim/val.txt ADDED
The diff for this file is too large to render. See raw diff
 
metric_depth/dataset/splits/kitti/val.txt ADDED
The diff for this file is too large to render. See raw diff
 
metric_depth/dataset/splits/vkitti2/train.txt ADDED
The diff for this file is too large to render. See raw diff
 
metric_depth/dataset/transform.py ADDED
@@ -0,0 +1,277 @@
1
+ import cv2
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
9
+ """Resize the sample to ensure the given size. Keeps aspect ratio.
10
+
11
+ Args:
12
+ sample (dict): sample
13
+ size (tuple): image size
14
+
15
+ Returns:
16
+ tuple: new size
17
+ """
18
+ shape = list(sample["disparity"].shape)
19
+
20
+ if shape[0] >= size[0] and shape[1] >= size[1]:
21
+ return sample
22
+
23
+ scale = [0, 0]
24
+ scale[0] = size[0] / shape[0]
25
+ scale[1] = size[1] / shape[1]
26
+
27
+ scale = max(scale)
28
+
29
+ shape[0] = math.ceil(scale * shape[0])
30
+ shape[1] = math.ceil(scale * shape[1])
31
+
32
+ # resize
33
+ sample["image"] = cv2.resize(
34
+ sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
35
+ )
36
+
37
+ sample["disparity"] = cv2.resize(
38
+ sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
39
+ )
40
+ sample["mask"] = cv2.resize(
41
+ sample["mask"].astype(np.float32),
42
+ tuple(shape[::-1]),
43
+ interpolation=cv2.INTER_NEAREST,
44
+ )
45
+ sample["mask"] = sample["mask"].astype(bool)
46
+
47
+ return tuple(shape)
48
+
49
+
50
+ class Resize(object):
51
+ """Resize sample to given size (width, height).
52
+ """
53
+
54
+ def __init__(
55
+ self,
56
+ width,
57
+ height,
58
+ resize_target=True,
59
+ keep_aspect_ratio=False,
60
+ ensure_multiple_of=1,
61
+ resize_method="lower_bound",
62
+ image_interpolation_method=cv2.INTER_AREA,
63
+ ):
64
+ """Init.
65
+
66
+ Args:
67
+ width (int): desired output width
68
+ height (int): desired output height
69
+ resize_target (bool, optional):
70
+ True: Resize the full sample (image, mask, target).
71
+ False: Resize image only.
72
+ Defaults to True.
73
+ keep_aspect_ratio (bool, optional):
74
+ True: Keep the aspect ratio of the input sample.
75
+ Output sample might not have the given width and height, and
76
+ resize behaviour depends on the parameter 'resize_method'.
77
+ Defaults to False.
78
+ ensure_multiple_of (int, optional):
79
+ Output width and height is constrained to be multiple of this parameter.
80
+ Defaults to 1.
81
+ resize_method (str, optional):
82
+ "lower_bound": Output will be at least as large as the given size.
83
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
84
+ "minimal": Scale as little as possible. (Output size might be smaller than given size.)
85
+ Defaults to "lower_bound".
86
+ """
87
+ self.__width = width
88
+ self.__height = height
89
+
90
+ self.__resize_target = resize_target
91
+ self.__keep_aspect_ratio = keep_aspect_ratio
92
+ self.__multiple_of = ensure_multiple_of
93
+ self.__resize_method = resize_method
94
+ self.__image_interpolation_method = image_interpolation_method
95
+
96
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
97
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
98
+
99
+ if max_val is not None and y > max_val:
100
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
101
+
102
+ if y < min_val:
103
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
104
+
105
+ return y
106
+
107
+ def get_size(self, width, height):
108
+ # determine new height and width
109
+ scale_height = self.__height / height
110
+ scale_width = self.__width / width
111
+
112
+ if self.__keep_aspect_ratio:
113
+ if self.__resize_method == "lower_bound":
114
+ # scale such that output size is lower bound
115
+ if scale_width > scale_height:
116
+ # fit width
117
+ scale_height = scale_width
118
+ else:
119
+ # fit height
120
+ scale_width = scale_height
121
+ elif self.__resize_method == "upper_bound":
122
+ # scale such that output size is upper bound
123
+ if scale_width < scale_height:
124
+ # fit width
125
+ scale_height = scale_width
126
+ else:
127
+ # fit height
128
+ scale_width = scale_height
129
+ elif self.__resize_method == "minimal":
130
+ # scale as little as possible
131
+ if abs(1 - scale_width) < abs(1 - scale_height):
132
+ # fit width
133
+ scale_height = scale_width
134
+ else:
135
+ # fit height
136
+ scale_width = scale_height
137
+ else:
138
+ raise ValueError(
139
+ f"resize_method {self.__resize_method} not implemented"
140
+ )
141
+
142
+ if self.__resize_method == "lower_bound":
143
+ new_height = self.constrain_to_multiple_of(
144
+ scale_height * height, min_val=self.__height
145
+ )
146
+ new_width = self.constrain_to_multiple_of(
147
+ scale_width * width, min_val=self.__width
148
+ )
149
+ elif self.__resize_method == "upper_bound":
150
+ new_height = self.constrain_to_multiple_of(
151
+ scale_height * height, max_val=self.__height
152
+ )
153
+ new_width = self.constrain_to_multiple_of(
154
+ scale_width * width, max_val=self.__width
155
+ )
156
+ elif self.__resize_method == "minimal":
157
+ new_height = self.constrain_to_multiple_of(scale_height * height)
158
+ new_width = self.constrain_to_multiple_of(scale_width * width)
159
+ else:
160
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
161
+
162
+ return (new_width, new_height)
163
+
164
+ def __call__(self, sample):
165
+ width, height = self.get_size(
166
+ sample["image"].shape[1], sample["image"].shape[0]
167
+ )
168
+
169
+ # resize sample
170
+ sample["image"] = cv2.resize(
171
+ sample["image"],
172
+ (width, height),
173
+ interpolation=self.__image_interpolation_method,
174
+ )
175
+
176
+ if self.__resize_target:
177
+ if "disparity" in sample:
178
+ sample["disparity"] = cv2.resize(
179
+ sample["disparity"],
180
+ (width, height),
181
+ interpolation=cv2.INTER_NEAREST,
182
+ )
183
+
184
+ if "depth" in sample:
185
+ sample["depth"] = cv2.resize(
186
+ sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
187
+ )
188
+
189
+ if "semseg_mask" in sample:
190
+ # sample["semseg_mask"] = cv2.resize(
191
+ # sample["semseg_mask"], (width, height), interpolation=cv2.INTER_NEAREST
192
+ # )
193
+ sample["semseg_mask"] = F.interpolate(torch.from_numpy(sample["semseg_mask"]).float()[None, None, ...], (height, width), mode='nearest').numpy()[0, 0]
194
+
195
+ if "mask" in sample:
196
+ sample["mask"] = cv2.resize(
197
+ sample["mask"].astype(np.float32),
198
+ (width, height),
199
+ interpolation=cv2.INTER_NEAREST,
200
+ )
201
+ # sample["mask"] = sample["mask"].astype(bool)
202
+
203
+ # print(sample['image'].shape, sample['depth'].shape)
204
+ return sample
205
+
206
+
207
+ class NormalizeImage(object):
208
+ """Normalize the image by the given mean and std.
209
+ """
210
+
211
+ def __init__(self, mean, std):
212
+ self.__mean = mean
213
+ self.__std = std
214
+
215
+ def __call__(self, sample):
216
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
217
+
218
+ return sample
219
+
220
+
221
+ class PrepareForNet(object):
222
+ """Prepare sample for usage as network input.
223
+ """
224
+
225
+ def __init__(self):
226
+ pass
227
+
228
+ def __call__(self, sample):
229
+ image = np.transpose(sample["image"], (2, 0, 1))
230
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
231
+
232
+ if "mask" in sample:
233
+ sample["mask"] = sample["mask"].astype(np.float32)
234
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
235
+
236
+ if "depth" in sample:
237
+ depth = sample["depth"].astype(np.float32)
238
+ sample["depth"] = np.ascontiguousarray(depth)
239
+
240
+ if "semseg_mask" in sample:
241
+ sample["semseg_mask"] = sample["semseg_mask"].astype(np.float32)
242
+ sample["semseg_mask"] = np.ascontiguousarray(sample["semseg_mask"])
243
+
244
+ return sample
245
+
246
+
247
+ class Crop(object):
248
+ """Crop sample for batch-wise training. Image is of shape CxHxW
249
+ """
250
+
251
+ def __init__(self, size):
252
+ if isinstance(size, int):
253
+ self.size = (size, size)
254
+ else:
255
+ self.size = size
256
+
257
+ def __call__(self, sample):
258
+ h, w = sample['image'].shape[-2:]
259
+ assert h >= self.size[0] and w >= self.size[1], 'Wrong size'
260
+
261
+ h_start = np.random.randint(0, h - self.size[0] + 1)
262
+ w_start = np.random.randint(0, w - self.size[1] + 1)
263
+ h_end = h_start + self.size[0]
264
+ w_end = w_start + self.size[1]
265
+
266
+ sample['image'] = sample['image'][:, h_start: h_end, w_start: w_end]
267
+
268
+ if "depth" in sample:
269
+ sample["depth"] = sample["depth"][h_start: h_end, w_start: w_end]
270
+
271
+ if "mask" in sample:
272
+ sample["mask"] = sample["mask"][h_start: h_end, w_start: w_end]
273
+
274
+ if "semseg_mask" in sample:
275
+ sample["semseg_mask"] = sample["semseg_mask"][h_start: h_end, w_start: w_end]
276
+
277
+ return sample
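
To make the `lower_bound` resize with `ensure_multiple_of=14` concrete, here is a small check (the 1920x1080 input is just an example):

```python
from dataset.transform import Resize

# 'lower_bound' keeps the aspect ratio, scales so both sides are at least 518,
# and rounds each side to a multiple of 14 (the ViT patch size).
resize = Resize(width=518, height=518, resize_target=False,
                keep_aspect_ratio=True, ensure_multiple_of=14,
                resize_method='lower_bound')
print(resize.get_size(1920, 1080))   # -> (924, 518), i.e. (width, height), both multiples of 14
```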
metric_depth/dataset/vkitti2.py ADDED
@@ -0,0 +1,54 @@
1
+ import cv2
2
+ import torch
3
+ from torch.utils.data import Dataset
4
+ from torchvision.transforms import Compose
5
+
6
+ from dataset.transform import Resize, NormalizeImage, PrepareForNet, Crop
7
+
8
+
9
+ class VKITTI2(Dataset):
10
+ def __init__(self, filelist_path, mode, size=(518, 518)):
11
+
12
+ self.mode = mode
13
+ self.size = size
14
+
15
+ with open(filelist_path, 'r') as f:
16
+ self.filelist = f.read().splitlines()
17
+
18
+ net_w, net_h = size
19
+ self.transform = Compose([
20
+ Resize(
21
+ width=net_w,
22
+ height=net_h,
23
+ resize_target=True if mode == 'train' else False,
24
+ keep_aspect_ratio=True,
25
+ ensure_multiple_of=14,
26
+ resize_method='lower_bound',
27
+ image_interpolation_method=cv2.INTER_CUBIC,
28
+ ),
29
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
30
+ PrepareForNet(),
31
+ ] + ([Crop(size[0])] if self.mode == 'train' else []))
32
+
33
+ def __getitem__(self, item):
34
+ img_path = self.filelist[item].split(' ')[0]
35
+ depth_path = self.filelist[item].split(' ')[1]
36
+
37
+ image = cv2.imread(img_path)
38
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
39
+
40
+ depth = cv2.imread(depth_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH) / 100.0 # cm to m
41
+
42
+ sample = self.transform({'image': image, 'depth': depth})
43
+
44
+ sample['image'] = torch.from_numpy(sample['image'])
45
+ sample['depth'] = torch.from_numpy(sample['depth'])
46
+
47
+ sample['valid_mask'] = (sample['depth'] <= 80)
48
+
49
+ sample['image_path'] = self.filelist[item].split(' ')[0]
50
+
51
+ return sample
52
+
53
+ def __len__(self):
54
+ return len(self.filelist)
metric_depth/depth_anything_v2/dinov2.py ADDED
@@ -0,0 +1,415 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
27
+ if not depth_first and include_root:
28
+ fn(module=module, name=name)
29
+ for child_name, child_module in module.named_children():
30
+ child_name = ".".join((name, child_name)) if name else child_name
31
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
32
+ if depth_first and include_root:
33
+ fn(module=module, name=name)
34
+ return module
35
+
36
+
37
+ class BlockChunk(nn.ModuleList):
38
+ def forward(self, x):
39
+ for b in self:
40
+ x = b(x)
41
+ return x
42
+
43
+
44
+ class DinoVisionTransformer(nn.Module):
45
+ def __init__(
46
+ self,
47
+ img_size=224,
48
+ patch_size=16,
49
+ in_chans=3,
50
+ embed_dim=768,
51
+ depth=12,
52
+ num_heads=12,
53
+ mlp_ratio=4.0,
54
+ qkv_bias=True,
55
+ ffn_bias=True,
56
+ proj_bias=True,
57
+ drop_path_rate=0.0,
58
+ drop_path_uniform=False,
59
+ init_values=None, # for layerscale: None or 0 => no layerscale
60
+ embed_layer=PatchEmbed,
61
+ act_layer=nn.GELU,
62
+ block_fn=Block,
63
+ ffn_layer="mlp",
64
+ block_chunks=1,
65
+ num_register_tokens=0,
66
+ interpolate_antialias=False,
67
+ interpolate_offset=0.1,
68
+ ):
69
+ """
70
+ Args:
71
+ img_size (int, tuple): input image size
72
+ patch_size (int, tuple): patch size
73
+ in_chans (int): number of input channels
74
+ embed_dim (int): embedding dimension
75
+ depth (int): depth of transformer
76
+ num_heads (int): number of attention heads
77
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
78
+ qkv_bias (bool): enable bias for qkv if True
79
+ proj_bias (bool): enable bias for proj in attn if True
80
+ ffn_bias (bool): enable bias for ffn if True
81
+ drop_path_rate (float): stochastic depth rate
82
+ drop_path_uniform (bool): apply uniform drop rate across blocks
83
+ weight_init (str): weight init scheme
84
+ init_values (float): layer-scale init values
85
+ embed_layer (nn.Module): patch embedding layer
86
+ act_layer (nn.Module): MLP activation layer
87
+ block_fn (nn.Module): transformer block class
88
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
89
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
90
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
91
+ interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
92
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
93
+ """
94
+ super().__init__()
95
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
96
+
97
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
98
+ self.num_tokens = 1
99
+ self.n_blocks = depth
100
+ self.num_heads = num_heads
101
+ self.patch_size = patch_size
102
+ self.num_register_tokens = num_register_tokens
103
+ self.interpolate_antialias = interpolate_antialias
104
+ self.interpolate_offset = interpolate_offset
105
+
106
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
107
+ num_patches = self.patch_embed.num_patches
108
+
109
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
110
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
111
+ assert num_register_tokens >= 0
112
+ self.register_tokens = (
113
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
114
+ )
115
+
116
+ if drop_path_uniform is True:
117
+ dpr = [drop_path_rate] * depth
118
+ else:
119
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
120
+
121
+ if ffn_layer == "mlp":
122
+ logger.info("using MLP layer as FFN")
123
+ ffn_layer = Mlp
124
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
125
+ logger.info("using SwiGLU layer as FFN")
126
+ ffn_layer = SwiGLUFFNFused
127
+ elif ffn_layer == "identity":
128
+ logger.info("using Identity layer as FFN")
129
+
130
+ def f(*args, **kwargs):
131
+ return nn.Identity()
132
+
133
+ ffn_layer = f
134
+ else:
135
+ raise NotImplementedError
136
+
137
+ blocks_list = [
138
+ block_fn(
139
+ dim=embed_dim,
140
+ num_heads=num_heads,
141
+ mlp_ratio=mlp_ratio,
142
+ qkv_bias=qkv_bias,
143
+ proj_bias=proj_bias,
144
+ ffn_bias=ffn_bias,
145
+ drop_path=dpr[i],
146
+ norm_layer=norm_layer,
147
+ act_layer=act_layer,
148
+ ffn_layer=ffn_layer,
149
+ init_values=init_values,
150
+ )
151
+ for i in range(depth)
152
+ ]
153
+ if block_chunks > 0:
154
+ self.chunked_blocks = True
155
+ chunked_blocks = []
156
+ chunksize = depth // block_chunks
157
+ for i in range(0, depth, chunksize):
158
+ # this is to keep the block index consistent if we chunk the block list
159
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
160
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
161
+ else:
162
+ self.chunked_blocks = False
163
+ self.blocks = nn.ModuleList(blocks_list)
164
+
165
+ self.norm = norm_layer(embed_dim)
166
+ self.head = nn.Identity()
167
+
168
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
169
+
170
+ self.init_weights()
171
+
172
+ def init_weights(self):
173
+ trunc_normal_(self.pos_embed, std=0.02)
174
+ nn.init.normal_(self.cls_token, std=1e-6)
175
+ if self.register_tokens is not None:
176
+ nn.init.normal_(self.register_tokens, std=1e-6)
177
+ named_apply(init_weights_vit_timm, self)
178
+
179
+ def interpolate_pos_encoding(self, x, w, h):
180
+ previous_dtype = x.dtype
181
+ npatch = x.shape[1] - 1
182
+ N = self.pos_embed.shape[1] - 1
183
+ if npatch == N and w == h:
184
+ return self.pos_embed
185
+ pos_embed = self.pos_embed.float()
186
+ class_pos_embed = pos_embed[:, 0]
187
+ patch_pos_embed = pos_embed[:, 1:]
188
+ dim = x.shape[-1]
189
+ w0 = w // self.patch_size
190
+ h0 = h // self.patch_size
191
+ # we add a small number to avoid floating point error in the interpolation
192
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
193
+ # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
194
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
195
+ # w0, h0 = w0 + 0.1, h0 + 0.1
196
+
197
+ sqrt_N = math.sqrt(N)
198
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
199
+ patch_pos_embed = nn.functional.interpolate(
200
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
201
+ scale_factor=(sx, sy),
202
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
203
+ mode="bicubic",
204
+ antialias=self.interpolate_antialias
205
+ )
206
+
207
+ assert int(w0) == patch_pos_embed.shape[-2]
208
+ assert int(h0) == patch_pos_embed.shape[-1]
209
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
210
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
211
+
212
+ def prepare_tokens_with_masks(self, x, masks=None):
213
+ B, nc, w, h = x.shape
214
+ x = self.patch_embed(x)
215
+ if masks is not None:
216
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
217
+
218
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
219
+ x = x + self.interpolate_pos_encoding(x, w, h)
220
+
221
+ if self.register_tokens is not None:
222
+ x = torch.cat(
223
+ (
224
+ x[:, :1],
225
+ self.register_tokens.expand(x.shape[0], -1, -1),
226
+ x[:, 1:],
227
+ ),
228
+ dim=1,
229
+ )
230
+
231
+ return x
232
+
233
+ def forward_features_list(self, x_list, masks_list):
234
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
235
+ for blk in self.blocks:
236
+ x = blk(x)
237
+
238
+ all_x = x
239
+ output = []
240
+ for x, masks in zip(all_x, masks_list):
241
+ x_norm = self.norm(x)
242
+ output.append(
243
+ {
244
+ "x_norm_clstoken": x_norm[:, 0],
245
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
246
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
247
+ "x_prenorm": x,
248
+ "masks": masks,
249
+ }
250
+ )
251
+ return output
252
+
253
+ def forward_features(self, x, masks=None):
254
+ if isinstance(x, list):
255
+ return self.forward_features_list(x, masks)
256
+
257
+ x = self.prepare_tokens_with_masks(x, masks)
258
+
259
+ for blk in self.blocks:
260
+ x = blk(x)
261
+
262
+ x_norm = self.norm(x)
263
+ return {
264
+ "x_norm_clstoken": x_norm[:, 0],
265
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
266
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
267
+ "x_prenorm": x,
268
+ "masks": masks,
269
+ }
270
+
271
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
272
+ x = self.prepare_tokens_with_masks(x)
273
+ # If n is an int, take the n last blocks. If it's a list, take them
274
+ output, total_block_len = [], len(self.blocks)
275
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
276
+ for i, blk in enumerate(self.blocks):
277
+ x = blk(x)
278
+ if i in blocks_to_take:
279
+ output.append(x)
280
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
281
+ return output
282
+
283
+ def _get_intermediate_layers_chunked(self, x, n=1):
284
+ x = self.prepare_tokens_with_masks(x)
285
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
286
+ # If n is an int, take the n last blocks. If it's a list, take them
287
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
288
+ for block_chunk in self.blocks:
289
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
290
+ x = blk(x)
291
+ if i in blocks_to_take:
292
+ output.append(x)
293
+ i += 1
294
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
295
+ return output
296
+
297
+ def get_intermediate_layers(
298
+ self,
299
+ x: torch.Tensor,
300
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
301
+ reshape: bool = False,
302
+ return_class_token: bool = False,
303
+ norm=True
304
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
305
+ if self.chunked_blocks:
306
+ outputs = self._get_intermediate_layers_chunked(x, n)
307
+ else:
308
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
309
+ if norm:
310
+ outputs = [self.norm(out) for out in outputs]
311
+ class_tokens = [out[:, 0] for out in outputs]
312
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
313
+ if reshape:
314
+ B, _, w, h = x.shape
315
+ outputs = [
316
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
317
+ for out in outputs
318
+ ]
319
+ if return_class_token:
320
+ return tuple(zip(outputs, class_tokens))
321
+ return tuple(outputs)
322
+
323
+ def forward(self, *args, is_training=False, **kwargs):
324
+ ret = self.forward_features(*args, **kwargs)
325
+ if is_training:
326
+ return ret
327
+ else:
328
+ return self.head(ret["x_norm_clstoken"])
329
+
330
+
331
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
332
+ """ViT weight initialization, original timm impl (for reproducibility)"""
333
+ if isinstance(module, nn.Linear):
334
+ trunc_normal_(module.weight, std=0.02)
335
+ if module.bias is not None:
336
+ nn.init.zeros_(module.bias)
337
+
338
+
339
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
340
+ model = DinoVisionTransformer(
341
+ patch_size=patch_size,
342
+ embed_dim=384,
343
+ depth=12,
344
+ num_heads=6,
345
+ mlp_ratio=4,
346
+ block_fn=partial(Block, attn_class=MemEffAttention),
347
+ num_register_tokens=num_register_tokens,
348
+ **kwargs,
349
+ )
350
+ return model
351
+
352
+
353
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
354
+ model = DinoVisionTransformer(
355
+ patch_size=patch_size,
356
+ embed_dim=768,
357
+ depth=12,
358
+ num_heads=12,
359
+ mlp_ratio=4,
360
+ block_fn=partial(Block, attn_class=MemEffAttention),
361
+ num_register_tokens=num_register_tokens,
362
+ **kwargs,
363
+ )
364
+ return model
365
+
366
+
367
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
368
+ model = DinoVisionTransformer(
369
+ patch_size=patch_size,
370
+ embed_dim=1024,
371
+ depth=24,
372
+ num_heads=16,
373
+ mlp_ratio=4,
374
+ block_fn=partial(Block, attn_class=MemEffAttention),
375
+ num_register_tokens=num_register_tokens,
376
+ **kwargs,
377
+ )
378
+ return model
379
+
380
+
381
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
382
+ """
383
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
384
+ """
385
+ model = DinoVisionTransformer(
386
+ patch_size=patch_size,
387
+ embed_dim=1536,
388
+ depth=40,
389
+ num_heads=24,
390
+ mlp_ratio=4,
391
+ block_fn=partial(Block, attn_class=MemEffAttention),
392
+ num_register_tokens=num_register_tokens,
393
+ **kwargs,
394
+ )
395
+ return model
396
+
397
+
398
+ def DINOv2(model_name):
399
+ model_zoo = {
400
+ "vits": vit_small,
401
+ "vitb": vit_base,
402
+ "vitl": vit_large,
403
+ "vitg": vit_giant2
404
+ }
405
+
406
+ return model_zoo[model_name](
407
+ img_size=518,
408
+ patch_size=14,
409
+ init_values=1.0,
410
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
411
+ block_chunks=0,
412
+ num_register_tokens=0,
413
+ interpolate_antialias=False,
414
+ interpolate_offset=0.1
415
+ )
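
`get_intermediate_layers` with `reshape=True` is what a DPT-style decoder consumes; a quick sketch with randomly initialised weights (the layer indices below are chosen for illustration, not taken from this repository's configuration):

```python
import torch
from depth_anything_v2.dinov2 import DINOv2

encoder = DINOv2('vits')                 # ViT-S/14, embed_dim=384, built for 518x518 inputs
x = torch.randn(1, 3, 518, 518)
with torch.no_grad():
    features = encoder.get_intermediate_layers(
        x, n=[2, 5, 8, 11], reshape=True, return_class_token=True
    )
for fmap, cls_token in features:
    # 518 / 14 = 37 patches per side
    print(fmap.shape, cls_token.shape)   # torch.Size([1, 384, 37, 37]) torch.Size([1, 384])
```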
metric_depth/depth_anything_v2/dinov2_layers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
metric_depth/depth_anything_v2/dinov2_layers/attention.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+
13
+ from torch import Tensor
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
+ try:
21
+ from xformers.ops import memory_efficient_attention, unbind, fmha
22
+
23
+ XFORMERS_AVAILABLE = True
24
+ except ImportError:
25
+ logger.warning("xFormers not available")
26
+ XFORMERS_AVAILABLE = False
27
+
28
+
29
+ class Attention(nn.Module):
30
+ def __init__(
31
+ self,
32
+ dim: int,
33
+ num_heads: int = 8,
34
+ qkv_bias: bool = False,
35
+ proj_bias: bool = True,
36
+ attn_drop: float = 0.0,
37
+ proj_drop: float = 0.0,
38
+ ) -> None:
39
+ super().__init__()
40
+ self.num_heads = num_heads
41
+ head_dim = dim // num_heads
42
+ self.scale = head_dim**-0.5
43
+
44
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
45
+ self.attn_drop = nn.Dropout(attn_drop)
46
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
47
+ self.proj_drop = nn.Dropout(proj_drop)
48
+
49
+ def forward(self, x: Tensor) -> Tensor:
50
+ B, N, C = x.shape
51
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
52
+
53
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
54
+ attn = q @ k.transpose(-2, -1)
55
+
56
+ attn = attn.softmax(dim=-1)
57
+ attn = self.attn_drop(attn)
58
+
59
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
60
+ x = self.proj(x)
61
+ x = self.proj_drop(x)
62
+ return x
63
+
64
+
65
+ class MemEffAttention(Attention):
66
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
67
+ if not XFORMERS_AVAILABLE:
68
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
69
+ return super().forward(x)
70
+
71
+ B, N, C = x.shape
72
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
73
+
74
+ q, k, v = unbind(qkv, 2)
75
+
76
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
77
+ x = x.reshape([B, N, C])
78
+
79
+ x = self.proj(x)
80
+ x = self.proj_drop(x)
81
+ return x
82
+
83
+
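
A quick shape sanity check for the vanilla path; `MemEffAttention` falls back to exactly this computation when xFormers is not installed (and `attn_bias` must then be `None`):

```python
import torch
from depth_anything_v2.dinov2_layers.attention import Attention

attn = Attention(dim=64, num_heads=4, qkv_bias=True).eval()
x = torch.randn(2, 10, 64)                # (batch, tokens, channels)
with torch.no_grad():
    y = attn(x)
print(y.shape)                            # torch.Size([2, 10, 64]) -- shape is preserved
```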
metric_depth/depth_anything_v2/dinov2_layers/block.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ import logging
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+
14
+ import torch
15
+ from torch import nn, Tensor
16
+
17
+ from .attention import Attention, MemEffAttention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ try:
27
+ from xformers.ops import fmha
28
+ from xformers.ops import scaled_index_add, index_select_cat
29
+
30
+ XFORMERS_AVAILABLE = True
31
+ except ImportError:
32
+ logger.warning("xFormers not available")
33
+ XFORMERS_AVAILABLE = False
34
+
35
+
36
+ class Block(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int,
41
+ mlp_ratio: float = 4.0,
42
+ qkv_bias: bool = False,
43
+ proj_bias: bool = True,
44
+ ffn_bias: bool = True,
45
+ drop: float = 0.0,
46
+ attn_drop: float = 0.0,
47
+ init_values=None,
48
+ drop_path: float = 0.0,
49
+ act_layer: Callable[..., nn.Module] = nn.GELU,
50
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
51
+ attn_class: Callable[..., nn.Module] = Attention,
52
+ ffn_layer: Callable[..., nn.Module] = Mlp,
53
+ ) -> None:
54
+ super().__init__()
55
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
56
+ self.norm1 = norm_layer(dim)
57
+ self.attn = attn_class(
58
+ dim,
59
+ num_heads=num_heads,
60
+ qkv_bias=qkv_bias,
61
+ proj_bias=proj_bias,
62
+ attn_drop=attn_drop,
63
+ proj_drop=drop,
64
+ )
65
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
66
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
67
+
68
+ self.norm2 = norm_layer(dim)
69
+ mlp_hidden_dim = int(dim * mlp_ratio)
70
+ self.mlp = ffn_layer(
71
+ in_features=dim,
72
+ hidden_features=mlp_hidden_dim,
73
+ act_layer=act_layer,
74
+ drop=drop,
75
+ bias=ffn_bias,
76
+ )
77
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
78
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
79
+
80
+ self.sample_drop_ratio = drop_path
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ def attn_residual_func(x: Tensor) -> Tensor:
84
+ return self.ls1(self.attn(self.norm1(x)))
85
+
86
+ def ffn_residual_func(x: Tensor) -> Tensor:
87
+ return self.ls2(self.mlp(self.norm2(x)))
88
+
89
+ if self.training and self.sample_drop_ratio > 0.1:
90
+ # the overhead is compensated only for a drop path rate larger than 0.1
91
+ x = drop_add_residual_stochastic_depth(
92
+ x,
93
+ residual_func=attn_residual_func,
94
+ sample_drop_ratio=self.sample_drop_ratio,
95
+ )
96
+ x = drop_add_residual_stochastic_depth(
97
+ x,
98
+ residual_func=ffn_residual_func,
99
+ sample_drop_ratio=self.sample_drop_ratio,
100
+ )
101
+ elif self.training and self.sample_drop_ratio > 0.0:
102
+ x = x + self.drop_path1(attn_residual_func(x))
103
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
104
+ else:
105
+ x = x + attn_residual_func(x)
106
+ x = x + ffn_residual_func(x)
107
+ return x
108
+
109
+
110
+ def drop_add_residual_stochastic_depth(
111
+ x: Tensor,
112
+ residual_func: Callable[[Tensor], Tensor],
113
+ sample_drop_ratio: float = 0.0,
114
+ ) -> Tensor:
115
+ # 1) extract subset using permutation
116
+ b, n, d = x.shape
117
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
118
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
119
+ x_subset = x[brange]
120
+
121
+ # 2) apply residual_func to get residual
122
+ residual = residual_func(x_subset)
123
+
124
+ x_flat = x.flatten(1)
125
+ residual = residual.flatten(1)
126
+
127
+ residual_scale_factor = b / sample_subset_size
128
+
129
+ # 3) add the residual
130
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
131
+ return x_plus_residual.view_as(x)
132
+
133
+
134
+ def get_branges_scales(x, sample_drop_ratio=0.0):
135
+ b, n, d = x.shape
136
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
137
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
138
+ residual_scale_factor = b / sample_subset_size
139
+ return brange, residual_scale_factor
140
+
141
+
142
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
143
+ if scaling_vector is None:
144
+ x_flat = x.flatten(1)
145
+ residual = residual.flatten(1)
146
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
147
+ else:
148
+ x_plus_residual = scaled_index_add(
149
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
150
+ )
151
+ return x_plus_residual
152
+
153
+
154
+ attn_bias_cache: Dict[Tuple, Any] = {}
155
+
156
+
157
+ def get_attn_bias_and_cat(x_list, branges=None):
158
+ """
159
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
160
+ """
161
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
162
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
163
+ if all_shapes not in attn_bias_cache.keys():
164
+ seqlens = []
165
+ for b, x in zip(batch_sizes, x_list):
166
+ for _ in range(b):
167
+ seqlens.append(x.shape[1])
168
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
169
+ attn_bias._batch_sizes = batch_sizes
170
+ attn_bias_cache[all_shapes] = attn_bias
171
+
172
+ if branges is not None:
173
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
174
+ else:
175
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
176
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
177
+
178
+ return attn_bias_cache[all_shapes], cat_tensors
179
+
180
+
181
+ def drop_add_residual_stochastic_depth_list(
182
+ x_list: List[Tensor],
183
+ residual_func: Callable[[Tensor, Any], Tensor],
184
+ sample_drop_ratio: float = 0.0,
185
+ scaling_vector=None,
186
+ ) -> Tensor:
187
+ # 1) generate random set of indices for dropping samples in the batch
188
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
189
+ branges = [s[0] for s in branges_scales]
190
+ residual_scale_factors = [s[1] for s in branges_scales]
191
+
192
+ # 2) get attention bias and index+concat the tensors
193
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
194
+
195
+ # 3) apply residual_func to get residual, and split the result
196
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
197
+
198
+ outputs = []
199
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
200
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
201
+ return outputs
202
+
203
+
204
+ class NestedTensorBlock(Block):
205
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
206
+ """
207
+ x_list contains a list of tensors to nest together and run
208
+ """
209
+ assert isinstance(self.attn, MemEffAttention)
210
+
211
+ if self.training and self.sample_drop_ratio > 0.0:
212
+
213
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
214
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
215
+
216
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
217
+ return self.mlp(self.norm2(x))
218
+
219
+ x_list = drop_add_residual_stochastic_depth_list(
220
+ x_list,
221
+ residual_func=attn_residual_func,
222
+ sample_drop_ratio=self.sample_drop_ratio,
223
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
224
+ )
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=ffn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
230
+ )
231
+ return x_list
232
+ else:
233
+
234
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
235
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
236
+
237
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
238
+ return self.ls2(self.mlp(self.norm2(x)))
239
+
240
+ attn_bias, x = get_attn_bias_and_cat(x_list)
241
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
242
+ x = x + ffn_residual_func(x)
243
+ return attn_bias.split(x)
244
+
245
+ def forward(self, x_or_x_list):
246
+ if isinstance(x_or_x_list, Tensor):
247
+ return super().forward(x_or_x_list)
248
+ elif isinstance(x_or_x_list, list):
249
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
250
+ return self.forward_nested(x_or_x_list)
251
+ else:
252
+ raise AssertionError