Spaces:
Build error
Build error
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch | |
import warp as wp | |
import numpy as np | |
# Initialize Warp with CUDA | |
wp.init() | |
@wp.kernel
def ray_triangle_intersection_kernel(
    ray_origins: wp.array2d(dtype=wp.float32),  # (H*W, 3)
    ray_directions: wp.array2d(dtype=wp.float32),  # (H*W, 3)
    vertices: wp.array2d(dtype=wp.float32),  # (N, 3)
    faces: wp.array2d(dtype=wp.int32),  # (M, 3)
    depth_map: wp.array(dtype=wp.float32),  # (H*W,)
    num_triangles: wp.int32,
    epsilon: wp.float32,
):
    """
    Warp kernel: nearest ray-triangle hit per ray (Möller–Trumbore algorithm).

    Each thread processes one ray (selected by ``wp.tid()``) against the first
    ``num_triangles`` rows of ``faces``. Triangles are hit from either side;
    there is no back-face culling.

    Args:
        ray_origins: one ray origin (x, y, z) per row.
        ray_directions: one ray direction (x, y, z) per row.
        vertices: mesh vertex positions, one (x, y, z) per row.
        faces: three vertex indices (rows of ``vertices``) per triangle.
        depth_map: output, one value per ray. Receives the smallest ray
            parameter ``t`` of any hit, or 0.0 when the ray hits nothing.
            ``t`` is a distance only if the direction has unit length.
        num_triangles: number of rows of ``faces`` to test.
        epsilon: threshold below which the determinant is treated as zero
            (ray parallel to the triangle); also the minimum accepted ``t``.
    """
    # Get thread index (ray index)
    ray_idx = wp.tid()

    # Get ray origin and direction
    ray_origin = wp.vec3(
        ray_origins[ray_idx, 0],
        ray_origins[ray_idx, 1],
        ray_origins[ray_idx, 2]
    )
    ray_dir = wp.vec3(
        ray_directions[ray_idx, 0],
        ray_directions[ray_idx, 1],
        ray_directions[ray_idx, 2]
    )

    # Initialize minimum distance; 1e10 doubles as the "no hit" marker below
    min_t = wp.float32(1e10)

    # Iterate through all triangles
    for tri_idx in range(num_triangles):
        # Get triangle vertex indices
        i0 = faces[tri_idx, 0]
        i1 = faces[tri_idx, 1]
        i2 = faces[tri_idx, 2]

        # Get triangle vertices
        v0 = wp.vec3(vertices[i0, 0], vertices[i0, 1], vertices[i0, 2])
        v1 = wp.vec3(vertices[i1, 0], vertices[i1, 1], vertices[i1, 2])
        v2 = wp.vec3(vertices[i2, 0], vertices[i2, 1], vertices[i2, 2])

        # Compute edges sharing v0
        edge1 = v1 - v0
        edge2 = v2 - v0

        # Möller–Trumbore algorithm: a is the determinant of the system
        h = wp.cross(ray_dir, edge2)
        a = wp.dot(edge1, h)

        # Check if ray is parallel to triangle (either sign of a is accepted)
        if wp.abs(a) < epsilon:
            continue

        f = 1.0 / a
        s = ray_origin - v0
        u = f * wp.dot(s, h)

        # Check if intersection is within triangle (u >= 0 and u <= 1)
        if u < 0.0 or u > 1.0:
            continue

        q = wp.cross(s, edge1)
        v = f * wp.dot(ray_dir, q)

        # Check if intersection is within triangle (v >= 0 and u + v <= 1)
        if v < 0.0 or (u + v) > 1.0:
            continue

        # Compute t (parameter along the ray)
        t = f * wp.dot(edge2, q)

        # Only consider intersections in front of the origin (t > epsilon)
        if t > epsilon and t < min_t:
            min_t = t

    # Write result: nearest hit, or 0.0 when nothing was hit
    if min_t < 1e10:
        depth_map[ray_idx] = min_t
    else:
        depth_map[ray_idx] = 0.0
@wp.kernel
def ray_triangle_intersection_tiled_kernel(
    ray_origins: wp.array2d(dtype=wp.float32),  # (H*W, 3)
    ray_directions: wp.array2d(dtype=wp.float32),  # (H*W, 3)
    vertices: wp.array2d(dtype=wp.float32),  # (N, 3)
    faces: wp.array2d(dtype=wp.int32),  # (M, 3)
    depth_map: wp.array(dtype=wp.float32),  # (H*W,)
    tri_start: wp.int32,  # Start triangle index for this tile
    tri_end: wp.int32,  # End triangle index for this tile (exclusive)
    epsilon: wp.float32,
):
    """
    Tiled version of the ray-triangle intersection kernel (Möller–Trumbore).

    Each thread processes one ray (selected by ``wp.tid()``) against the
    triangles ``faces[tri_start:tri_end]`` and lowers ``depth_map[ray]`` to the
    nearest hit found, so repeated launches over consecutive tiles accumulate
    the nearest hit over the whole mesh.

    Args:
        ray_origins: one ray origin (x, y, z) per row.
        ray_directions: one ray direction (x, y, z) per row.
        vertices: mesh vertex positions, one (x, y, z) per row.
        faces: three vertex indices (rows of ``vertices``) per triangle.
        depth_map: in/out running minimum of the ray parameter ``t`` per ray.
            It is left untouched for rays with no hit in this tile.
        tri_start: first triangle index of the tile.
        tri_end: one past the last triangle index of the tile.
        epsilon: threshold below which the determinant is treated as zero
            (ray parallel to the triangle); also the minimum accepted ``t``.

    NOTE: a stored 0.0 is read as "no hit yet", but ``wp.atomic_min`` can never
    replace a stored 0.0 with a positive hit distance. The buffer therefore has
    to be pre-filled with a large value such as +inf before the first tile.
    """
    # Get thread index (ray index)
    ray_idx = wp.tid()

    # Get ray origin and direction
    ray_origin = wp.vec3(
        ray_origins[ray_idx, 0],
        ray_origins[ray_idx, 1],
        ray_origins[ray_idx, 2]
    )
    ray_dir = wp.vec3(
        ray_directions[ray_idx, 0],
        ray_directions[ray_idx, 1],
        ray_directions[ray_idx, 2]
    )

    # Get current minimum distance; 0.0 is treated as "no hit recorded"
    min_t = depth_map[ray_idx]
    if min_t == 0.0:
        min_t = wp.float32(1e10)

    # Process triangles in this tile
    for tri_idx in range(tri_start, tri_end):
        # Get triangle vertex indices
        i0 = faces[tri_idx, 0]
        i1 = faces[tri_idx, 1]
        i2 = faces[tri_idx, 2]

        # Get triangle vertices
        v0 = wp.vec3(vertices[i0, 0], vertices[i0, 1], vertices[i0, 2])
        v1 = wp.vec3(vertices[i1, 0], vertices[i1, 1], vertices[i1, 2])
        v2 = wp.vec3(vertices[i2, 0], vertices[i2, 1], vertices[i2, 2])

        # Compute edges sharing v0
        edge1 = v1 - v0
        edge2 = v2 - v0

        # Möller–Trumbore algorithm: a is the determinant of the system
        h = wp.cross(ray_dir, edge2)
        a = wp.dot(edge1, h)

        # Check if ray is parallel to triangle (either sign of a is accepted)
        if wp.abs(a) < epsilon:
            continue

        f = 1.0 / a
        s = ray_origin - v0
        u = f * wp.dot(s, h)

        # Check if intersection is within triangle (u >= 0 and u <= 1)
        if u < 0.0 or u > 1.0:
            continue

        q = wp.cross(s, edge1)
        v = f * wp.dot(ray_dir, q)

        # Check if intersection is within triangle (v >= 0 and u + v <= 1)
        if v < 0.0 or (u + v) > 1.0:
            continue

        # Compute t (parameter along the ray)
        t = f * wp.dot(edge2, q)

        # Only consider intersections in front of the origin (t > epsilon)
        if t > epsilon and t < min_t:
            min_t = t

    # Write result using atomic min so the stored value can only decrease.
    # Within one launch each ray belongs to a single thread; the atomic is
    # presumably a safeguard in case launches for different tiles overlap.
    if min_t < 1e10:
        wp.atomic_min(depth_map, ray_idx, min_t)
def ray_triangle_intersection_warp(
    ray_origins: torch.Tensor,  # (H, W, 3)
    ray_directions: torch.Tensor,  # (H, W, 3)
    vertices: torch.Tensor,  # (N, 3)
    faces: torch.Tensor,  # (M, 3)
    device: torch.device,
) -> torch.Tensor:
    """
    Compute a depth map from ray-triangle intersections using NVIDIA Warp.

    Meshes with fewer than 10000 triangles are handled by a single launch of
    ``ray_triangle_intersection_kernel``. Larger meshes are processed in tiles
    of 10000 triangles with ``ray_triangle_intersection_tiled_kernel``, which
    accumulates the nearest hit per ray across launches.

    Args:
        ray_origins: (H, W, 3) ray origins
        ray_directions: (H, W, 3) ray directions (should be normalized so the
            returned values are distances)
        vertices: (N, 3) mesh vertices
        faces: (M, 3) triangle face indices into ``vertices``
        device: torch device that holds the output and runs the kernels
            (intended to be a CUDA device). Inputs are moved/cast to this
            device as float32 / int32 if they are not already.

    Returns:
        depth_map: (H, W) float32 tensor on ``device`` with the nearest hit
        per ray, 0 where no intersection
    """
    device = torch.device(device)
    H, W = ray_origins.shape[:2]
    num_rays = H * W
    num_triangles = faces.shape[0]

    # Nothing to intersect: every ray misses, so skip Warp entirely.
    if num_rays == 0 or num_triangles == 0:
        return torch.zeros((H, W), device=device, dtype=torch.float32)

    epsilon = 1e-8  # parallel-ray / minimum-t threshold passed to the kernels
    triangle_tile_size = 10000  # tile size and small/large mesh threshold
    use_tiles = num_triangles >= triangle_tile_size

    # Flatten rays to (H*W, 3) and make every input match what the kernels
    # declare: float32 / int32, contiguous, on the target device. These are
    # no-ops for inputs that already satisfy the requirements. The tensors are
    # kept in locals so they outlive the Warp views created below.
    origins = ray_origins.reshape(-1, 3).to(device=device, dtype=torch.float32).contiguous()
    directions = ray_directions.reshape(-1, 3).to(device=device, dtype=torch.float32).contiguous()
    verts = vertices.to(device=device, dtype=torch.float32).contiguous()
    tris = faces.to(device=device, dtype=torch.int32).contiguous()

    # Output buffer. The tiled kernel keeps a running minimum, so it starts at
    # +inf; the simple kernel overwrites every entry, so zeros are enough.
    depth_map_flat = torch.full(
        (num_rays,),
        float("inf") if use_tiles else 0.0,
        device=device,
        dtype=torch.float32,
    )

    # Make sure the PyTorch-side preparation above has finished before Warp
    # touches the memory: Warp launches are not guaranteed to be ordered after
    # work queued on PyTorch's current stream.
    if device.type == "cuda":
        torch.cuda.synchronize(device)

    # Convert PyTorch tensors to Warp arrays (as float arrays, not vec3)
    wp_ray_origins = wp.from_torch(origins, dtype=wp.float32)
    wp_ray_directions = wp.from_torch(directions, dtype=wp.float32)
    wp_vertices = wp.from_torch(verts, dtype=wp.float32)
    wp_faces = wp.from_torch(tris, dtype=wp.int32)
    wp_depth_map = wp.from_torch(depth_map_flat, dtype=wp.float32)

    # Launch where the data actually lives instead of guessing "cuda:0" when
    # the torch device carries no explicit index.
    launch_device = wp_depth_map.device

    if not use_tiles:
        # For smaller meshes, use the simple kernel
        wp.launch(
            kernel=ray_triangle_intersection_kernel,
            dim=num_rays,
            inputs=[
                wp_ray_origins,
                wp_ray_directions,
                wp_vertices,
                wp_faces,
                wp_depth_map,
                num_triangles,
                epsilon,
            ],
            device=launch_device,
        )
    else:
        # For larger meshes, process triangles in tiles
        for tri_start in range(0, num_triangles, triangle_tile_size):
            tri_end = min(tri_start + triangle_tile_size, num_triangles)
            wp.launch(
                kernel=ray_triangle_intersection_tiled_kernel,
                dim=num_rays,
                inputs=[
                    wp_ray_origins,
                    wp_ray_directions,
                    wp_vertices,
                    wp_faces,
                    wp_depth_map,
                    tri_start,
                    tri_end,
                    epsilon,
                ],
                device=launch_device,
            )

    # Synchronize BEFORE reading the result from PyTorch, so the
    # post-processing below never sees a partially written depth map.
    wp.synchronize()

    if use_tiles:
        # Convert infinity (ray hit nothing in any tile) back to 0
        depth_map_flat[depth_map_flat == float("inf")] = 0.0

    # Reshape back to 2D
    return depth_map_flat.reshape(H, W)