# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import warp as wp
import numpy as np

# Initialize Warp with CUDA
wp.init()

@wp.kernel
def ray_triangle_intersection_kernel(
    ray_origins: wp.array2d(dtype=wp.float32),      # (H*W, 3)
    ray_directions: wp.array2d(dtype=wp.float32),   # (H*W, 3)
    vertices: wp.array2d(dtype=wp.float32),         # (N, 3)
    faces: wp.array2d(dtype=wp.int32),              # (M, 3)
    depth_map: wp.array(dtype=wp.float32),          # (H*W,)
    num_triangles: wp.int32,
    epsilon: wp.float32
):
    """
    Warp kernel for ray-triangle intersection using Möller–Trumbore algorithm.
    Each thread processes one ray against all triangles.
    """
    # Get thread index (ray index)
    ray_idx = wp.tid()
    
    # Get ray origin and direction
    ray_origin = wp.vec3(
        ray_origins[ray_idx, 0],
        ray_origins[ray_idx, 1],
        ray_origins[ray_idx, 2]
    )
    ray_dir = wp.vec3(
        ray_directions[ray_idx, 0],
        ray_directions[ray_idx, 1],
        ray_directions[ray_idx, 2]
    )
    
    # Initialize minimum distance
    min_t = wp.float32(1e10)
    
    # Iterate through all triangles
    for tri_idx in range(num_triangles):
        # Get triangle vertex indices
        i0 = faces[tri_idx, 0]
        i1 = faces[tri_idx, 1]
        i2 = faces[tri_idx, 2]
        
        # Get triangle vertices
        v0 = wp.vec3(vertices[i0, 0], vertices[i0, 1], vertices[i0, 2])
        v1 = wp.vec3(vertices[i1, 0], vertices[i1, 1], vertices[i1, 2])
        v2 = wp.vec3(vertices[i2, 0], vertices[i2, 1], vertices[i2, 2])
        
        # Compute edges
        edge1 = v1 - v0
        edge2 = v2 - v0
        
        # Möller–Trumbore algorithm
        h = wp.cross(ray_dir, edge2)
        a = wp.dot(edge1, h)
        
        # Check if ray is parallel to triangle
        if wp.abs(a) < epsilon:
            continue
            
        f = 1.0 / a
        s = ray_origin - v0
        u = f * wp.dot(s, h)
        
        # Check if intersection is within triangle (u >= 0 and u <= 1)
        if u < 0.0 or u > 1.0:
            continue
            
        q = wp.cross(s, edge1)
        v = f * wp.dot(ray_dir, q)
        
        # Check if intersection is within triangle (v >= 0 and u + v <= 1)
        if v < 0.0 or (u + v) > 1.0:
            continue
            
        # Compute t (distance along ray)
        t = f * wp.dot(edge2, q)
        
        # Only consider intersections in front of camera (t > 0)
        if t > epsilon and t < min_t:
            min_t = t
    
    # Write result
    if min_t < 1e10:
        depth_map[ray_idx] = min_t
    else:
        depth_map[ray_idx] = 0.0


@wp.kernel
def ray_triangle_intersection_tiled_kernel(
    ray_origins: wp.array2d(dtype=wp.float32),      # (H*W, 3)
    ray_directions: wp.array2d(dtype=wp.float32),   # (H*W, 3)
    vertices: wp.array2d(dtype=wp.float32),         # (N, 3)
    faces: wp.array2d(dtype=wp.int32),              # (M, 3)
    depth_map: wp.array(dtype=wp.float32),          # (H*W,)
    tri_start: wp.int32,                            # Start triangle index for this tile
    tri_end: wp.int32,                              # End triangle index for this tile
    epsilon: wp.float32
):
    """
    Tiled version of ray-triangle intersection kernel.
    Processes a subset of triangles to improve memory access patterns.
    """
    # Get thread index (ray index)
    ray_idx = wp.tid()
    
    # Get ray origin and direction
    ray_origin = wp.vec3(
        ray_origins[ray_idx, 0],
        ray_origins[ray_idx, 1],
        ray_origins[ray_idx, 2]
    )
    ray_dir = wp.vec3(
        ray_directions[ray_idx, 0],
        ray_directions[ray_idx, 1],
        ray_directions[ray_idx, 2]
    )
    
    # Get current minimum distance
    min_t = depth_map[ray_idx]
    if min_t == 0.0:
        min_t = wp.float32(1e10)
    
    # Process triangles in this tile
    for tri_idx in range(tri_start, tri_end):
        # Get triangle vertex indices
        i0 = faces[tri_idx, 0]
        i1 = faces[tri_idx, 1]
        i2 = faces[tri_idx, 2]
        
        # Get triangle vertices
        v0 = wp.vec3(vertices[i0, 0], vertices[i0, 1], vertices[i0, 2])
        v1 = wp.vec3(vertices[i1, 0], vertices[i1, 1], vertices[i1, 2])
        v2 = wp.vec3(vertices[i2, 0], vertices[i2, 1], vertices[i2, 2])
        
        # Compute edges
        edge1 = v1 - v0
        edge2 = v2 - v0
        
        # Möller–Trumbore algorithm
        h = wp.cross(ray_dir, edge2)
        a = wp.dot(edge1, h)
        
        # Check if ray is parallel to triangle
        if wp.abs(a) < epsilon:
            continue
            
        f = 1.0 / a
        s = ray_origin - v0
        u = f * wp.dot(s, h)
        
        # Check if intersection is within triangle (u >= 0 and u <= 1)
        if u < 0.0 or u > 1.0:
            continue
            
        q = wp.cross(s, edge1)
        v = f * wp.dot(ray_dir, q)
        
        # Check if intersection is within triangle (v >= 0 and u + v <= 1)
        if v < 0.0 or (u + v) > 1.0:
            continue
            
        # Compute t (distance along ray)
        t = f * wp.dot(edge2, q)
        
        # Only consider intersections in front of camera (t > 0)
        if t > epsilon and t < min_t:
            min_t = t
    
    # Write result using atomic min to handle concurrent updates
    if min_t < 1e10:
        wp.atomic_min(depth_map, ray_idx, min_t)


def ray_triangle_intersection_warp(
    ray_origins: torch.Tensor,      # (H, W, 3)
    ray_directions: torch.Tensor,   # (H, W, 3)
    vertices: torch.Tensor,         # (N, 3)
    faces: torch.Tensor,            # (M, 3)
    device: torch.device
) -> torch.Tensor:
    """
    Compute ray-triangle intersections using NVIDIA Warp for maximum GPU acceleration.
    
    This implementation uses Warp kernels to achieve the best possible performance
    on NVIDIA GPUs by:
    1. Using native CUDA kernels through Warp
    2. Tiling triangles for better memory access patterns
    3. Using atomic operations for concurrent updates
    4. Minimizing memory transfers
    
    Args:
        ray_origins: (H, W, 3) ray origins in camera space
        ray_directions: (H, W, 3) ray directions (should be normalized)
        vertices: (N, 3) mesh vertices
        faces: (M, 3) triangle face indices
        device: torch device (must be CUDA)
    
    Returns:
        depth_map: (H, W) depth values, 0 where no intersection
    """
    H, W = ray_origins.shape[:2]
    num_rays = H * W
    num_triangles = faces.shape[0]
    
    # Reshape rays to 2D arrays
    ray_origins_flat = ray_origins.reshape(-1, 3).contiguous()
    ray_directions_flat = ray_directions.reshape(-1, 3).contiguous()
    
    # Convert PyTorch tensors to Warp arrays (as float arrays, not vec3)
    wp_ray_origins = wp.from_torch(ray_origins_flat, dtype=wp.float32)
    wp_ray_directions = wp.from_torch(ray_directions_flat, dtype=wp.float32)
    wp_vertices = wp.from_torch(vertices.contiguous(), dtype=wp.float32)
    wp_faces = wp.from_torch(faces.int().contiguous(), dtype=wp.int32)
    
    # Create output depth map
    depth_map_flat = torch.zeros(num_rays, device=device, dtype=torch.float32)
    wp_depth_map = wp.from_torch(depth_map_flat, dtype=wp.float32)
    
    # Choose implementation based on problem size
    if num_triangles < 10000:
        # For smaller meshes, use simple kernel
        wp.launch(
            kernel=ray_triangle_intersection_kernel,
            dim=num_rays,
            inputs=[
                wp_ray_origins,
                wp_ray_directions,
                wp_vertices,
                wp_faces,
                wp_depth_map,
                num_triangles,
                1e-8  # epsilon
            ],
            device=f"cuda:{device.index}" if device.index is not None else "cuda:0"
        )
    else:
        # For larger meshes, use tiled approach for better memory access
        triangle_tile_size = 10000  # Process triangles in tiles
        
        # Initialize depth map to infinity
        depth_map_flat.fill_(float('inf'))
        
        # Process triangles in tiles
        for tri_start in range(0, num_triangles, triangle_tile_size):
            tri_end = min(tri_start + triangle_tile_size, num_triangles)
            
            wp.launch(
                kernel=ray_triangle_intersection_tiled_kernel,
                dim=num_rays,
                inputs=[
                    wp_ray_origins,
                    wp_ray_directions,
                    wp_vertices,
                    wp_faces,
                    wp_depth_map,
                    tri_start,
                    tri_end,
                    1e-8  # epsilon
                ],
                device=f"cuda:{device.index}" if device.index is not None else "cuda:0"
            )
        
        # Convert infinity back to 0
        depth_map_flat[depth_map_flat == float('inf')] = 0.0
    
    # Synchronize to ensure kernel completion
    wp.synchronize()
    
    # Reshape back to 2D
    depth_map = depth_map_flat.reshape(H, W)
    
    return depth_map