import os
import tempfile

import gdown
import gradio as gr
import matplotlib
import numpy as np
import open3d as o3d
import plotly.graph_objects as go
import torch
from gradio_imageslider import ImageSlider
from PIL import Image

from depth_anything_v2.dpt import DepthAnythingV2

# Define the checkpoint path and Google Drive file ID
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
model_file = os.path.join(checkpoint_dir, "depth_anything_v2_vitl.pth")
gdrive_url = "https://drive.google.com/uc?id=141Mhq2jonkUBcVBnNqNSeyIZYtH5l4K5"

# Download the checkpoint if it is not already present
if not os.path.exists(model_file):
    print("Downloading model from Google Drive...")
    gdown.download(gdrive_url, model_file, quiet=False)
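
# Note (assumption, not from the original Space): 'uc?id=' downloads require the
# Drive file to be shared publicly ("anyone with the link"); recent gdown
# versions also accept the bare file ID, e.g.
# gdown.download(id="141Mhq...", output=model_file, quiet=False).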
css = """ | |
#img-display-container { | |
max-height: 100vh; | |
} | |
#img-display-input { | |
max-height: 80vh; | |
} | |
#img-display-output { | |
max-height: 80vh; | |
} | |
#download { | |
height: 62px; | |
} | |
h1 { | |
text-align: center; | |
font-size: 3rem; | |
font-weight: bold; | |
margin: 2rem 0; | |
color: #2c3e50; | |
} | |
""" | |

DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}

encoder = 'vitl'
model = DepthAnythingV2(**model_configs[encoder])
# Load the checkpoint downloaded above; the original hardcoded
# '/home/user/app/checkpoints/...' path only resolves inside the Space container
# and would miss the file gdown just fetched into the relative 'checkpoints' dir.
state_dict = torch.load(model_file, map_location="cpu")
model.load_state_dict(state_dict)
model = model.to(DEVICE).eval()
title = "Depth Estimation, 3D Visualization" | |
description = """Official demo for **Depth Estimation, 3D Visualization**.""" | |
def predict_depth(image): | |
return model.infer_image(image) | |
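
# Note: per the upstream Depth-Anything-V2 repo, infer_image takes an
# OpenCV-style BGR uint8 image (hence the channel flip in on_submit below) and
# returns an HxW float32 map of relative depth, where larger values are closer --
# which is why the raw 16-bit download is labeled "can be considered as disparity".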

def calculate_max_points(image):
    """Calculate the maximum number of 3D points for an image (3x its pixel count)"""
    if image is None:
        return 10000  # Default value
    h, w = image.shape[:2]
    max_points = h * w * 3
    # Clamp to a sane range
    return max(1000, min(max_points, 1000000))
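
# Worked example (illustration only, not used by the app): a 640x480 upload
# yields 640 * 480 * 3 = 921,600, inside the [1,000, 1,000,000] clamp, while a
# 1920x1080 upload (6,220,800) is clamped down to 1,000,000.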

def update_slider_on_image_upload(image):
    """Update the points slider when an image is uploaded"""
    max_points = calculate_max_points(image)
    default_value = min(10000, max_points // 10)  # 10% of max points as default
    return gr.Slider(minimum=1000, maximum=max_points, value=default_value, step=1000,
                     label=f"Number of 3D points (max: {max_points:,})")

def create_3d_depth_visualization(image, depth_map, max_points=10000):
    """Create an interactive 3D visualization of the depth map.

    Note: kept for reference; the demo below uses create_enhanced_3d_visualization,
    which applies a proper camera projection instead of raw pixel coordinates.
    """
    h, w = depth_map.shape
    # Downsample to keep the point count (and render time) manageable
    step = max(1, int(np.sqrt(h * w / max_points)))
    # Create coordinate grids
    y_coords, x_coords = np.mgrid[0:h:step, 0:w:step]
    depth_values = depth_map[::step, ::step]
    # Flatten arrays
    x_flat = x_coords.flatten()
    y_flat = y_coords.flatten()
    z_flat = depth_values.flatten()
    # Get corresponding image colors
    image_colors = image[::step, ::step, :]
    colors_flat = image_colors.reshape(-1, 3)
    # Create 3D scatter plot
    fig = go.Figure(data=[go.Scatter3d(
        x=x_flat,
        y=y_flat,
        z=z_flat,
        mode='markers',
        marker=dict(
            size=2,
            color=colors_flat,
            opacity=0.8
        ),
        hovertemplate='<b>Position:</b> (%{x:.0f}, %{y:.0f})<br>' +
                      '<b>Depth:</b> %{z:.2f}<br>' +
                      '<extra></extra>'
    )])
    fig.update_layout(
        title="3D Depth Visualization (Hover to see depth values)",
        scene=dict(
            xaxis_title="X (pixels)",
            yaxis_title="Y (pixels)",
            zaxis_title="Depth",
            camera=dict(
                eye=dict(x=1.5, y=1.5, z=1.5)
            )
        ),
        width=600,
        height=500
    )
    return fig
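
# Downsampling sketch: step = max(1, int(sqrt(h * w / max_points))) keeps roughly
# max_points samples. For a 1000x750 map with max_points=10,000,
# step = int(sqrt(75)) = 8, giving ceil(750/8) * ceil(1000/8) = 94 * 125 = 11,750
# points; the int() truncation means the count can slightly exceed the cap.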

def create_point_cloud(image, depth_map, focal_length_x=470.4, focal_length_y=470.4, max_points=100000):
    """Create a point cloud from a depth map using pinhole camera intrinsics"""
    h, w = depth_map.shape
    # Downsample to keep the point count manageable
    step = max(1, int(np.sqrt(h * w / max_points)))
    # Create mesh grid for camera coordinates
    y_coords, x_coords = np.mgrid[0:h:step, 0:w:step]
    # Convert to camera coordinates (normalized by focal length)
    x_cam = (x_coords - w / 2) / focal_length_x
    y_cam = (y_coords - h / 2) / focal_length_y
    # Get depth values
    depth_values = depth_map[::step, ::step]
    # Calculate 3D points: (x_cam * depth, y_cam * depth, depth)
    x_3d = x_cam * depth_values
    y_3d = y_cam * depth_values
    z_3d = depth_values
    # Stack into an (N, 3) array
    points = np.stack([x_3d.flatten(), y_3d.flatten(), z_3d.flatten()], axis=1)
    # Get corresponding image colors, scaled to [0, 1] for Open3D
    image_colors = image[::step, ::step, :]
    colors = image_colors.reshape(-1, 3) / 255.0
    # Create Open3D point cloud
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points)
    pcd.colors = o3d.utility.Vector3dVector(colors)
    return pcd
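
# Back-projection sketch (standard pinhole model, principal point assumed at the
# image center): a pixel (u, v) with depth z maps to
#   X = (u - w/2) * z / fx,   Y = (v - h/2) * z / fy,   Z = z,
# which is what the vectorized x_cam/y_cam computation above performs. With the
# default fx = 470.4, a pixel 235 px right of center at z = 2.0 lands at
# X = 235 * 2.0 / 470.4 ~= 1.0.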

def create_enhanced_3d_visualization(image, depth_map, max_points=10000):
    """Create an enhanced 3D visualization using proper camera projection"""
    h, w = depth_map.shape
    # Downsample to keep the point count manageable
    step = max(1, int(np.sqrt(h * w / max_points)))
    # Create mesh grid for camera coordinates
    y_coords, x_coords = np.mgrid[0:h:step, 0:w:step]
    # Convert to camera coordinates (normalized by focal length)
    focal_length = 470.4  # Default focal length
    x_cam = (x_coords - w / 2) / focal_length
    y_cam = (y_coords - h / 2) / focal_length
    # Get depth values
    depth_values = depth_map[::step, ::step]
    # Calculate 3D points: (x_cam * depth, y_cam * depth, depth)
    x_3d = x_cam * depth_values
    y_3d = y_cam * depth_values
    z_3d = depth_values
    # Flatten arrays
    x_flat = x_3d.flatten()
    y_flat = y_3d.flatten()
    z_flat = z_3d.flatten()
    # Get corresponding image colors
    image_colors = image[::step, ::step, :]
    colors_flat = image_colors.reshape(-1, 3)
    # Create 3D scatter plot with proper camera projection
    fig = go.Figure(data=[go.Scatter3d(
        x=x_flat,
        y=y_flat,
        z=z_flat,
        mode='markers',
        marker=dict(
            size=1.5,
            color=colors_flat,
            opacity=0.9
        ),
        hovertemplate='<b>3D Position:</b> (%{x:.3f}, %{y:.3f}, %{z:.3f})<br>' +
                      '<b>Depth:</b> %{z:.2f}<br>' +
                      '<extra></extra>'
    )])
    fig.update_layout(
        title="3D Point Cloud Visualization (Camera Projection)",
        scene=dict(
            xaxis_title="X (meters)",
            yaxis_title="Y (meters)",
            zaxis_title="Z (meters)",
            camera=dict(
                eye=dict(x=2.0, y=2.0, z=2.0),
                center=dict(x=0, y=0, z=0),
                up=dict(x=0, y=0, z=1)
            ),
            aspectmode='data'
        ),
        width=700,
        height=600
    )
    return fig
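
# Note on units: the axis labels say "meters", but the default Depth Anything V2
# checkpoints predict relative depth (and on_submit passes the 0-255 normalized
# map), so coordinates are only truly metric with a metric-depth checkpoint and
# the camera's real focal length.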

with gr.Blocks(css=css) as demo:
    gr.HTML(f"<h1>{title}</h1>")
    gr.Markdown(description)
    gr.Markdown("### Depth Prediction demo")

    with gr.Row():
        input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input')
        depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output')
    with gr.Row():
        submit = gr.Button(value="Compute Depth", variant="primary")
        points_slider = gr.Slider(minimum=1000, maximum=10000, value=10000, step=1000,
                                  label="Number of 3D points (upload image to update max)")
    with gr.Row():
        focal_length_x = gr.Slider(minimum=100, maximum=1000, value=470.4, step=10,
                                   label="Focal Length X (pixels)")
        focal_length_y = gr.Slider(minimum=100, maximum=1000, value=470.4, step=10,
                                   label="Focal Length Y (pixels)")
    with gr.Row():
        gray_depth_file = gr.File(label="Grayscale depth map", elem_id="download")
        raw_file = gr.File(label="16-bit raw output (can be considered as disparity)", elem_id="download")
        point_cloud_file = gr.File(label="Point Cloud (.ply)", elem_id="download")

    # 3D Visualization
    gr.Markdown("### 3D Point Cloud Visualization")
    gr.Markdown("Enhanced 3D visualization using proper camera projection. Hover over points to see 3D coordinates.")
    depth_3d_plot = gr.Plot(label="3D Point Cloud")

    cmap = matplotlib.colormaps.get_cmap('Spectral_r')
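    # matplotlib.colormaps is the modern (matplotlib >= 3.6) replacement for the
    # deprecated matplotlib.cm.get_cmap; calling cmap on a uint8 array indexes the
    # 256-entry lookup table and returns RGBA floats in [0, 1], which on_submit
    # scales back to 0-255.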

    def on_submit(image, num_points, focal_x, focal_y):
        original_image = image.copy()

        # The model expects BGR, Gradio supplies RGB, so flip the channel order
        depth = predict_depth(image[:, :, ::-1])

        raw_depth = Image.fromarray(depth.astype('uint16'))
        tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
        raw_depth.save(tmp_raw_depth.name)

        # Normalize to 0-255 for display (epsilon guards against a constant map)
        depth = (depth - depth.min()) / max(depth.max() - depth.min(), 1e-8) * 255.0
        depth = depth.astype(np.uint8)
        colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)

        gray_depth = Image.fromarray(depth)
        tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
        gray_depth.save(tmp_gray_depth.name)

        # Create point cloud (note: built from the normalized relative depth)
        pcd = create_point_cloud(original_image, depth, focal_x, focal_y, max_points=num_points)
        tmp_pointcloud = tempfile.NamedTemporaryFile(suffix='.ply', delete=False)
        o3d.io.write_point_cloud(tmp_pointcloud.name, pcd)

        # Create enhanced 3D visualization
        depth_3d = create_enhanced_3d_visualization(original_image, depth, max_points=num_points)

        return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name, tmp_pointcloud.name, depth_3d]

    # Update the points slider whenever a new image is uploaded
    input_image.change(
        fn=update_slider_on_image_upload,
        inputs=[input_image],
        outputs=[points_slider]
    )

    submit.click(on_submit, inputs=[input_image, points_slider, focal_length_x, focal_length_y],
                 outputs=[depth_image_slider, gray_depth_file, raw_file, point_cloud_file, depth_3d_plot])

if __name__ == '__main__':
    demo.queue().launch()