alexnasa committed
Commit 2568013 · verified · Parent: b0a8307

Upload 243 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
Files changed (50):
  1. .gitattributes +29 -0
  2. LICENSE +21 -0
  3. assets/demo_gradio.gif +3 -0
  4. assets/pipeline.jpg +3 -0
  5. config/compute_metrics.yaml +28 -0
  6. config/dataset/base_dataset.yaml +7 -0
  7. config/dataset/co3d.yaml +15 -0
  8. config/dataset/dl3dv.yaml +19 -0
  9. config/dataset/scannetpp.yaml +22 -0
  10. config/dataset/view_sampler/all.yaml +1 -0
  11. config/dataset/view_sampler/arbitrary.yaml +7 -0
  12. config/dataset/view_sampler/bounded.yaml +16 -0
  13. config/dataset/view_sampler/evaluation.yaml +4 -0
  14. config/dataset/view_sampler/rank.yaml +14 -0
  15. config/experiment/co3d.yaml +90 -0
  16. config/experiment/dl3dv.yaml +92 -0
  17. config/experiment/multi-dataset.yaml +121 -0
  18. config/experiment/scannetpp.yaml +90 -0
  19. config/generate_evaluation_index.yaml +36 -0
  20. config/loss/chamfer_distance.yaml +5 -0
  21. config/loss/depth.yaml +4 -0
  22. config/loss/depth_consis.yaml +4 -0
  23. config/loss/depthgt.yaml +3 -0
  24. config/loss/lod.yaml +3 -0
  25. config/loss/lpips.yaml +3 -0
  26. config/loss/mse.yaml +2 -0
  27. config/loss/normal_consis.yaml +5 -0
  28. config/loss/opacity.yaml +3 -0
  29. config/main.yaml +81 -0
  30. config/model/decoder/splatting_cuda.yaml +3 -0
  31. config/model/encoder/anysplat.yaml +62 -0
  32. config/model/encoder/backbone/croco.yaml +9 -0
  33. demo_gradio.py +459 -0
  34. examples/video/bungeenerf_colosseum.mp4 +3 -0
  35. examples/video/dtu_scan_106.mp4 +3 -0
  36. examples/video/fillerbuster_hand_hand.mp4 +3 -0
  37. examples/video/fillerbuster_ramen.mp4 +3 -0
  38. examples/video/fox.mp4 +3 -0
  39. examples/video/horizongs_hillside_summer.mp4 +3 -0
  40. examples/video/kitti360.mp4 +3 -0
  41. examples/video/llff_fortress.mp4 +3 -0
  42. examples/video/llff_horns.mp4 +3 -0
  43. examples/video/matrixcity_street.mp4 +3 -0
  44. examples/video/meganerf_rubble.mp4 +3 -0
  45. examples/video/re10k_1eca36ec55b88fe4.mp4 +0 -0
  46. examples/video/vrnerf_apartment.mp4 +3 -0
  47. examples/video/vrnerf_kitchen.mp4 +3 -0
  48. examples/video/vrnerf_riverview.mp4 +3 -0
  49. examples/video/vrnerf_workshop.mp4 +3 -0
  50. examples/vrnerf/riverview/21_DSC0001.jpg +3 -0
.gitattributes CHANGED
@@ -33,3 +33,32 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/demo_gradio.gif filter=lfs diff=lfs merge=lfs -text
+ assets/pipeline.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/video/bungeenerf_colosseum.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/dtu_scan_106.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/fillerbuster_hand_hand.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/fillerbuster_ramen.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/fox.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/horizongs_hillside_summer.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/kitti360.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/llff_fortress.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/llff_horns.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/matrixcity_street.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/meganerf_rubble.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/vrnerf_apartment.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/vrnerf_kitchen.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/vrnerf_riverview.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video/vrnerf_workshop.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0001.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0010.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0019.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0028.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0037.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0046.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0055.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0064.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0073.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0082.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0091.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/vrnerf/riverview/21_DSC0100.jpg filter=lfs diff=lfs merge=lfs -text
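
These `.gitattributes` additions route all demo media through Git LFS, so a plain checkout contains only small pointer stubs. As a minimal sketch (not part of this commit), the snippet below parses the pointer format shown in the `examples/video/*.mp4` entries further down and verifies a file fetched with `git lfs pull` against its recorded digest:

```python
# Hypothetical helper, not in the repo: verify a pulled LFS file against
# the sha256 recorded in its pointer stub.
import hashlib
from pathlib import Path

def lfs_oid(pointer_text: str) -> str:
    # Pointer stubs have three lines: version, "oid sha256:<hex>", "size <bytes>".
    for line in pointer_text.splitlines():
        if line.startswith("oid sha256:"):
            return line.split("oid sha256:", 1)[1].strip()
    raise ValueError("not a Git LFS pointer")

def verify(path: Path, expected_oid: str) -> bool:
    # Hash the real file contents and compare with the pointer's oid.
    return hashlib.sha256(path.read_bytes()).hexdigest() == expected_oid
```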
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Lihan Jiang and Yucheng Mao
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
assets/demo_gradio.gif ADDED

Git LFS Details

  • SHA256: d2de19dc0b15b0d64b408355b016ef8da1ce455913ee37fda935c5b7a43df248
  • Pointer size: 132 Bytes
  • Size of remote file: 3.77 MB
assets/pipeline.jpg ADDED

Git LFS Details

  • SHA256: eafeeddafbf266caf2a1ea911aec24fb08d4d1177813b7621081b0f92d4a63aa
  • Pointer size: 131 Bytes
  • Size of remote file: 111 kB
config/compute_metrics.yaml ADDED
@@ -0,0 +1,28 @@
+ defaults:
+   - model/encoder: noposplat
+   - loss: []
+   - override dataset/view_sampler@dataset.re10k.view_sampler: evaluation
+
+ dataset:
+   re10k:
+     view_sampler:
+       index_path: assets/evaluation_index_re10k.json
+
+ data_loader:
+   train:
+     num_workers: 0
+     persistent_workers: true
+     batch_size: 1
+     seed: 1234
+   test:
+     num_workers: 4
+     persistent_workers: false
+     batch_size: 1
+     seed: 2345
+   val:
+     num_workers: 0
+     persistent_workers: true
+     batch_size: 1
+     seed: 3456
+
+ seed: 111123
config/dataset/base_dataset.yaml ADDED
@@ -0,0 +1,7 @@
+ make_baseline_1: true
+ relative_pose: true
+ augment: true
+ background_color: [1.0, 1.0, 1.0]
+ overfit_to_scene: null
+ skip_bad_shape: true
+ rescale_to_1cube: false
config/dataset/co3d.yaml ADDED
@@ -0,0 +1,15 @@
+ defaults:
+   - base_dataset
+   - view_sampler: rank
+
+ name: co3d
+ roots: [datasets/co3dv2]
+
+ input_image_shape: [256, 256]
+ original_image_shape: [540, 960]
+ cameras_are_circular: false
+
+ baseline_min: 1e-3
+ baseline_max: 1e2
+ max_fov: 110.0
+ avg_pose: false
config/dataset/dl3dv.yaml ADDED
@@ -0,0 +1,19 @@
+ defaults:
+   - base_dataset
+   - view_sampler: bounded
+
+ name: dl3dv
+ roots: [datasets/dl3dv]
+
+ input_image_shape: [256, 256]
+ original_image_shape: [540, 960]
+ cameras_are_circular: false
+
+ baseline_min: 1e-3
+ baseline_max: 1e2
+ max_fov: 100.0
+ avg_pose: false
+
+ rescale_to_1cube: true
+ make_baseline_1: false
+ intr_augment: true
config/dataset/scannetpp.yaml ADDED
@@ -0,0 +1,22 @@
+ defaults:
+   - base_dataset
+   - view_sampler: rank
+
+ name: scannetpp
+ roots: [datasets/scannetpp]
+
+ input_image_shape: [256, 256]
+ original_image_shape: [690, 1035]
+ cameras_are_circular: false
+
+ baseline_min: 1e-3
+ baseline_max: 1e2
+ max_fov: 130.0  # previously 120.0
+ metric_thre: 0.5  # aggressive metric threshold
+
+ skip_bad_shape: true  # set to false when using both DSLR and iPhone captures
+
+ rescale_to_1cube: true
+ make_baseline_1: false
+ intr_augment: true
+ normalize_by_pts3d: false
config/dataset/view_sampler/all.yaml ADDED
@@ -0,0 +1 @@
+ name: all
config/dataset/view_sampler/arbitrary.yaml ADDED
@@ -0,0 +1,7 @@
+ name: arbitrary
+
+ num_target_views: 1
+ num_context_views: 2
+
+ # If you want to hard-code context views, do so here.
+ context_views: null
config/dataset/view_sampler/bounded.yaml ADDED
@@ -0,0 +1,16 @@
+ name: bounded
+
+ num_target_views: 1
+ num_context_views: 24
+
+ min_distance_between_context_views: 2
+ max_distance_between_context_views: 6
+ min_distance_to_context_views: 0
+
+ warm_up_steps: 0
+ initial_min_distance_between_context_views: 2
+ initial_max_distance_between_context_views: 6
+
+ max_img_per_gpu: 24
+ min_gap_multiplier: 3
+ max_gap_multiplier: 5
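
The gap parameters above bound how far apart context views may sit in the source sequence. Purely as an illustration (the actual sampler lives in the repo's source code, which this commit view does not show), one plausible reading treats the distances as bounds on the index gap between adjacent context views:

```python
# Illustrative sketch only, not the repo's sampler: gap-bounded sampling
# of context and target frame indices. Assumes the sequence is long
# enough for the drawn gaps.
import random

def sample_views(num_frames: int, num_context: int, num_target: int,
                 min_gap: int, max_gap: int):
    # Draw an index gap for each adjacent pair of context views.
    gaps = [random.randint(min_gap, max_gap) for _ in range(num_context - 1)]
    first = random.randint(0, num_frames - sum(gaps) - 1)
    context = [first]
    for g in gaps:
        context.append(context[-1] + g)
    # Target views are drawn from between the outermost context views.
    targets = random.sample(range(context[0], context[-1] + 1), num_target)
    return context, targets
```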
config/dataset/view_sampler/evaluation.yaml ADDED
@@ -0,0 +1,4 @@
+ name: evaluation
+
+ index_path: assets/evaluation_index_re10k.json
+ num_context_views: 2
config/dataset/view_sampler/rank.yaml ADDED
@@ -0,0 +1,14 @@
+ name: rank
+
+ num_target_views: 4
+ num_context_views: 24
+
+ min_distance_between_context_views: 8
+ max_distance_between_context_views: 22
+ min_distance_to_context_views: 0
+
+ warm_up_steps: 0
+ initial_min_distance_between_context_views: 5
+ initial_max_distance_between_context_views: 7
+
+ max_img_per_gpu: 24
config/experiment/co3d.yaml ADDED
@@ -0,0 +1,90 @@
+ # @package _global_
+
+ defaults:
+   - /dataset@_group_.co3d: co3d
+   - override /model/encoder: anysplat
+   - override /model/encoder/backbone: croco
+   - override /loss: [mse, lpips, depth_consis]  # ablate: opacity loss
+
+ wandb:
+   name: co3d
+   tags: [co3d, 448x448]
+
+ model:
+   encoder:
+     gs_params_head_type: dpt_gs
+     pose_free: true
+     intrinsics_embed_loc: encoder
+     intrinsics_embed_type: token
+     pretrained_weights: ''
+     voxel_size: 0.002
+     pred_pose: true
+     anchor_feat_dim: 128
+     gs_prune: false  # ablate: opacity loss
+     pred_head_type: depth
+     freeze_backbone: false
+     distill: true
+     render_conf: false
+     conf_threshold: 0.1
+     freeze_module: patch_embed
+     voxelize: true
+     intermediate_layer_idx: [4, 11, 17, 23]
+
+ dataset:
+   co3d:
+     input_image_shape: [224, 448]
+     view_sampler:
+       num_context_views: 24
+       num_target_views: 1
+       min_distance_between_context_views: 32
+       max_distance_between_context_views: 256
+       max_img_per_gpu: 24  # keep the same as num_context_views
+     avg_pose: false
+     intr_augment: true
+     normalize_by_pts3d: false
+     rescale_to_1cube: false
+
+ optimizer:
+   lr: 2e-4
+   warm_up_steps: 1000
+   backbone_lr_multiplier: 0.1
+
+ data_loader:
+   train:
+     batch_size: 1  # not used here
+
+ trainer:
+   max_steps: 30000
+   val_check_interval: 500
+   num_nodes: 1
+   accumulate_grad_batches: 1
+   precision: bf16-mixed
+
+ checkpointing:
+   load: null
+   every_n_train_steps: 200
+   save_weights_only: false
+   save_top_k: 5
+
+ train:
+   pose_loss_alpha: 1.0
+   pose_loss_delta: 1.0
+   cxt_depth_weight: 0.0
+   weight_pose: 10.0
+   weight_depth: 0.0
+   weight_normal: 0.0
+
+ hydra:
+   run:
+     dir: output/exp_${wandb.name}/${now:%Y-%m-%d_%H-%M-%S}
+
+ loss:
+   mse:
+     conf: false
+   lpips:
+     conf: false
+   depth_consis:
+     weight: 0.1
+     loss_type: MSE
config/experiment/dl3dv.yaml ADDED
@@ -0,0 +1,92 @@
+ # @package _global_
+
+ defaults:
+   - /dataset@_group_.dl3dv: dl3dv
+   - override /model/encoder: anysplat
+   - override /model/encoder/backbone: croco
+   - override /loss: [mse, lpips, depth_consis]  # ablate: opacity loss
+
+ wandb:
+   name: dl3dv
+   tags: [dl3dv, 448x448]
+
+ model:
+   encoder:
+     gs_params_head_type: dpt_gs
+     pose_free: true
+     intrinsics_embed_loc: encoder
+     intrinsics_embed_type: token
+     pretrained_weights: ''
+     voxel_size: 0.002
+     pred_pose: true
+     anchor_feat_dim: 128
+     gs_prune: false  # ablate: opacity loss
+     pred_head_type: depth
+     freeze_backbone: false
+     distill: true
+     render_conf: false
+     conf_threshold: 0.1
+     freeze_module: patch_embed
+     voxelize: true
+     intermediate_layer_idx: [4, 11, 17, 23]
+
+ dataset:
+   dl3dv:
+     input_image_shape: [224, 448]
+     view_sampler:
+       num_target_views: 2
+       min_distance_between_context_views: 32
+       max_distance_between_context_views: 256
+       min_gap_multiplier: 3
+       max_gap_multiplier: 5
+     avg_pose: false
+     intr_augment: true
+     normalize_by_pts3d: false
+     rescale_to_1cube: false
+
+ optimizer:
+   lr: 2e-4
+   warm_up_steps: 1000
+   backbone_lr_multiplier: 0.1
+
+ data_loader:
+   train:
+     batch_size: 1  # not used here
+
+ trainer:
+   max_steps: 30000
+   val_check_interval: 500
+   num_nodes: 1
+   accumulate_grad_batches: 1
+   precision: bf16-mixed
+
+ checkpointing:
+   load: null
+   every_n_train_steps: 200
+   save_weights_only: false
+   save_top_k: 5
+
+ train:
+   pose_loss_alpha: 1.0
+   pose_loss_delta: 1.0
+   cxt_depth_weight: 0.0
+   weight_pose: 10.0
+   weight_depth: 1.0
+   weight_normal: 0.0
+
+ hydra:
+   run:
+     dir: output/exp_${wandb.name}/${now:%Y-%m-%d_%H-%M-%S}
+
+ loss:
+   mse:
+     conf: false
+   lpips:
+     conf: false
+   depth_consis:
+     weight: 0.1
+     loss_type: MSE
config/experiment/multi-dataset.yaml ADDED
@@ -0,0 +1,121 @@
+ # @package _global_
+
+ defaults:
+   - /dataset@_group_.dl3dv: dl3dv
+   - /dataset@_group_.co3d: co3d
+   - /dataset@_group_.scannetpp: scannetpp
+   - override /model/encoder: anysplat
+   - override /model/encoder/backbone: croco
+   - override /loss: [mse, lpips, depth_consis]  # ablate: opacity loss
+
+ wandb:
+   name: multidataset-16gpu
+   tags: [multidataset, 448x448]
+
+ model:
+   encoder:
+     gs_params_head_type: dpt_gs
+     pose_free: true
+     intrinsics_embed_loc: encoder
+     intrinsics_embed_type: token
+     pretrained_weights: ''
+     voxel_size: 0.002
+     pred_pose: true
+     anchor_feat_dim: 128
+     gs_prune: false  # ablate: opacity loss
+     pred_head_type: depth
+     freeze_backbone: false
+     distill: true
+     render_conf: false
+     conf_threshold: 0.1
+     freeze_module: patch_embed
+     voxelize: true
+     intermediate_layer_idx: [4, 11, 17, 23]
+
+ dataset:
+   dl3dv:
+     input_image_shape: [224, 448]
+     view_sampler:
+       num_target_views: 2
+       min_distance_between_context_views: 32
+       max_distance_between_context_views: 256
+       min_gap_multiplier: 3
+       max_gap_multiplier: 5
+       max_img_per_gpu: 24
+     avg_pose: false
+     intr_augment: true
+     normalize_by_pts3d: false
+     rescale_to_1cube: false
+
+   co3d:
+     input_image_shape: [224, 448]
+     view_sampler:
+       num_target_views: 1
+       min_distance_between_context_views: 32
+       max_distance_between_context_views: 256
+       max_img_per_gpu: 24
+     avg_pose: false
+     intr_augment: true
+     normalize_by_pts3d: false
+     rescale_to_1cube: false
+
+   scannetpp:
+     input_image_shape: [224, 448]
+     view_sampler:
+       num_target_views: 2
+       min_distance_between_context_views: 128
+       max_distance_between_context_views: 512
+       max_img_per_gpu: 24
+     avg_pose: false
+     intr_augment: true
+     normalize_by_pts3d: false
+     rescale_to_1cube: false
+
+ optimizer:
+   lr: 2e-4
+   warm_up_steps: 1000
+   backbone_lr_multiplier: 0.1
+
+ data_loader:
+   train:
+     batch_size: 1  # not used here
+
+ trainer:
+   max_steps: 30000
+   val_check_interval: 500
+   num_nodes: 2
+   accumulate_grad_batches: 1
+   precision: bf16-mixed
+
+ checkpointing:
+   load: null
+   every_n_train_steps: 200
+   save_weights_only: false
+   save_top_k: 5
+
+ train:
+   pose_loss_alpha: 1.0
+   pose_loss_delta: 1.0
+   cxt_depth_weight: 0.0
+   weight_pose: 10.0
+   weight_depth: 0.0
+   weight_normal: 0.0
+
+ hydra:
+   run:
+     dir: output/exp_${wandb.name}/${now:%Y-%m-%d_%H-%M-%S}
+
+ loss:
+   mse:
+     conf: false
+   lpips:
+     conf: false
+   depth_consis:
+     weight: 0.1
+     loss_type: MSE
config/experiment/scannetpp.yaml ADDED
@@ -0,0 +1,90 @@
+ # @package _global_
+
+ defaults:
+   - /dataset@_group_.scannetpp: scannetpp
+   - override /model/encoder: anysplat
+   - override /model/encoder/backbone: croco
+   - override /loss: [mse, lpips, depth_consis]  # ablate: opacity loss
+
+ wandb:
+   name: vggt-mdataset-new-scannetpp-dynamic_batchsampler
+   tags: [multidataset, 448x448]
+
+ model:
+   encoder:
+     gs_params_head_type: dpt_gs
+     pose_free: true
+     intrinsics_embed_loc: encoder
+     intrinsics_embed_type: token
+     pretrained_weights: ''
+     voxel_size: 0.002
+     pred_pose: true
+     anchor_feat_dim: 128
+     gs_prune: false  # ablate: opacity loss
+     pred_head_type: depth
+     freeze_backbone: false
+     distill: true
+     render_conf: false
+     conf_threshold: 0.1
+     freeze_module: patch_embed
+     voxelize: true
+     intermediate_layer_idx: [4, 11, 17, 23]
+
+ dataset:
+   scannetpp:
+     input_image_shape: [224, 448]
+     view_sampler:
+       num_context_views: 24
+       num_target_views: 2
+       min_distance_between_context_views: 128
+       max_distance_between_context_views: 512
+       max_img_per_gpu: 24  # keep the same as num_context_views
+     avg_pose: false
+     intr_augment: true
+     normalize_by_pts3d: false
+     rescale_to_1cube: false
+
+ optimizer:
+   lr: 2e-4
+   warm_up_steps: 1000
+   backbone_lr_multiplier: 0.1
+
+ data_loader:
+   train:
+     batch_size: 1  # not used here
+
+ trainer:
+   max_steps: 30000
+   val_check_interval: 500
+   num_nodes: 1
+   accumulate_grad_batches: 1
+   precision: bf16-mixed
+
+ checkpointing:
+   load: null
+   every_n_train_steps: 200
+   save_weights_only: false
+   save_top_k: 5
+
+ train:
+   pose_loss_alpha: 1.0
+   pose_loss_delta: 1.0
+   cxt_depth_weight: 0.0
+   weight_pose: 10.0
+   weight_depth: 0.0
+   weight_normal: 0.0
+
+ hydra:
+   run:
+     dir: output/exp_${wandb.name}/${now:%Y-%m-%d_%H-%M-%S}
+
+ loss:
+   mse:
+     conf: false
+   lpips:
+     conf: false
+   depth_consis:
+     weight: 0.1
+     loss_type: MSE
config/generate_evaluation_index.yaml ADDED
@@ -0,0 +1,36 @@
+ defaults:
+   - dataset: re10k
+   - optional dataset/view_sampler_dataset_specific_config: ${dataset/view_sampler}_${dataset}
+   - override dataset/view_sampler: all
+
+ dataset:
+   overfit_to_scene: null
+
+ data_loader:
+   train:
+     num_workers: 0
+     persistent_workers: true
+     batch_size: 1
+     seed: 1234
+   test:
+     num_workers: 8
+     persistent_workers: false
+     batch_size: 1
+     seed: 2345
+   val:
+     num_workers: 0
+     persistent_workers: true
+     batch_size: 1
+     seed: 3456
+
+ index_generator:
+   num_target_views: 3
+   min_overlap: 0.6
+   max_overlap: 1.0
+   min_distance: 45
+   max_distance: 135
+   output_path: outputs/evaluation_index_re10k
+   save_previews: false
+   seed: 123
+
+ seed: 456
config/loss/chamfer_distance.yaml ADDED
@@ -0,0 +1,5 @@
+ chamfer_distance:
+   weight: 0.01
+   down_sample_ratio: 0.1
+   sigma_image: null
config/loss/depth.yaml ADDED
@@ -0,0 +1,4 @@
+ depth:
+   weight: 0.01
+   sigma_image: null
+   use_second_derivative: false
config/loss/depth_consis.yaml ADDED
@@ -0,0 +1,4 @@
+ depth_consis:
+   weight: 1.0
+   sigma_image: null
+   use_second_derivative: false
config/loss/depthgt.yaml ADDED
@@ -0,0 +1,3 @@
+ depthgt:
+   weight: 0.1
+   type: l1+gradient
config/loss/lod.yaml ADDED
@@ -0,0 +1,3 @@
+ lod:
+   mse_weight: 1.0
+   lpips_weight: 0.05
config/loss/lpips.yaml ADDED
@@ -0,0 +1,3 @@
+ lpips:
+   weight: 0.05
+   apply_after_step: 0
config/loss/mse.yaml ADDED
@@ -0,0 +1,2 @@
+ mse:
+   weight: 1.0
config/loss/normal_consis.yaml ADDED
@@ -0,0 +1,5 @@
+ normal_consis:
+   normal_weight: 1.0
+   smooth_weight: 1.0
+   sigma_image: null
+   use_second_derivative: false
config/loss/opacity.yaml ADDED
@@ -0,0 +1,3 @@
+ opacity:
+   weight: 0.1
+   type: exp+mean
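
Each loss YAML above holds a single top-level key named after the loss, so selecting several of them in a Hydra defaults list (e.g. `loss: [mse, lpips, depth_consis]` in the experiment files) merges them into one `loss` node. A standalone sketch of that merge, performed directly with OmegaConf just to show the resulting shape:

```python
# Sketch of how the per-loss files combine; values copied from the
# configs above. Hydra does this merge internally when composing the
# `loss` group; here OmegaConf is used directly for clarity.
from omegaconf import OmegaConf

mse = OmegaConf.create({"mse": {"weight": 1.0}})
lpips = OmegaConf.create({"lpips": {"weight": 0.05, "apply_after_step": 0}})

loss = OmegaConf.merge(mse, lpips)  # -> {"mse": {...}, "lpips": {...}}
assert loss.mse.weight == 1.0 and loss.lpips.apply_after_step == 0
```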
config/main.yaml ADDED
@@ -0,0 +1,81 @@
+ defaults:
+   - model/encoder: anysplat
+   - model/decoder: splatting_cuda
+   - loss: [mse]
+
+ wandb:
+   project: anysplat
+   entity: scene-representation-group
+   name: debug
+   mode: online
+
+ mode: train
+
+ # dataset:
+ #   overfit_to_scene: null
+
+ data_loader:
+   # Avoid having to spin up new processes to print out visualizations.
+   train:
+     num_workers: 16
+     persistent_workers: true
+     batch_size: 4
+     seed: 1234
+   test:
+     num_workers: 4
+     persistent_workers: false
+     batch_size: 1
+     seed: 2345
+   val:
+     num_workers: 1
+     persistent_workers: true
+     batch_size: 1
+     seed: 3456
+
+ optimizer:
+   lr: 1.5e-4
+   warm_up_steps: 2000
+   backbone_lr_multiplier: 0.1
+
+ checkpointing:
+   load: null
+   every_n_train_steps: 5000
+   save_top_k: 1
+   save_weights_only: true
+
+ train:
+   output_path: ${hydra.run.dir}
+   depth_mode: null
+   extended_visualization: false
+   print_log_every_n_steps: 10
+   distiller: ''
+   distill_max_steps: 1000000
+   random_context_views: false
+
+ test:
+   output_path: outputs/test-nopo
+   align_pose: true
+   pose_align_steps: 100
+   rot_opt_lr: 0.005
+   trans_opt_lr: 0.005
+   compute_scores: true
+   save_image: true
+   save_video: false
+   save_compare: true
+   generate_video: false
+   mode: inference
+   image_folder: examples/bungeenerf
+
+ seed: 111123
+
+ trainer:
+   max_steps: -1
+   val_check_interval: 250
+   gradient_clip_val: 0.5
+   num_nodes: 1
+   accumulate_grad_batches: 1
+
+ hydra:
+   run:
+     dir: output-debug/exp_${wandb.name}/${now:%Y-%m-%d_%H-%M-%S}
+   # run:
+   #   dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}_rank${oc.env:LOCAL_RANK,0}
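
`config/main.yaml` is the Hydra root: its `defaults` pull in the encoder, decoder, and loss groups, and the experiment files above (marked `# @package _global_`) override it wholesale when selected on the command line. A minimal, hypothetical entry point showing that composition (assumes Hydra is installed and the script sits next to the `config/` directory):

```python
# Hypothetical entry point, not part of this commit: compose main.yaml
# with optional experiment overrides, e.g.
#   python entry.py +experiment=dl3dv trainer.max_steps=1000
import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(version_base=None, config_path="config", config_name="main")
def main(cfg: DictConfig) -> None:
    # Print the fully merged configuration tree.
    print(OmegaConf.to_yaml(cfg))

if __name__ == "__main__":
    main()
```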
config/model/decoder/splatting_cuda.yaml ADDED
@@ -0,0 +1,3 @@
+ name: splatting_cuda
+ background_color: [1.0, 1.0, 1.0]
+ make_scale_invariant: false
config/model/encoder/anysplat.yaml ADDED
@@ -0,0 +1,62 @@
+ defaults:
+   - backbone: croco
+
+ name: anysplat
+
+ opacity_mapping:
+   initial: 0.0
+   final: 0.0
+   warm_up: 1
+
+ num_monocular_samples: 32
+ num_surfaces: 1
+ predict_opacity: false
+
+ gaussians_per_pixel: 1
+
+ gaussian_adapter:
+   gaussian_scale_min: 0.5
+   gaussian_scale_max: 15.0
+   sh_degree: 4
+
+ d_feature: 32
+
+ visualizer:
+   num_samples: 8
+   min_resolution: 256
+   export_ply: false
+
+ apply_bounds_shim: true
+
+ gs_params_head_type: dpt_gs
+ pose_free: true
+ pretrained_weights: ""
+ scale_align: false
+
+ voxel_size: 0.001
+ n_offsets: 2
+ anchor_feat_dim: 83  # 32
+ add_view: false
+ color_attr: 3D  # 3D or RGB
+ mlp_type: unified
+ scaffold: true
+
+ # unet3d:
+ #   # lifter_params:
+ #   #   img_in_dim: 32
+ #   #   voxel_out_dim: 32
+ #   img_feature_source: dino
+ #   in_channels: 83  # 32; keep the same as anchor_feat_dim
+ #   num_blocks: 2  # 512 -> 128
+ #   f_maps: 83  # 32
+ #   # f_maps_2d: 32
+ #   neck_dense_type: "UNCHANGED"
+ #   neck_bound: 4
+ #   use_attention: true
+ #   gs_enhanced: "original"
+ #   gsplat_upsample: 4
+ #   occ_upsample: 1
+ #   max_scaling: 10
+ #   max_return: 2
+ #   feature_pooling_2d: "max"
+ #   gs_free_space: "free-1"
config/model/encoder/backbone/croco.yaml ADDED
@@ -0,0 +1,9 @@
+ name: croco
+
+ model: ViTLarge_BaseDecoder
+ patch_embed_cls: PatchEmbedDust3R
+ asymmetry_decoder: true
+
+ intrinsics_embed_loc: 'encoder'
+ intrinsics_embed_degree: 4
+ intrinsics_embed_type: 'token'
demo_gradio.py ADDED
@@ -0,0 +1,459 @@
+ #!/usr/bin/env python3
+ import gc
+ import os
+ import shutil
+ import sys
+ import time
+ from datetime import datetime
+ from pathlib import Path
+
+ import cv2
+ import gradio as gr
+ import torch
+
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+ from src.misc.image_io import save_interpolated_video
+ from src.model.model.anysplat import AnySplat
+ from src.model.ply_export import export_ply
+ from src.utils.image import process_image
+
+
+ # 1) Core model inference
+ def get_reconstructed_scene(outdir, model, device):
+     # Load images
+     image_files = sorted(
+         [
+             os.path.join(outdir, "images", f)
+             for f in os.listdir(os.path.join(outdir, "images"))
+         ]
+     )
+     images = [process_image(img_path) for img_path in image_files]
+     images = torch.stack(images, dim=0).unsqueeze(0).to(device)  # [1, K, 3, 448, 448]
+     b, v, c, h, w = images.shape
+     assert c == 3, "Images must have 3 channels"
+
+     # Run inference (process_image returns [-1, 1]; the model expects [0, 1])
+     gaussians, pred_context_pose = model.inference((images + 1) * 0.5)
+
+     # Render an interpolated trajectory through the predicted poses
+     pred_all_extrinsic = pred_context_pose["extrinsic"]
+     pred_all_intrinsic = pred_context_pose["intrinsic"]
+     video, depth_colored = save_interpolated_video(
+         pred_all_extrinsic,
+         pred_all_intrinsic,
+         b,
+         h,
+         w,
+         gaussians,
+         outdir,
+         model.decoder,
+     )
+
+     # Export the splats as a .ply file
+     plyfile = os.path.join(outdir, "gaussians.ply")
+     export_ply(
+         gaussians.means[0],
+         gaussians.scales[0],
+         gaussians.rotations[0],
+         gaussians.harmonics[0],
+         gaussians.opacities[0],
+         Path(plyfile),
+         save_sh_dc_only=True,
+     )
+
+     # Clean up
+     torch.cuda.empty_cache()
+     return plyfile, video, depth_colored
+
+
+ # 2) Handle uploaded video/images --> produce target_dir + images
+ def handle_uploads(input_video, input_images):
+     """
+     Create a new 'target_dir' + 'images' subfolder, and place user-uploaded
+     images or extracted frames from video into it. Return (target_dir, image_paths).
+     """
+     start_time = time.time()
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     # Create a unique folder name
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+     target_dir = f"input_images_{timestamp}"
+     target_dir_images = os.path.join(target_dir, "images")
+
+     # Clean up if somehow that folder already exists
+     if os.path.exists(target_dir):
+         shutil.rmtree(target_dir)
+     os.makedirs(target_dir)
+     os.makedirs(target_dir_images)
+
+     image_paths = []
+
+     # --- Handle images ---
+     if input_images is not None:
+         for file_data in input_images:
+             if isinstance(file_data, dict) and "name" in file_data:
+                 file_path = file_data["name"]
+             else:
+                 file_path = file_data
+             dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
+             shutil.copy(file_path, dst_path)
+             image_paths.append(dst_path)
+
+     # --- Handle video ---
+     if input_video is not None:
+         if isinstance(input_video, dict) and "name" in input_video:
+             video_path = input_video["name"]
+         else:
+             video_path = input_video
+
+         vs = cv2.VideoCapture(video_path)
+         fps = vs.get(cv2.CAP_PROP_FPS)
+         frame_interval = max(1, int(fps))  # sample roughly 1 frame/sec
+
+         count = 0
+         video_frame_num = 0
+         while True:
+             gotit, frame = vs.read()
+             if not gotit:
+                 break
+             count += 1
+             if count % frame_interval == 0:
+                 image_path = os.path.join(
+                     target_dir_images, f"{video_frame_num:06}.png"
+                 )
+                 cv2.imwrite(image_path, frame)
+                 image_paths.append(image_path)
+                 video_frame_num += 1
+
+     # Sort final images for gallery
+     image_paths = sorted(image_paths)
+
+     end_time = time.time()
+     print(
+         f"Files copied to {target_dir_images}; took {end_time - start_time:.3f} seconds"
+     )
+     return target_dir, image_paths
+
+
+ # 3) Update gallery on upload
+ def update_gallery_on_upload(input_video, input_images):
+     """
+     Whenever the user uploads or changes files, handle them immediately and
+     show them in the gallery. Returns (reconstruction, target_dir, image_paths);
+     the reconstruction slot is always None here, and all three are None when
+     nothing is uploaded.
+     """
+     if not input_video and not input_images:
+         return None, None, None
+     target_dir, image_paths = handle_uploads(input_video, input_images)
+     return None, target_dir, image_paths
+
+
+ # 4) Reconstruction: uses the target_dir plus any viz parameters
+ def gradio_demo(
+     target_dir,
+ ):
+     """
+     Perform reconstruction using the already-created target_dir/images.
+     """
+     if not os.path.isdir(target_dir) or target_dir == "None":
+         return None, None, None
+
+     start_time = time.time()
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     # Prepare frame_filter dropdown
+     target_dir_images = os.path.join(target_dir, "images")
+     all_files = (
+         sorted(os.listdir(target_dir_images))
+         if os.path.isdir(target_dir_images)
+         else []
+     )
+     all_files = [f"{i}: {filename}" for i, filename in enumerate(all_files)]
+
+     print("Running run_model...")
+     with torch.no_grad():
+         plyfile, video, depth_colored = get_reconstructed_scene(
+             target_dir, model, device
+         )
+
+     end_time = time.time()
+     print(f"Total time: {end_time - start_time:.2f} seconds (including IO)")
+
+     return plyfile, video, depth_colored
+
+
+ def clear_fields():
+     """
+     Clears the 3D viewer, the stored target_dir, and empties the gallery.
+     """
+     return None, None, None
+
+
+ if __name__ == "__main__":
+     server_name = "127.0.0.1"
+     server_port = None
+     share = True
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     # Load model
+     model = AnySplat.from_pretrained("lhjiang/anysplat")
+     model = model.to(device)
+     model.eval()
+     for param in model.parameters():
+         param.requires_grad = False
+
+     theme = gr.themes.Ocean()
+     theme.set(
+         checkbox_label_background_fill_selected="*button_primary_background_fill",
+         checkbox_label_text_color_selected="*button_primary_text_color",
+     )
+     css = """
+     .custom-log * {
+         font-style: italic;
+         font-size: 22px !important;
+         background-image: linear-gradient(120deg, #0ea5e9 0%, #6ee7b7 60%, #34d399 100%);
+         -webkit-background-clip: text;
+         background-clip: text;
+         font-weight: bold !important;
+         color: transparent !important;
+         text-align: center !important;
+     }
+
+     .example-log * {
+         font-style: italic;
+         font-size: 16px !important;
+         background-image: linear-gradient(120deg, #0ea5e9 0%, #6ee7b7 60%, #34d399 100%);
+         -webkit-background-clip: text;
+         background-clip: text;
+         color: transparent !important;
+     }
+
+     #my_radio .wrap {
+         display: flex;
+         flex-wrap: nowrap;
+         justify-content: center;
+         align-items: center;
+     }
+
+     #my_radio .wrap label {
+         display: flex;
+         width: 50%;
+         justify-content: center;
+         align-items: center;
+         margin: 0;
+         padding: 10px 0;
+         box-sizing: border-box;
+     }
+     """
+     with gr.Blocks(css=css, title="AnySplat Demo", theme=theme) as demo:
+         gr.Markdown(
+             """
+             <h1 style='text-align: center;'>AnySplat: Feed-forward 3D Gaussian Splatting from Unconstrained Views</h1>
+             """
+         )
+
+         with gr.Row():
+             gr.Markdown(
+                 """
+                 <p align="center">
+                 <a title="Website" href="https://city-super.github.io/anysplat/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                     <img src="https://www.obukhov.ai/img/badges/badge-website.svg">
+                 </a>
+                 <a title="arXiv" href="https://arxiv.org/pdf/2505.23716" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                     <img src="https://www.obukhov.ai/img/badges/badge-pdf.svg">
+                 </a>
+                 <a title="Github" href="https://github.com/OpenRobotLab/AnySplat" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                     <img src="https://img.shields.io/badge/Github-Page-black" alt="badge-github-stars">
+                 </a>
+                 </p>
+                 """
+             )
+         with gr.Row():
+             gr.Markdown(
+                 """
+                 ### Getting Started:
+
+                 1. Upload Your Data: Use the "Upload Video" or "Upload Images" buttons on the left to provide your input. Videos are automatically split into individual frames (one frame per second).
+
+                 2. Preview: Your uploaded images appear in the gallery on the left.
+
+                 3. Reconstruct: Click the "Reconstruct" button to start the 3D reconstruction.
+
+                 4. Visualize: The reconstructed 3D Gaussian Splat appears in the viewer on the right, along with the rendered RGB and depth videos. The rendered trajectory is obtained by interpolating the estimated input-image poses.
+
+                 <strong style="color: #0ea5e9;">Please note:</strong> <span style="color: #0ea5e9; font-weight: bold;">The generated splats are large, so they may not load successfully in the Hugging Face demo. You can download the .ply file and render it in another viewer, such as [SuperSplat](https://playcanvas.com/supersplat/editor).</span>
+                 """
+             )
+
+         target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")
+         is_example = gr.Textbox(label="is_example", visible=False, value="None")
+         num_images = gr.Textbox(label="num_images", visible=False, value="None")
+         dataset_name = gr.Textbox(label="dataset_name", visible=False, value="None")
+         scene_name = gr.Textbox(label="scene_name", visible=False, value="None")
+         image_type = gr.Textbox(label="image_type", visible=False, value="None")
+
+         with gr.Row():
+             with gr.Column(scale=2):
+                 with gr.Tabs():
+                     with gr.Tab("Input Data"):
+                         input_video = gr.Video(label="Upload Video", interactive=True)
+                         input_images = gr.File(
+                             file_count="multiple",
+                             label="Upload Images",
+                             interactive=True,
+                         )
+
+                 image_gallery = gr.Gallery(
+                     label="Preview",
+                     columns=4,
+                     height="300px",
+                     show_download_button=True,
+                     object_fit="contain",
+                     preview=True,
+                 )
+
+             with gr.Column(scale=4):
+                 with gr.Tabs():
+                     with gr.Tab("AnySplat Output"):
+                         with gr.Column():
+                             reconstruction_output = gr.Model3D(
+                                 label="3D Reconstructed Gaussian Splat",
+                                 height=540,
+                                 zoom_speed=0.5,
+                                 pan_speed=0.5,
+                                 camera_position=[20, 20, 20],
+                             )
+
+                         with gr.Row():
+                             with gr.Row():
+                                 rgb_video = gr.Video(
+                                     label="RGB Video", interactive=False, autoplay=True
+                                 )
+                                 depth_video = gr.Video(
+                                     label="Depth Video",
+                                     interactive=False,
+                                     autoplay=True,
+                                 )
+
+                         with gr.Row():
+                             submit_btn = gr.Button(
+                                 "Reconstruct", scale=1, variant="primary"
+                             )
+                             clear_btn = gr.ClearButton(
+                                 [
+                                     input_video,
+                                     input_images,
+                                     reconstruction_output,
+                                     target_dir_output,
+                                     image_gallery,
+                                     rgb_video,
+                                     depth_video,
+                                 ],
+                                 scale=1,
+                             )
+
+         # ---------------------- Examples section ----------------------
+         examples = [
+             [None, "examples/video/re10k_1eca36ec55b88fe4.mp4", "re10k", "1eca36ec55b88fe4", "2", "Real", "True"],
+             [None, "examples/video/bungeenerf_colosseum.mp4", "bungeenerf", "colosseum", "8", "Synthetic", "True"],
+             [None, "examples/video/fox.mp4", "InstantNGP", "fox", "14", "Real", "True"],
+             [None, "examples/video/matrixcity_street.mp4", "matrixcity", "street", "32", "Synthetic", "True"],
+             [None, "examples/video/vrnerf_apartment.mp4", "vrnerf", "apartment", "32", "Real", "True"],
+             [None, "examples/video/vrnerf_kitchen.mp4", "vrnerf", "kitchen", "17", "Real", "True"],
+             [None, "examples/video/vrnerf_riverview.mp4", "vrnerf", "riverview", "12", "Real", "True"],
+             [None, "examples/video/vrnerf_workshop.mp4", "vrnerf", "workshop", "32", "Real", "True"],
+             [None, "examples/video/fillerbuster_ramen.mp4", "fillerbuster", "ramen", "32", "Real", "True"],
+             [None, "examples/video/meganerf_rubble.mp4", "meganerf", "rubble", "10", "Real", "True"],
+             [None, "examples/video/llff_horns.mp4", "llff", "horns", "12", "Real", "True"],
+             [None, "examples/video/llff_fortress.mp4", "llff", "fortress", "7", "Real", "True"],
+             [None, "examples/video/dtu_scan_106.mp4", "dtu", "scan_106", "20", "Real", "True"],
+             [None, "examples/video/horizongs_hillside_summer.mp4", "horizongs", "hillside_summer", "55", "Synthetic", "True"],
+             [None, "examples/video/kitti360.mp4", "kitti360", "kitti360", "64", "Real", "True"],
+         ]
+
+         def example_pipeline(
+             input_images,
+             input_video,
+             dataset_name,
+             scene_name,
+             num_images_str,
+             image_type,
+             is_example,
+         ):
+             """
+             1) Copy example images to a new target_dir
+             2) Reconstruct
+             3) Return model3D + videos + new_dir + gallery
+             We do NOT return is_example; it is only an input.
+             """
+             target_dir, image_paths = handle_uploads(input_video, input_images)
+             plyfile, video, depth_colored = gradio_demo(target_dir)
+             return plyfile, video, depth_colored, target_dir, image_paths
+
+         gr.Markdown("Click any row to load an example.", elem_classes=["example-log"])
+
+         gr.Examples(
+             examples=examples,
+             inputs=[
+                 input_images,
+                 input_video,
+                 dataset_name,
+                 scene_name,
+                 num_images,
+                 image_type,
+                 is_example,
+             ],
+             outputs=[
+                 reconstruction_output,
+                 rgb_video,
+                 depth_video,
+                 target_dir_output,
+                 image_gallery,
+             ],
+             fn=example_pipeline,
+             cache_examples=False,
+             examples_per_page=50,
+         )
+
+         gr.Markdown("<p style='text-align: center; font-style: italic; color: #666;'>We thank VGGT for their excellent gradio implementation!</p>")
+
+         submit_btn.click(
+             fn=clear_fields,
+             inputs=[],
+             outputs=[reconstruction_output, rgb_video, depth_video],
+         ).then(
+             fn=gradio_demo,
+             inputs=[
+                 target_dir_output,
+             ],
+             outputs=[reconstruction_output, rgb_video, depth_video],
+         ).then(
+             fn=lambda: "False", inputs=[], outputs=[is_example]
+         )
+
+         input_video.change(
+             fn=update_gallery_on_upload,
+             inputs=[input_video, input_images],
+             outputs=[reconstruction_output, target_dir_output, image_gallery],
+         )
+         input_images.change(
+             fn=update_gallery_on_upload,
+             inputs=[input_video, input_images],
+             outputs=[reconstruction_output, target_dir_output, image_gallery],
+         )
+
+     # demo.launch(share=share, server_name=server_name, server_port=server_port)
+     demo.queue(max_size=20).launch(show_error=True, share=True)
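
For use outside Gradio, the same inference path can be driven headlessly. The sketch below reuses only calls that appear in `demo_gradio.py` above; the input directory `my_scene/images` is a hypothetical example:

```python
# Headless sketch of the demo's inference path (assumes a PyTorch install
# and a folder of input frames at my_scene/images, which is hypothetical).
from pathlib import Path

import torch

from src.model.model.anysplat import AnySplat
from src.model.ply_export import export_ply
from src.utils.image import process_image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AnySplat.from_pretrained("lhjiang/anysplat").to(device).eval()

paths = sorted(Path("my_scene/images").glob("*"))  # hypothetical input folder
images = torch.stack([process_image(str(p)) for p in paths]).unsqueeze(0).to(device)

with torch.no_grad():
    # process_image yields [-1, 1]; inference expects [0, 1], as in the demo.
    gaussians, pred_context_pose = model.inference((images + 1) * 0.5)

export_ply(
    gaussians.means[0], gaussians.scales[0], gaussians.rotations[0],
    gaussians.harmonics[0], gaussians.opacities[0],
    Path("gaussians.ply"), save_sh_dc_only=True,
)
```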
examples/video/bungeenerf_colosseum.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:416b6af945547b5d19476823672de552944c7b5a147d29e9e8243e91a16aee3e
+ size 329073
examples/video/dtu_scan_106.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16d7a06325cd368b134908e600a6c0741c7d0d188f1db690532b8ac85d65fef5
+ size 352188
examples/video/fillerbuster_hand_hand.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b4ca982672bc92342b3e722c171d9d2e4d67a5a8116cd9f346956fbe01e253f
+ size 319404
examples/video/fillerbuster_ramen.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d60346a64a0a0d6805131d0d57edeeb0dae24f24c3f10560e95df65531221229
+ size 660736
examples/video/fox.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3fa2ccff78e5d8085bb58f3def2d482e8df285ced5ef1b56abfe3766f0d90e0
+ size 2361921
examples/video/horizongs_hillside_summer.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5dff78d9c00b3776bfca3a370061698bddead2ae940fe5a42d082ccf2ca80d1
+ size 1606537
examples/video/kitti360.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c6b13929b2c2aae8b95921d8626f5be06f6afffe05ea4e47940ffeb9906f9fc
+ size 1843629
examples/video/llff_fortress.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90ea046a0ec78651975529ebe6b9c72b60c19561fe61b15b15b9df0e44d9fe9a
+ size 196243
examples/video/llff_horns.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3bc4c443c2a3f889f0c1283e98bd6a7026c36858fb37808bb2e8699ad1a2c1d8
+ size 372570
examples/video/matrixcity_street.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa415f27177398b4e06f580beb3778701ca55784afade2fd6a058212213febc8
+ size 3163684
examples/video/meganerf_rubble.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3410c759eb73ca2403ab8fe35d5ebabdbc25e3a0e67d8670a89fe17686246ed0
+ size 450116
examples/video/re10k_1eca36ec55b88fe4.mp4 ADDED
Binary file (35.1 kB).
examples/video/vrnerf_apartment.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4fdd5f165a4293cd95e3dd88d84b1f370decdd86308aa67a9d3832e01f4d6906
+ size 2076392
examples/video/vrnerf_kitchen.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3db5d766ec86a7abdfe1f033b252337e6d934ea15035fafb4d0fc0c0e9e9740a
+ size 775715
examples/video/vrnerf_riverview.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b8187936cc49910ef330a37b1bbdab0076096d6c01f33b097c11937184de168
+ size 768290
examples/video/vrnerf_workshop.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0f1334acc74bd70086a9be94d0c36838ebd7499af27f942c315e1ba282e285b
+ size 1718918
examples/vrnerf/riverview/21_DSC0001.jpg ADDED

Git LFS Details

  • SHA256: 7600a24a0725bf42c2260c748f11f39ef495a065f187dd894a6a6b643d209a79
  • Pointer size: 131 Bytes
  • Size of remote file: 478 kB