Spaces · Running on Zero
NIRVANALAN committed · Commit 52d2875 · Parent: c00df70

update

Files changed:
- app.py +2 -1
- nsr/lsgm/flow_matching_trainer.py +21 -8
- nsr/train_util_diffusion.py +31 -17
- requirements.txt +2 -1
app.py
CHANGED

@@ -341,7 +341,8 @@ def main(args):
     with gr.Row():
         with gr.Tab("Reconstruction"):
             with gr.Column():
-                output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True)
+                # output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True)
+                output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True, loop=True)
                 output_model = gr.Model3D(
                     height=384,
                     clear_color=(1,1,1,1),
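The only behavioral change here is the added `loop=True`, so the rendered orbit video restarts automatically instead of stopping after one pass. Below is a minimal, self-contained sketch of the same component wiring; the `render` stub is a hypothetical placeholder, not the Space's actual image-to-3D pipeline:

```python
# Sketch only: demonstrates gr.Video(..., autoplay=True, loop=True) as used in app.py.
# `render` is a hypothetical stand-in for the Space's reconstruction pipeline.
import gradio as gr

def render(video_path):
    return video_path  # echo the input back so the output player has something to loop

with gr.Blocks() as demo:
    with gr.Row():
        inp = gr.Video(label="Input Video")
        # loop=True restarts playback when the rendered video reaches its end
        out = gr.Video(width=384, label="Rendered Video", autoplay=True, loop=True)
    gr.Button("Render").click(render, inputs=inp, outputs=out)

if __name__ == "__main__":
    demo.launch()
```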
nsr/lsgm/flow_matching_trainer.py
CHANGED

@@ -678,25 +678,36 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         self.ddpm_model.train()
 
+
     @th.inference_mode()
     def eval_i23d_and_export(
             self,
             inp_img,
+            num_steps=250,
+            seed=42,
+            mesh_size=192,
+            mesh_thres=10,
+            unconditional_guidance_scale=4.0, # default value in neural ode
             # camera,
             prompt="",
             save_img=False,
             use_train_trajectory=False,
             num_samples=1,
             num_instances=1,
-            unconditional_guidance_scale=4.0, # default value in neural ode
             export_mesh=True,
             **kwargs,
     ):
 
-        output_model= './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/mesh/cfg=4.0_sample-0.obj'
-        output_video = './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/triplane_cfg=4.0_sample-0.mp4'
+        # output_model, output_video = './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/mesh/cfg=4.0_sample-0.obj', './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/triplane_cfg=4.0_sample-0.mp4'
 
-        return output_model, output_video
+        # return output_model, output_video
+        logger.log(
+            num_steps,
+            unconditional_guidance_scale,
+            seed,
+            mesh_size,
+            mesh_thres,
+        )
 
         camera = th.load('assets/objv_eval_pose.pt', map_location=dist_util.dev())[:]
         inp_img = th.from_numpy(inp_img).permute(2,0,1).unsqueeze(0) / 127.5 - 1 # to [-1,1]

@@ -722,7 +733,7 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         ucg_keys = [self.cond_key] # i23d
 
-        sampling_kwargs = {'cfg_scale': unconditional_guidance_scale}
+        sampling_kwargs = {'cfg_scale': unconditional_guidance_scale, 'num_steps': num_steps, 'seed': seed}
 
         N = num_samples # hard coded, to update
         z_shape = (

@@ -769,7 +780,7 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
         th.cuda.empty_cache()
 
         # ! render sampled latent
-        name_prefix = f'
+        name_prefix = f'cfg_{unconditional_guidance_scale}_sample-{i}'
 
         if self.cond_key == 'caption':
             name_prefix = f'{name_prefix}_{prompt}'

@@ -784,7 +795,9 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
                 save_img=save_img,
                 render_reference=batch,
                 export_mesh=export_mesh,
-                render_all=True)
+                render_all=True,
+                mesh_size=mesh_size,
+                mesh_thres=mesh_thres)
 
             all_vid_dump_path.append(vid_dump_path)
             all_mesh_dump_path.append(mesh_dump_path)

@@ -810,4 +823,4 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         # else:
         batch_c = {self.cond_key: inp_img.to(dist_util.dev()).to(self.dtype)}
-        return sample_and_save(batch_c)
+        return sample_and_save(batch_c)
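Net effect of the changes above: the early-return stub that short-circuited inference is commented out, and the previously hard-coded sampling and meshing knobs (`num_steps`, `seed`, `mesh_size`, `mesh_thres`) become keyword arguments, the first two forwarded through `sampling_kwargs` and the last two through the renderer. A hedged sketch of a call site, assuming an already-constructed `FlowMatchingEngine` named `engine` and an HxWx3 uint8 numpy image `img` (the method normalizes it to [-1, 1] itself); this is not a runnable standalone script:

```python
# Sketch under assumptions: `engine` and `img` are provided by the surrounding app.
paths = engine.eval_i23d_and_export(
    img,
    num_steps=250,                     # flow-matching sampling steps, via sampling_kwargs
    seed=42,                           # sampling seed, via sampling_kwargs
    mesh_size=192,                     # marching-cubes grid resolution for mesh export
    mesh_thres=10,                     # density threshold for surface extraction
    unconditional_guidance_scale=4.0,  # classifier-free guidance (cfg_scale)
    num_samples=1,
    export_mesh=True,                  # dump an .obj alongside the rendered video
)
```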
nsr/train_util_diffusion.py
CHANGED

@@ -18,6 +18,8 @@ from torch.utils.tensorboard.writer import SummaryWriter
 from tqdm import tqdm
 import matplotlib.pyplot as plt
 
+from safetensors.torch import load_file
+
 from guided_diffusion.gaussian_diffusion import _extract_into_tensor
 from guided_diffusion import dist_util, logger
 from guided_diffusion.fp16_util import MixedPrecisionTrainer

@@ -31,14 +33,10 @@ from guided_diffusion.train_util import (TrainLoop, calc_average_loss,
                                          log_rec3d_loss_dict,
                                          parse_resume_step_from_filename)
 
-
-import mcubes
-import trimesh
 import dnnlib
-from safetensors.torch import load_file
-from huggingface_hub import hf_hub_download
 
 from nsr.camera_utils import FOV_to_intrinsics, LookAtPoseSampler
+from huggingface_hub import hf_hub_download
 
 # AMP
 # from accelerate import Accelerator

@@ -48,6 +46,16 @@ from nsr.camera_utils import FOV_to_intrinsics, LookAtPoseSampler
 # use_amp = False
 # use_amp = True
 
+# Function to generate a rotation matrix for an arbitrary theta along the x-axis
+def rotation_matrix_x(theta_degrees):
+    theta = np.radians(theta_degrees)  # Convert degrees to radians
+    cos_theta = np.cos(theta)
+    sin_theta = np.sin(theta)
+
+    rotation_matrix = np.array([[1, 0, 0],
+                                [0, cos_theta, -sin_theta],
+                                [0, sin_theta, cos_theta]])
+    return rotation_matrix
 
 class TrainLoopDiffusionWithRec(TrainLoop):
     """an interface with rec_model required apis

@@ -173,7 +181,9 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             save_img=False,
             render_reference=None,
             export_mesh=False,
-            render_all=False):
+            render_all=False,
+            mesh_size=192,
+            mesh_thres=10):
 
         planes *= self.triplane_scaling_divider # if setting clip_denoised=True, the sampled planes will lie in [-1,1]. Thus, values beyond [+- std] will be abandoned in this version. Move to IN for later experiments.
 

@@ -196,9 +206,8 @@ class TrainLoopDiffusionWithRec(TrainLoop):
                 behaviour='decode_after_vae_no_render'))
 
         if export_mesh:
-
-
-            mesh_thres = 10 # TODO, requires tuning
+            import mcubes
+            import trimesh
             dump_path = f'{logger.get_dir()}/mesh/'
 
             os.makedirs(dump_path, exist_ok=True)

@@ -220,6 +229,10 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             vtx_colors = rec_model.decoder.forward_points(ddpm_latent['latent_after_vit'], vtx_tensor)['rgb'].float().squeeze(0).cpu().numpy() # (0, 1)
             vtx_colors = (vtx_colors.clip(0,1) * 255).astype(np.uint8)
 
+            # rotate mesh along x dim
+            vtx = np.transpose(rotation_matrix_x(-90) @ np.transpose(vtx))
+
+
             mesh = trimesh.Trimesh(vertices=vtx, faces=faces, vertex_colors=vtx_colors)
             # st()
             # mesh = trimesh.Trimesh(

@@ -227,16 +240,17 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             # faces=faces,
             # )
 
-            mesh_dump_path = os.path.join(dump_path, f'{name_prefix}.
-            mesh.export(mesh_dump_path, '
+            mesh_dump_path = os.path.join(dump_path, f'{name_prefix}.obj')
+            mesh.export(mesh_dump_path, 'obj')
 
-
+            logger.log(f"Mesh dumped to {mesh_dump_path}")
             del grid_out, mesh
             th.cuda.empty_cache()
             # return
 
+        vid_dump_path = f'{logger.get_dir()}/triplane_{name_prefix}.mp4'
         video_out = imageio.get_writer(
-            f'{logger.get_dir()}/triplane_{name_prefix}.mp4',
+            vid_dump_path,
             mode='I',
             fps=15,
             codec='libx264')

@@ -331,8 +345,7 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             ],
             dim=-1) # B, 3, H, W
 
-            if
-            # if save_img:
+            if save_img:
                 for batch_idx in range(gen_img.shape[0]):
                     sampled_img = Image.fromarray(
                         (gen_img[batch_idx].permute(1, 2, 0).cpu().numpy() *

@@ -357,11 +370,12 @@ class TrainLoopDiffusionWithRec(TrainLoop):
         # if not save_img:
         video_out.close()
         del video_out
-        print('logged video to: ',
-              f'{logger.get_dir()}/triplane_{name_prefix}.mp4')
+        print('logged video to: ', f'{vid_dump_path}')
 
         del vis, pred_vis, micro, pred,
 
+        return vid_dump_path, mesh_dump_path
+
     def _init_optim_groups(self, rec_model, freeze_decoder=False):
         """for initializing the reconstruction model; fixing decoder part.
         """
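`rotation_matrix_x` is the standard rotation about the x axis, and the export path applies it to the marching-cubes vertices as `np.transpose(rotation_matrix_x(-90) @ np.transpose(vtx))`, i.e. `vtx @ R.T` for row-vector vertices. A standalone check of what the -90 degree rotation does to the coordinate axes (aligning the mesh with the `gr.Model3D` viewer's up axis is a plausible motivation, but the commit does not say):

```python
import numpy as np

def rotation_matrix_x(theta_degrees):
    # Same construction as the helper added in this commit.
    theta = np.radians(theta_degrees)
    cos_theta, sin_theta = np.cos(theta), np.sin(theta)
    return np.array([[1, 0, 0],
                     [0, cos_theta, -sin_theta],
                     [0, sin_theta, cos_theta]])

R = rotation_matrix_x(-90)
vtx = np.array([[0.0, 1.0, 0.0],   # a vertex on the +y axis
                [0.0, 0.0, 1.0]])  # a vertex on the +z axis
rotated = np.transpose(R @ np.transpose(vtx))  # the commit's exact formulation
print(np.round(rotated))
# [[ 0.  0. -1.]   +y maps to -z
#  [ 0.  1.  0.]]  +z maps to +y
```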
requirements.txt
CHANGED

@@ -31,4 +31,5 @@ safetensors
 matplotlib
 git+https://github.com/nupurkmr9/vision-aided-gan
 PyMCubes
-trimesh
+trimesh
+gradio==4.29
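The `trimesh` remove/re-add pair most likely reflects a restored trailing newline rather than a content change, and the new `gradio==4.29` pin presumably guarantees that the `loop` argument now passed to `gr.Video` in app.py exists (an assumption; the commit message does not say). A hypothetical sanity check for that assumption:

```python
# Hypothetical check (not part of the commit): confirm the installed Gradio
# exposes the `loop` argument that app.py now passes to gr.Video.
import inspect
import gradio as gr

params = inspect.signature(gr.Video.__init__).parameters
assert "loop" in params, "gr.Video lacks `loop`; install the pinned gradio==4.29"
print(f"gradio {gr.__version__}: gr.Video supports loop (default={params['loop'].default})")
```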