Spaces · Running on Zero
NIRVANALAN committed · Commit 52d2875 · Parent: c00df70

update

Files changed:
- app.py +2 -1
- nsr/lsgm/flow_matching_trainer.py +21 -8
- nsr/train_util_diffusion.py +31 -17
- requirements.txt +2 -1
app.py
CHANGED

@@ -341,7 +341,8 @@ def main(args):
     with gr.Row():
         with gr.Tab("Reconstruction"):
             with gr.Column():
-                output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True)
+                # output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True)
+                output_video = gr.Video(value=None, width=384, label="Rendered Video", autoplay=True, loop=True)
                 output_model = gr.Model3D(
                     height=384,
                     clear_color=(1,1,1,1),
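The only behavioral change here is the added `loop=True`, so the rendered orbit video restarts automatically instead of stopping after one pass. Below is a minimal, self-contained sketch of the same component wiring; the `render` stub is a hypothetical placeholder, not the Space's actual image-to-3D pipeline:

```python
# Sketch only: demonstrates gr.Video(..., autoplay=True, loop=True) as used in app.py.
# `render` is a hypothetical stand-in for the Space's reconstruction pipeline.
import gradio as gr

def render(video_path):
    return video_path  # echo the input back so the output player has something to loop

with gr.Blocks() as demo:
    with gr.Row():
        inp = gr.Video(label="Input Video")
        # loop=True restarts playback when the rendered video reaches its end
        out = gr.Video(width=384, label="Rendered Video", autoplay=True, loop=True)
    gr.Button("Render").click(render, inputs=inp, outputs=out)

if __name__ == "__main__":
    demo.launch()
```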
nsr/lsgm/flow_matching_trainer.py
CHANGED

@@ -678,25 +678,36 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         self.ddpm_model.train()
 
+
     @th.inference_mode()
     def eval_i23d_and_export(
             self,
             inp_img,
+            num_steps=250,
+            seed=42,
+            mesh_size=192,
+            mesh_thres=10,
+            unconditional_guidance_scale=4.0, # default value in neural ode
             # camera,
             prompt="",
             save_img=False,
             use_train_trajectory=False,
             num_samples=1,
             num_instances=1,
-            unconditional_guidance_scale=4.0, # default value in neural ode
             export_mesh=True,
             **kwargs,
     ):
 
-        output_model= './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/mesh/cfg=4.0_sample-0.obj'
-        output_video = './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/triplane_cfg=4.0_sample-0.mp4'
+        # output_model, output_video = './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/mesh/cfg=4.0_sample-0.obj', './logs/LSGM/inference/Objaverse/i23d/dit-L2/gradio_app/triplane_cfg=4.0_sample-0.mp4'
 
-        return output_model, output_video
+        # return output_model, output_video
+        logger.log(
+            num_steps,
+            unconditional_guidance_scale,
+            seed,
+            mesh_size,
+            mesh_thres,
+        )
 
         camera = th.load('assets/objv_eval_pose.pt', map_location=dist_util.dev())[:]
         inp_img = th.from_numpy(inp_img).permute(2,0,1).unsqueeze(0) / 127.5 - 1 # to [-1,1]

@@ -722,7 +733,7 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         ucg_keys = [self.cond_key] # i23d
 
-        sampling_kwargs = {'cfg_scale': unconditional_guidance_scale}
+        sampling_kwargs = {'cfg_scale': unconditional_guidance_scale, 'num_steps': num_steps, 'seed': seed}
 
         N = num_samples # hard coded, to update
         z_shape = (

@@ -769,7 +780,7 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
         th.cuda.empty_cache()
 
         # ! render sampled latent
-        name_prefix = f'
+        name_prefix = f'cfg_{unconditional_guidance_scale}_sample-{i}'
 
         if self.cond_key == 'caption':
             name_prefix = f'{name_prefix}_{prompt}'

@@ -784,7 +795,9 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
                 save_img=save_img,
                 render_reference=batch,
                 export_mesh=export_mesh,
-                render_all=True)
+                render_all=True,
+                mesh_size=mesh_size,
+                mesh_thres=mesh_thres)
 
             all_vid_dump_path.append(vid_dump_path)
             all_mesh_dump_path.append(mesh_dump_path)

@@ -810,4 +823,4 @@ class FlowMatchingEngine(TrainLoop3DDiffusionLSGM_crossattn):
 
         # else:
         batch_c = {self.cond_key: inp_img.to(dist_util.dev()).to(self.dtype)}
-        return sample_and_save(batch_c)
+        return sample_and_save(batch_c)
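Net effect of the changes above: the early-return stub that short-circuited inference is commented out, and the previously hard-coded sampling and meshing knobs (`num_steps`, `seed`, `mesh_size`, `mesh_thres`) become keyword arguments, the first two forwarded through `sampling_kwargs` and the last two through the renderer. A hedged sketch of a call site, assuming an already-constructed `FlowMatchingEngine` named `engine` and an HxWx3 uint8 numpy image `img` (the method normalizes it to [-1, 1] itself); this is not a runnable standalone script:

```python
# Sketch under assumptions: `engine` and `img` are provided by the surrounding app.
paths = engine.eval_i23d_and_export(
    img,
    num_steps=250,                     # flow-matching sampling steps, via sampling_kwargs
    seed=42,                           # sampling seed, via sampling_kwargs
    mesh_size=192,                     # marching-cubes grid resolution for mesh export
    mesh_thres=10,                     # density threshold for surface extraction
    unconditional_guidance_scale=4.0,  # classifier-free guidance (cfg_scale)
    num_samples=1,
    export_mesh=True,                  # dump an .obj alongside the rendered video
)
```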
nsr/train_util_diffusion.py
CHANGED

@@ -18,6 +18,8 @@ from torch.utils.tensorboard.writer import SummaryWriter
 from tqdm import tqdm
 import matplotlib.pyplot as plt
 
+from safetensors.torch import load_file
+
 from guided_diffusion.gaussian_diffusion import _extract_into_tensor
 from guided_diffusion import dist_util, logger
 from guided_diffusion.fp16_util import MixedPrecisionTrainer

@@ -31,14 +33,10 @@ from guided_diffusion.train_util import (TrainLoop, calc_average_loss,
                                          log_rec3d_loss_dict,
                                          parse_resume_step_from_filename)
 
-
-import mcubes
-import trimesh
 import dnnlib
-from safetensors.torch import load_file
-from huggingface_hub import hf_hub_download
 
 from nsr.camera_utils import FOV_to_intrinsics, LookAtPoseSampler
+from huggingface_hub import hf_hub_download
 
 # AMP
 # from accelerate import Accelerator

@@ -48,6 +46,16 @@ from nsr.camera_utils import FOV_to_intrinsics, LookAtPoseSampler
 # use_amp = False
 # use_amp = True
 
+# Function to generate a rotation matrix for an arbitrary theta along the x-axis
+def rotation_matrix_x(theta_degrees):
+    theta = np.radians(theta_degrees)  # Convert degrees to radians
+    cos_theta = np.cos(theta)
+    sin_theta = np.sin(theta)
+
+    rotation_matrix = np.array([[1, 0, 0],
+                                [0, cos_theta, -sin_theta],
+                                [0, sin_theta, cos_theta]])
+    return rotation_matrix
 
 class TrainLoopDiffusionWithRec(TrainLoop):
     """an interface with rec_model required apis

@@ -173,7 +181,9 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             save_img=False,
             render_reference=None,
             export_mesh=False,
-            render_all=False):
+            render_all=False,
+            mesh_size=192,
+            mesh_thres=10):
 
         planes *= self.triplane_scaling_divider # if setting clip_denoised=True, the sampled planes will lie in [-1,1]. Thus, values beyond [+- std] will be abandoned in this version. Move to IN for later experiments.
 

@@ -196,9 +206,8 @@ class TrainLoopDiffusionWithRec(TrainLoop):
                 behaviour='decode_after_vae_no_render'))
 
         if export_mesh:
-
-
-            mesh_thres = 10 # TODO, requires tuning
+            import mcubes
+            import trimesh
             dump_path = f'{logger.get_dir()}/mesh/'
 
             os.makedirs(dump_path, exist_ok=True)

@@ -220,6 +229,10 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             vtx_colors = rec_model.decoder.forward_points(ddpm_latent['latent_after_vit'], vtx_tensor)['rgb'].float().squeeze(0).cpu().numpy() # (0, 1)
             vtx_colors = (vtx_colors.clip(0,1) * 255).astype(np.uint8)
 
+            # rotate mesh along x dim
+            vtx = np.transpose(rotation_matrix_x(-90) @ np.transpose(vtx))
+
+
             mesh = trimesh.Trimesh(vertices=vtx, faces=faces, vertex_colors=vtx_colors)
             # st()
             # mesh = trimesh.Trimesh(

@@ -227,16 +240,17 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             # faces=faces,
             # )
 
-            mesh_dump_path = os.path.join(dump_path, f'{name_prefix}.
-            mesh.export(mesh_dump_path, '
+            mesh_dump_path = os.path.join(dump_path, f'{name_prefix}.obj')
+            mesh.export(mesh_dump_path, 'obj')
 
-
+            logger.log(f"Mesh dumped to {mesh_dump_path}")
             del grid_out, mesh
             th.cuda.empty_cache()
             # return
 
+        vid_dump_path = f'{logger.get_dir()}/triplane_{name_prefix}.mp4'
         video_out = imageio.get_writer(
-            f'{logger.get_dir()}/triplane_{name_prefix}.mp4',
+            vid_dump_path,
             mode='I',
             fps=15,
             codec='libx264')

@@ -331,8 +345,7 @@ class TrainLoopDiffusionWithRec(TrainLoop):
             ],
             dim=-1) # B, 3, H, W
 
-            if
-            # if save_img:
+            if save_img:
                 for batch_idx in range(gen_img.shape[0]):
                     sampled_img = Image.fromarray(
                         (gen_img[batch_idx].permute(1, 2, 0).cpu().numpy() *

@@ -357,11 +370,12 @@ class TrainLoopDiffusionWithRec(TrainLoop):
         # if not save_img:
         video_out.close()
         del video_out
-        print('logged video to: ',
-              f'{logger.get_dir()}/triplane_{name_prefix}.mp4')
+        print('logged video to: ', f'{vid_dump_path}')
 
         del vis, pred_vis, micro, pred,
 
+        return vid_dump_path, mesh_dump_path
+
     def _init_optim_groups(self, rec_model, freeze_decoder=False):
         """for initializing the reconstruction model; fixing decoder part.
         """
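`rotation_matrix_x` is the standard rotation about the x axis, and the export path applies it to the marching-cubes vertices as `np.transpose(rotation_matrix_x(-90) @ np.transpose(vtx))`, i.e. `vtx @ R.T` for row-vector vertices. A standalone check of what the -90 degree rotation does to the coordinate axes (aligning the mesh with the `gr.Model3D` viewer's up axis is a plausible motivation, but the commit does not say):

```python
import numpy as np

def rotation_matrix_x(theta_degrees):
    # Same construction as the helper added in this commit.
    theta = np.radians(theta_degrees)
    cos_theta, sin_theta = np.cos(theta), np.sin(theta)
    return np.array([[1, 0, 0],
                     [0, cos_theta, -sin_theta],
                     [0, sin_theta, cos_theta]])

R = rotation_matrix_x(-90)
vtx = np.array([[0.0, 1.0, 0.0],   # a vertex on the +y axis
                [0.0, 0.0, 1.0]])  # a vertex on the +z axis
rotated = np.transpose(R @ np.transpose(vtx))  # the commit's exact formulation
print(np.round(rotated))
# [[ 0.  0. -1.]   +y maps to -z
#  [ 0.  1.  0.]]  +z maps to +y
```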
requirements.txt
CHANGED

@@ -31,4 +31,5 @@ safetensors
 matplotlib
 git+https://github.com/nupurkmr9/vision-aided-gan
 PyMCubes
-trimesh
+trimesh
+gradio==4.29
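The `trimesh` remove/re-add pair most likely reflects a restored trailing newline rather than a content change, and the new `gradio==4.29` pin presumably guarantees that the `loop` argument now passed to `gr.Video` in app.py exists (an assumption; the commit message does not say). A hypothetical sanity check for that assumption:

```python
# Hypothetical check (not part of the commit): confirm the installed Gradio
# exposes the `loop` argument that app.py now passes to gr.Video.
import inspect
import gradio as gr

params = inspect.signature(gr.Video.__init__).parameters
assert "loop" in params, "gr.Video lacks `loop`; install the pinned gradio==4.29"
print(f"gradio {gr.__version__}: gr.Video supports loop (default={params['loop'].default})")
```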