Spaces:

yslan
/

worldmem

Running on Zero

App Files Files Community

xizaoqu committed on Apr 10

Commit

0d5deae

1 Parent(s): 1e18469

update precision

Browse files

Files changed (3) hide show

algorithms/worldmem/df_video.py +79 -70
algorithms/worldmem/models/dit.py +4 -0
app.py +30 -28

algorithms/worldmem/df_video.py CHANGED Viewed

@@ -791,22 +791,22 @@ class WorldMemMinecraft(DiffusionForcingBase):
         return
     @torch.no_grad()
-    def interactive(self, first_frame, curr_actions, first_pose, context_frames_idx, device,
                     self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx):
         condition_similar_length = self.condition_similar_length
         if self_frames is None:
             first_frame = torch.from_numpy(first_frame)
-            curr_actions = torch.from_numpy(curr_actions)
             first_pose = torch.from_numpy(first_pose)
             first_frame_encode = self.encode(first_frame[None, None].to(device))
             self_frames = first_frame_encode.cpu()
-            self_actions = curr_actions[None, None].to(device)
             self_poses = first_pose[None, None].to(device)
             new_c2w_mat = euler_to_camera_to_world_matrix(first_pose)
             self_memory_c2w = new_c2w_mat[None, None].to(device)
-            self_frame_idx = torch.tensor([[context_frames_idx]]).to(device)
             return first_frame.cpu().numpy(), self_frames.cpu().numpy(), self_actions.cpu().numpy(), self_poses.cpu().numpy(), self_memory_c2w.cpu().numpy(), self_frame_idx.cpu().numpy()
         else:
             self_frames = torch.from_numpy(self_frames)
@@ -814,9 +814,26 @@ class WorldMemMinecraft(DiffusionForcingBase):
             self_poses = torch.from_numpy(self_poses).to(device)
             self_memory_c2w = torch.from_numpy(self_memory_c2w).to(device)
             self_frame_idx = torch.from_numpy(self_frame_idx).to(device)
-            curr_actions = curr_actions.to(device)
-            last_frame = self_frames[-1].clone()
             last_pose_condition = self_poses[-1].clone()
             last_pose_condition[:,3:] = last_pose_condition[:,3:] // 15
             new_pose_condition_offset = self.pose_prediction_model(last_frame.to(device), curr_actions[None], last_pose_condition)
@@ -829,88 +846,80 @@ class WorldMemMinecraft(DiffusionForcingBase):
             self_poses = torch.cat([self_poses, new_pose_condition[None]])
             new_c2w_mat = euler_to_camera_to_world_matrix(new_pose_condition)
             self_memory_c2w = torch.cat([self_memory_c2w, new_c2w_mat[None]])
-            self_frame_idx = torch.cat([self_frame_idx, torch.tensor([[context_frames_idx]]).to(device)])
-        conditions = self_actions.clone()
-        pose_conditions = self_poses.clone()
-        c2w_mat = self_memory_c2w .clone()
-        frame_idx = self_frame_idx.clone()
-        curr_frame = 0
-        horizon = 1
-        batch_size = 1
-        n_frames = curr_frame + horizon
-        # context
-        n_context_frames = context_frames_idx // self.frame_stack
-        xs_pred = self_frames[:n_context_frames].clone()
-        curr_frame += n_context_frames
-        pbar = tqdm(total=n_frames, initial=curr_frame, desc="Sampling")
-        # generation on frame
-        scheduling_matrix = self._generate_scheduling_matrix(horizon)
-        chunk = torch.randn((horizon, batch_size, *xs_pred.shape[2:])).to(xs_pred.device)
-        chunk = torch.clamp(chunk, -self.clip_noise, self.clip_noise)
-        xs_pred = torch.cat([xs_pred, chunk], 0)
-        # sliding window: only input the last n_tokens frames
-        start_frame = max(0, curr_frame + horizon - self.n_tokens)
-        pbar.set_postfix(
-            {
-                "start": start_frame,
-                "end": curr_frame + horizon,
-            }
-        )
-        # Handle condition similarity logic
-        if condition_similar_length:
-            random_idx = self._generate_condition_indices(
-                curr_frame, condition_similar_length, xs_pred, pose_conditions, frame_idx
-            )
-            # random_idx = np.unique(random_idx)[:, None]
-            # condition_similar_length = len(random_idx)
-            xs_pred = torch.cat([xs_pred, xs_pred[random_idx[:, range(xs_pred.shape[1])], range(xs_pred.shape[1])].clone()], 0)
-        # Prepare input conditions and pose conditions
-        input_condition, input_pose_condition, frame_idx_list = self._prepare_conditions(
-            start_frame, curr_frame, horizon, conditions, pose_conditions, c2w_mat, frame_idx, random_idx,
-            image_width=first_frame.shape[-1], image_height=first_frame.shape[-2]
-        )
-        # Perform sampling for each step in the scheduling matrix
-        for m in range(scheduling_matrix.shape[0] - 1):
-            from_noise_levels, to_noise_levels = self._prepare_noise_levels(
-                scheduling_matrix, m, curr_frame, batch_size, condition_similar_length
-            )
-            xs_pred[start_frame:] = self.diffusion_model.sample_step(
-                xs_pred[start_frame:].to(input_condition.device),
-                input_condition,
-                input_pose_condition,
-                from_noise_levels[start_frame:],
-                to_noise_levels[start_frame:],
-                current_frame=curr_frame,
-                mode="validation",
-                reference_length=condition_similar_length,
-                frame_idx=frame_idx_list
-            ).cpu()
-        if condition_similar_length:
-            xs_pred = xs_pred[:-condition_similar_length]
-        curr_frame += horizon
-        pbar.update(horizon)
         self_frames = torch.cat([self_frames, xs_pred[n_context_frames:]])
         xs_pred = self.decode(xs_pred[n_context_frames:].to(device)).cpu()
-        return xs_pred[-1,0].cpu().numpy(), self_frames.cpu().numpy(), self_actions.cpu().numpy(), \
             self_poses.cpu().numpy(), self_memory_c2w.cpu().numpy(), self_frame_idx.cpu().numpy()

         return
     @torch.no_grad()
+    def interactive(self, first_frame, new_actions, first_pose, device,
                     self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx):
         condition_similar_length = self.condition_similar_length
         if self_frames is None:
             first_frame = torch.from_numpy(first_frame)
+            new_actions = torch.from_numpy(new_actions)
             first_pose = torch.from_numpy(first_pose)
             first_frame_encode = self.encode(first_frame[None, None].to(device))
             self_frames = first_frame_encode.cpu()
+            self_actions = new_actions[None, None].to(device)
             self_poses = first_pose[None, None].to(device)
             new_c2w_mat = euler_to_camera_to_world_matrix(first_pose)
             self_memory_c2w = new_c2w_mat[None, None].to(device)
+            self_frame_idx = torch.tensor([[0]]).to(device)
             return first_frame.cpu().numpy(), self_frames.cpu().numpy(), self_actions.cpu().numpy(), self_poses.cpu().numpy(), self_memory_c2w.cpu().numpy(), self_frame_idx.cpu().numpy()
         else:
             self_frames = torch.from_numpy(self_frames)
             self_poses = torch.from_numpy(self_poses).to(device)
             self_memory_c2w = torch.from_numpy(self_memory_c2w).to(device)
             self_frame_idx = torch.from_numpy(self_frame_idx).to(device)
+            new_actions = new_actions.to(device)
+        curr_frame = 0
+        horizon = 1
+        batch_size = 1
+        n_frames = curr_frame + horizon
+        # context
+        n_context_frames = len(self_frames)
+        xs_pred = self_frames[:n_context_frames].clone()
+        curr_frame += n_context_frames
+        pbar = tqdm(total=n_frames, initial=curr_frame, desc="Sampling")
+        for ai in range(len(new_actions)):
+            from time import time
+            start_time = time()
+            last_frame = xs_pred[-1].clone()
+            curr_actions = new_actions[ai]
             last_pose_condition = self_poses[-1].clone()
             last_pose_condition[:,3:] = last_pose_condition[:,3:] // 15
             new_pose_condition_offset = self.pose_prediction_model(last_frame.to(device), curr_actions[None], last_pose_condition)
             self_poses = torch.cat([self_poses, new_pose_condition[None]])
             new_c2w_mat = euler_to_camera_to_world_matrix(new_pose_condition)
             self_memory_c2w = torch.cat([self_memory_c2w, new_c2w_mat[None]])
+            self_frame_idx = torch.cat([self_frame_idx, torch.tensor([[self_frame_idx[-1,0]+1]]).to(device)])
+            conditions = self_actions.clone()
+            pose_conditions = self_poses.clone()
+            c2w_mat = self_memory_c2w .clone()
+            frame_idx = self_frame_idx.clone()
+            # generation on frame
+            scheduling_matrix = self._generate_scheduling_matrix(horizon)
+            chunk = torch.randn((horizon, batch_size, *xs_pred.shape[2:])).to(xs_pred.device)
+            chunk = torch.clamp(chunk, -self.clip_noise, self.clip_noise)
+            xs_pred = torch.cat([xs_pred, chunk], 0)
+            # sliding window: only input the last n_tokens frames
+            start_frame = max(0, curr_frame + horizon - self.n_tokens)
+            pbar.set_postfix(
+                {
+                    "start": start_frame,
+                    "end": curr_frame + horizon,
+                }
+            )
+            # Handle condition similarity logic
+            if condition_similar_length:
+                random_idx = self._generate_condition_indices(
+                    curr_frame, condition_similar_length, xs_pred, pose_conditions, frame_idx
+                )
+                # random_idx = np.unique(random_idx)[:, None]
+                # condition_similar_length = len(random_idx)
+                xs_pred = torch.cat([xs_pred, xs_pred[random_idx[:, range(xs_pred.shape[1])], range(xs_pred.shape[1])].clone()], 0)
+            # Prepare input conditions and pose conditions
+            input_condition, input_pose_condition, frame_idx_list = self._prepare_conditions(
+                start_frame, curr_frame, horizon, conditions, pose_conditions, c2w_mat, frame_idx, random_idx,
+                image_width=first_frame.shape[-1], image_height=first_frame.shape[-2]
+            )
+            mid_time = time()
+            # Perform sampling for each step in the scheduling matrix
+            for m in range(scheduling_matrix.shape[0] - 1):
+                from_noise_levels, to_noise_levels = self._prepare_noise_levels(
+                    scheduling_matrix, m, curr_frame, batch_size, condition_similar_length
+                )
+                xs_pred[start_frame:] = self.diffusion_model.sample_step(
+                    xs_pred[start_frame:].to(input_condition.device),
+                    input_condition,
+                    input_pose_condition,
+                    from_noise_levels[start_frame:],
+                    to_noise_levels[start_frame:],
+                    current_frame=curr_frame,
+                    mode="validation",
+                    reference_length=condition_similar_length,
+                    frame_idx=frame_idx_list
+                ).cpu()
+            end_time = time()
+            print("time:", end_time - start_time, "mid time:", mid_time - start_time)
+            if condition_similar_length:
+                xs_pred = xs_pred[:-condition_similar_length]
+            curr_frame += horizon
+            pbar.update(horizon)
         self_frames = torch.cat([self_frames, xs_pred[n_context_frames:]])
         xs_pred = self.decode(xs_pred[n_context_frames:].to(device)).cpu()
+        return xs_pred.cpu().numpy(), self_frames.cpu().numpy(), self_actions.cpu().numpy(), \
             self_poses.cpu().numpy(), self_memory_c2w.cpu().numpy(), self_frame_idx.cpu().numpy()

algorithms/worldmem/models/dit.py CHANGED Viewed

@@ -487,6 +487,8 @@ class DiT(nn.Module):
         t: (B, T,) tensor of diffusion timesteps
         """
         B, T, C, H, W = x.shape
         # add spatial embeddings
@@ -550,6 +552,8 @@ class DiT(nn.Module):
         # print("self.blocks[0].r_adaLN_modulation[1].weight:", self.blocks[0].r_adaLN_modulation[1].weight)
         # print("self.blocks[0].t_adaLN_modulation[1].weight:", self.blocks[0].t_adaLN_modulation[1].weight)
         return x

         t: (B, T,) tensor of diffusion timesteps
         """
+        from time import time
+        start = time()
         B, T, C, H, W = x.shape
         # add spatial embeddings
         # print("self.blocks[0].r_adaLN_modulation[1].weight:", self.blocks[0].r_adaLN_modulation[1].weight)
         # print("self.blocks[0].t_adaLN_modulation[1].weight:", self.blocks[0].t_adaLN_modulation[1].weight)
+        end_time = time()
+        print("in model time:", end_time - start)
         return x

app.py CHANGED Viewed

@@ -26,6 +26,8 @@ import spaces
 from algorithms.worldmem import WorldMemMinecraft
 from huggingface_hub import hf_hub_download
 ACTION_KEYS = [
     "inventory",
     "ESC",
@@ -142,6 +144,16 @@ def run_local(cfg: DictConfig):
     experiment = build_experiment(cfg, None, None)
     return experiment.exec_interactive(cfg.experiment.tasks[0])
 memory_frames = []
 memory_curr_frame = 0
 input_history = ""
@@ -175,12 +187,12 @@ load_custom_checkpoint(algo=worldmem.diffusion_model, checkpoint_path=cfg.diffus
 load_custom_checkpoint(algo=worldmem.vae, checkpoint_path=cfg.vae_path)
 load_custom_checkpoint(algo=worldmem.pose_prediction_model, checkpoint_path=cfg.pose_predictor_path)
 worldmem.to("cuda").eval()
 actions = np.zeros((1, 25), dtype=np.float32)
 poses = np.zeros((1, 5), dtype=np.float32)
-memory_frames.append(load_image_as_tensor(DEFAULT_IMAGE))
 self_frames = None
 self_actions = None
@@ -190,12 +202,11 @@ self_frame_idx = None
 @spaces.GPU()
-def run_interactive(first_frame, action, first_pose, curr_frame, device, self_frames, self_actions,
                             self_poses, self_memory_c2w, self_frame_idx):
     new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = worldmem.interactive(first_frame,
                                     action,
                                     first_pose,
-                                    curr_frame,
                                     device=device,
                                     self_frames=self_frames,
                                     self_actions=self_actions,
@@ -216,6 +227,7 @@ def generate(keys):
     # print("algo frame:", len(worldmem.frames))
     actions = parse_input_to_tensor(keys)
     global input_history
     global memory_curr_frame
     global self_frames
     global self_actions
@@ -223,26 +235,19 @@ def generate(keys):
     global self_memory_c2w
     global self_frame_idx
-    for i in range(len(actions)):
-        memory_curr_frame += 1
-        new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = run_interactive(memory_frames[0],
-                                      actions[i],
-                                      None,
-                                      memory_curr_frame,
-                                      device=device,
-                                      self_frames=self_frames,
-                                      self_actions=self_actions,
-                                      self_poses=self_poses,
-                                      self_memory_c2w=self_memory_c2w,
-                                      self_frame_idx=self_frame_idx)
-        # print("algo frame:", len(runner.algo.frames))
-        memory_frames.append(new_frame)
-    out_video = np.stack(memory_frames)
-    out_video = out_video.transpose(0,2,3,1)
     out_video = np.clip(out_video, a_min=0.0, a_max=1.0)
     out_video = (out_video * 255).astype(np.uint8)
@@ -268,15 +273,12 @@ def reset():
     self_poses = None
     self_memory_c2w = None
     self_frame_idx = None
-    memory_frames = []
-    memory_frames.append(load_image_as_tensor(DEFAULT_IMAGE).numpy())
-    memory_curr_frame = 0
     input_history = ""
     new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = run_interactive(memory_frames[0],
                                 actions[0],
                                 poses[0],
-                                memory_curr_frame,
                                 device=device,
                                 self_frames=self_frames,
                                 self_actions=self_actions,

 from algorithms.worldmem import WorldMemMinecraft
 from huggingface_hub import hf_hub_download
+torch.set_float32_matmul_precision("high")
 ACTION_KEYS = [
     "inventory",
     "ESC",
     experiment = build_experiment(cfg, None, None)
     return experiment.exec_interactive(cfg.experiment.tasks[0])
+def enable_amp(model, precision="16-mixed"):
+    """Patch ``model.forward`` in place so it runs under CUDA autocast.
+
+    The forward that is bound at call time is captured, and the model's
+    ``forward`` attribute is replaced by a closure that enters
+    ``torch.autocast("cuda", dtype=...)`` before delegating to it.
+
+    Args:
+        model: Object exposing a callable ``forward`` attribute.
+        precision: ``"16-mixed"`` selects ``torch.float16``; every other
+            value selects ``torch.bfloat16``.
+
+    Returns:
+        The same ``model`` object, with ``forward`` replaced.
+    """
+    # Capture the current forward before patching so the wrapper delegates to
+    # the original implementation rather than to itself.
+    original_forward = model.forward
+    def amp_forward(*args, **kwargs):
+        # NOTE(review): any precision string other than "16-mixed" (typos and
+        # "32" included) falls through to bfloat16 without an error.
+        with torch.autocast("cuda", dtype=torch.float16 if precision == "16-mixed" else torch.bfloat16):
+            return original_forward(*args, **kwargs)
+    # NOTE(review): only calls that go through ``model.forward`` are wrapped.
+    # The app invokes ``worldmem.interactive(...)``; whether that path reaches
+    # ``forward`` is not visible here -- confirm autocast is actually active.
+    model.forward = amp_forward
+    return model
 memory_frames = []
 memory_curr_frame = 0
 input_history = ""
 load_custom_checkpoint(algo=worldmem.vae, checkpoint_path=cfg.vae_path)
 load_custom_checkpoint(algo=worldmem.pose_prediction_model, checkpoint_path=cfg.pose_predictor_path)
 worldmem.to("cuda").eval()
+worldmem = enable_amp(worldmem, precision="16-mixed")
 actions = np.zeros((1, 25), dtype=np.float32)
 poses = np.zeros((1, 5), dtype=np.float32)
+memory_frames = load_image_as_tensor(DEFAULT_IMAGE)[None].numpy()
 self_frames = None
 self_actions = None
 @spaces.GPU()
+def run_interactive(first_frame, action, first_pose, device, self_frames, self_actions,
                             self_poses, self_memory_c2w, self_frame_idx):
     new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = worldmem.interactive(first_frame,
                                     action,
                                     first_pose,
                                     device=device,
                                     self_frames=self_frames,
                                     self_actions=self_actions,
     # print("algo frame:", len(worldmem.frames))
     actions = parse_input_to_tensor(keys)
     global input_history
+    global memory_frames
     global memory_curr_frame
     global self_frames
     global self_actions
     global self_memory_c2w
     global self_frame_idx
+    new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = run_interactive(memory_frames[0],
+                                    actions,
+                                    None,
+                                    device=device,
+                                    self_frames=self_frames,
+                                    self_actions=self_actions,
+                                    self_poses=self_poses,
+                                    self_memory_c2w=self_memory_c2w,
+                                    self_frame_idx=self_frame_idx)
+    memory_frames = np.concatenate([memory_frames, new_frame[:,0]])
+    out_video = memory_frames.transpose(0,2,3,1)
     out_video = np.clip(out_video, a_min=0.0, a_max=1.0)
     out_video = (out_video * 255).astype(np.uint8)
     self_poses = None
     self_memory_c2w = None
     self_frame_idx = None
+    memory_frames = load_image_as_tensor(DEFAULT_IMAGE).numpy()[None]
     input_history = ""
     new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = run_interactive(memory_frames[0],
                                 actions[0],
                                 poses[0],
                                 device=device,
                                 self_frames=self_frames,
                                 self_actions=self_actions,