da03 committed on
Commit 9d0127f · 1 Parent(s): 487aaae
main.py
CHANGED
@@ -168,6 +168,15 @@ def normalize_images(images, target_range=(-1, 1)):
         return images / 255.0
     else:
         raise ValueError(f"Unsupported target range: {target_range}")
+
+def normalize_image(image, target_range=(-1, 1)):
+    image = image.astype(np.float32)
+    if target_range == (-1, 1):
+        return image / 127.5 - 1
+    elif target_range == (0, 1):
+        return image / 255.0
+    else:
+        raise ValueError(f"Unsupported target range: {target_range}")
 
 def denormalize_image(image, source_range=(-1, 1)):
     if source_range == (-1, 1):
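The new normalize_image mirrors the existing normalize_images but works on a single frame and casts to float32 first. A minimal standalone sketch of what the (-1, 1) path computes and its assumed inverse (the body of denormalize_image is only partly visible in this diff, so the round-trip here is an assumption):

    import numpy as np

    frame = np.random.randint(0, 256, (384, 512, 3), dtype=np.uint8)   # H x W x C uint8 pixels
    norm = frame.astype(np.float32) / 127.5 - 1                        # what normalize_image(frame) returns for (-1, 1)
    assert norm.min() >= -1.0 and norm.max() <= 1.0
    restored = (norm + 1.0) * 127.5                                    # assumed inverse, per denormalize_image's contract
    assert np.abs(restored - frame.astype(np.float32)).max() < 1e-3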
@@ -195,28 +204,27 @@ def format_action(action_str, is_padding=False, is_leftclick=False):
     # Format with sign and proper spacing
     return prefix + " " + f"{'+ ' if x >= 0 else '- '}{x_spaced} : {'+ ' if y >= 0 else '- '}{y_spaced}"
 
-def predict_next_frame(previous_frames: List[np.ndarray], previous_actions: List[Tuple[str, List[int]]]) -> np.ndarray:
+def predict_next_frame(previous_frames: List[np.ndarray, Tuple[str, np.ndarray]], previous_actions: List[Tuple[str, List[int]]]) -> np.ndarray:
     width, height = 512, 384
     all_click_positions = []
     initial_images = load_initial_images(width, height)
     print ('length of previous_frames', len(previous_frames))
+    padding_image = torch.zeros((height//8, width//8, 4)).to(device)
 
     # Prepare the image sequence for the model
     assert len(initial_images) == 32
     image_sequence = previous_frames[-32:] # Take the last 7 frames
     i = 1
     while len(image_sequence) < 32:
-        image_sequence.insert(0,
+        image_sequence.insert(0, padding_image)
         i += 1
         #image_sequence.append(initial_images[len(image_sequence)])
-
+
     # Convert the image sequence to a tensor and concatenate in the channel dimension
-    image_sequence_tensor = torch.from_numpy(normalize_images(
-    image_sequence_tensor = image_sequence_tensor.to(device)
-
-
-    data_min = -27.681446075439453
-    data_max = 30.854148864746094
+    #image_sequence_tensor = torch.from_numpy(normalize_images(image_sequence_list, target_range=(-1, 1)))
+    #image_sequence_tensor = image_sequence_tensor.to(device)
+    image_sequence_tensor = torch.cat(image_sequence, dim=1)
+
     #image_sequence_tensor = (image_sequence_tensor - data_mean) / data_std
 
     # Prepare the prompt based on the previous actions
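Two things change in this hunk: the history buffer is now padded with zero tensors of shape (height//8, width//8, 4), which looks like the 4-channel VAE latent grid at 1/8 resolution rather than a pixel image, and the conditioning tensor is built by concatenating the 32 history entries directly with torch.cat instead of normalizing uint8 frames. A small sketch of that padding-and-concatenation step, with standalone names and CPU tensors as assumptions:

    import torch

    height, width, context_len = 384, 512, 32
    padding_latent = torch.zeros((height // 8, width // 8, 4))               # zero "latent" stands in for missing history

    history = [torch.randn(height // 8, width // 8, 4) for _ in range(5)]    # pretend only 5 real entries exist so far
    sequence = history[-context_len:]
    while len(sequence) < context_len:
        sequence.insert(0, padding_latent)                                   # left-pad with zeros, as the new code does

    conditioning = torch.cat(sequence, dim=1)                                # concatenate along dim=1, as in the diff
    print(conditioning.shape)                                                # torch.Size([48, 2048, 4]) with these sizes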
@@ -318,7 +326,7 @@ def predict_next_frame(previous_frames: List[np.ndarray], previous_actions: List
     #print ('changing L to N')
 
     # Generate the next frame
-    new_frame = sample_frame(model, prompt, image_sequence_tensor, pos_maps=pos_maps, leftclick_maps=leftclick_maps)
+    new_frame, new_frame_feedback = sample_frame(model, prompt, image_sequence_tensor, pos_maps=pos_maps, leftclick_maps=leftclick_maps)
 
     # Convert the generated frame to the correct format
     new_frame = new_frame.transpose(1, 2, 0)
@@ -333,7 +341,7 @@ def predict_next_frame(previous_frames: List[np.ndarray], previous_actions: List
     #x, y, action_type = parse_action_string(action_descriptions[-1])
 
 
-    return new_frame_with_trace, new_frame_denormalized
+    return new_frame_with_trace, new_frame_denormalized, new_frame_feedback
 
 # WebSocket endpoint for continuous user interaction
 @app.websocket("/ws")
@@ -513,10 +521,10 @@ async def websocket_endpoint(websocket: WebSocket):
     #if DEBUG_TEACHER_FORCING:
     # print ('predicting', f"record_10003/image_{117+len(previous_frames)}.png")
     print ('previous_actions', previous_actions)
-    next_frame, next_frame_append = predict_next_frame(previous_frames, previous_actions)
+    next_frame, next_frame_append, next_frame_feedback = predict_next_frame(previous_frames, previous_actions)
     feedback = True
     if feedback:
-        previous_frames.append(
+        previous_frames.append(next_frame_feedback)
     else:
         #previous_frames = []
         previous_actions = []
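Taken together, the remaining main.py hunks close a feedback loop: sample_frame now also hands back the pre-decode latent, predict_next_frame forwards it as a third return value, and the websocket handler appends that latent (instead of a re-encoded pixel frame) to previous_frames whenever feedback is enabled. A rough, runnable sketch of the control flow with dummy tensors standing in for the real model calls:

    import torch

    def model_step(history, actions):
        # Stand-in for sample_frame(...): returns (decoded frame, pre-decode latent), as in the new utils.py.
        latent = torch.zeros(48, 64, 4)
        decoded = torch.zeros(3, 384, 512)
        return decoded, latent

    feedback = True
    previous_frames, previous_actions = [], []
    for action in [("move", [10, 20]), ("leftclick", [100, 50])]:   # stand-in for incoming websocket messages
        previous_actions.append(action)
        decoded, latent_feedback = model_step(previous_frames, previous_actions)
        if feedback:
            previous_frames.append(latent_feedback)                 # condition future steps on the model's own latent
        else:
            previous_actions = []                                   # mirrors the reset branch in the handler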
utils.py
CHANGED
@@ -45,13 +45,13 @@ def sample_frame(model: LatentDiffusion, prompt: str, image_sequence: torch.Tens
     #print (c['c_crossattn'][0])
     print (prompt)
     c = {}
-    c = model.enc_concat_seq(c, c_dict, 'c_concat')
+    #c = model.enc_concat_seq(c, c_dict, 'c_concat')
     # Zero out the corresponding subtensors in c_concat for padding images
-    padding_mask = torch.isclose(image_sequence, torch.tensor(-1.0), rtol=1e-5, atol=1e-5).all(dim=(1, 2, 3)).unsqueeze(0)
-    print (padding_mask)
-    padding_mask = padding_mask.repeat(1, 4) # Repeat mask 4 times for each projected channel
-    print (image_sequence.shape, padding_mask.shape, c['c_concat'].shape)
-    c['c_concat'] = c['c_concat'] * (~padding_mask.unsqueeze(-1).unsqueeze(-1)) # Zero out the corresponding features
+    #padding_mask = torch.isclose(image_sequence, torch.tensor(-1.0), rtol=1e-5, atol=1e-5).all(dim=(1, 2, 3)).unsqueeze(0)
+    #print (padding_mask)
+    #padding_mask = padding_mask.repeat(1, 4) # Repeat mask 4 times for each projected channel
+    #print (image_sequence.shape, padding_mask.shape, c['c_concat'].shape)
+    #c['c_concat'] = c['c_concat'] * (~padding_mask.unsqueeze(-1).unsqueeze(-1)) # Zero out the corresponding features
     data_mean = -0.54
     data_std = 6.78
     data_min = -27.681446075439453
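This hunk disables the earlier padding handling on the utils.py side: previously, frames that were all -1 in the input sequence were treated as padding and their slice of c_concat was zeroed out; with zero latents now supplied directly by predict_next_frame, that masking is commented out. For reference, the idea behind the old mask, in a simplified standalone form (the exact layout and channel ordering of c_concat are assumptions):

    import torch

    image_sequence = torch.stack([torch.full((3, 48, 64), -1.0), torch.randn(3, 48, 64)])   # frame 0 is padding
    features = torch.randn(1, image_sequence.shape[0] * 4, 48, 64)                          # assume 4 feature channels per frame

    is_padding = (image_sequence.reshape(image_sequence.shape[0], -1) == -1.0).all(dim=1)   # per-frame padding flag
    per_channel = is_padding.repeat_interleave(4).view(1, -1, 1, 1)                         # expand the flag to each channel
    features = features * (~per_channel)                                                    # zero out the padded frames' features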
@@ -108,6 +108,7 @@ def sample_frame(model: LatentDiffusion, prompt: str, image_sequence: torch.Tens
     data_max = 30.854148864746094
     x_samples_ddim = samples_ddim
     x_samples_ddim = x_samples_ddim * data_std + data_mean
+    x_samples_ddim_feedback = x_samples_ddim
     x_samples_ddim = model.decode_first_stage(x_samples_ddim)
     print ('dfsf3')
     #x_samples_ddim = pos_map.to(c['c_concat'].device).unsqueeze(0).expand(-1, 3, -1, -1)
@@ -115,7 +116,7 @@ def sample_frame(model: LatentDiffusion, prompt: str, image_sequence: torch.Tens
     #x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
     x_samples_ddim = torch.clamp(x_samples_ddim, min=-1.0, max=1.0)
 
-    return x_samples_ddim.squeeze(0).cpu().numpy()
+    return x_samples_ddim.squeeze(0).cpu().numpy(), x_samples_ddim_feedback.squeeze(0)
 
 # Global variables for model and device
 #model = None
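On the return side, sample_frame keeps the un-decoded latent (x_samples_ddim_feedback) captured just before decode_first_stage and returns it next to the decoded, clamped frame, so the caller can push it straight back into the conditioning history without re-encoding pixels. A small standalone sketch of that return pattern (dummy tensors replace the real sampler and first-stage decoder):

    import torch

    def sample_frame_sketch():
        data_mean, data_std = -0.54, 6.78
        samples = torch.randn(1, 4, 48, 64)                            # stand-in for the DDIM sampler output
        latent = samples * data_std + data_mean                        # undo latent normalization, as in utils.py
        latent_feedback = latent                                       # keep the pre-decode latent for the feedback path
        decoded = latent.mean(dim=1, keepdim=True).repeat(1, 3, 8, 8)  # stand-in for model.decode_first_stage
        decoded = torch.clamp(decoded, min=-1.0, max=1.0)
        return decoded.squeeze(0).cpu().numpy(), latent_feedback.squeeze(0)

    frame, latent = sample_frame_sketch()
    print(frame.shape, latent.shape)                                   # (3, 384, 512) and torch.Size([4, 48, 64])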