Update app.py
app.py (CHANGED)
@@ -76,8 +76,9 @@ OBJ_ID = 0
 sam2_checkpoint = "checkpoints/edgetam.pt"
 model_cfg = "edgetam.yaml"
 # Ensure predictor is explicitly built for CPU
+# The device is set here and with .to("cpu")
 predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
-predictor.to("cpu") # Explicitly move to CPU
+predictor.to("cpu") # Explicitly move to CPU after building
 print("predictor loaded on CPU")

 # Removed autocast block for maximum CPU compatibility
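The hunk above keeps the whole predictor on the CPU. As a rough standalone illustration of that pattern (a stand-in torch module, not the real build_sam2_video_predictor):

    import torch

    device = torch.device("cpu")                    # never touch CUDA
    model = torch.nn.Conv2d(3, 8, kernel_size=3)    # stand-in for the SAM2 video predictor
    model = model.to(device)                        # same effect as predictor.to("cpu")
    print(next(model.parameters()).device)          # -> cpu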
@@ -121,6 +122,7 @@ def preprocess_video_in(video_path, session_state):
             "input_points": [],
             "input_labels": [],
             "inference_state": None,
+            "video_path": None,
         }
     )

@@ -143,6 +145,7 @@ def preprocess_video_in(video_path, session_state):
             "input_points": [],
             "input_labels": [],
             "inference_state": None,
+            "video_path": None,
         }
     )

@@ -178,16 +181,18 @@ def preprocess_video_in(video_path, session_state):
             "input_points": [],
             "input_labels": [],
             "inference_state": None,
+            "video_path": None,
         }
     )

-
+    # Update session state with frames and path
     session_state["first_frame"] = copy.deepcopy(first_frame) # Store a copy
     session_state["all_frames"] = all_frames
+    session_state["video_path"] = video_path # Store the path
     session_state["input_points"] = []
     session_state["input_labels"] = []
-    # Initialize state
-    session_state["inference_state"] = predictor.init_state(video_path=video_path
+    # Initialize state *without* the device argument
+    session_state["inference_state"] = predictor.init_state(video_path=video_path)
     print("Video loaded and predictor state initialized.")

     return [
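Storing video_path in the session dict is what lets later callbacks rebuild the predictor state without touching the upload widget again. A minimal sketch of that idea, where the stub class is hypothetical and init_state(video_path=...) mirrors the call in the hunk above:

    class StubPredictor:
        def init_state(self, video_path):
            # placeholder for SAM2's real inference-state object
            return {"video": video_path, "points": []}

    def load_video(path, state, predictor_like):
        state["video_path"] = path                     # remember where the frames came from
        state["input_points"], state["input_labels"] = [], []
        state["inference_state"] = predictor_like.init_state(video_path=path)  # no device argument
        return state

    print(load_video("clip.mp4", {}, StubPredictor()))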
@@ -213,9 +218,10 @@ def reset(session_state):
     predictor.reset_state(session_state["inference_state"])
     # After reset, we also discard the state object as a new video might be loaded
     session_state["inference_state"] = None
-    # Clear frames
+    # Clear frames and video path
     session_state["first_frame"] = None
     session_state["all_frames"] = None
+    session_state["video_path"] = None

     # Update UI elements to their initial state
     return (
@@ -238,18 +244,19 @@ def clear_points(session_state):
     session_state["input_points"] = []
     session_state["input_labels"] = []

-    #
+    # Reset the predictor state if it exists. This clears internal masks/features
     # but keeps the video context initialized by preprocess_video_in.
     if session_state["inference_state"] is not None:
         predictor.reset_state(session_state["inference_state"])
-        # After resetting the state, we
-        #
-        if
-
+        # After resetting the state, if we still have the video path, re-initialize the state
+        # to be ready for new points on the same video.
+        if session_state["video_path"] is not None:
+            # Re-initialize state *without* the device argument
+            session_state["inference_state"] = predictor.init_state(video_path=session_state["video_path"])
+            print("Predictor state re-initialized after clearing points.")
         else:
-            # This case should ideally not happen if preprocess_video_in ran correctly
             print("Warning: Could not re-initialize state after clear_points (video_path missing).")
-            session_state["inference_state"] = None
+            session_state["inference_state"] = None # Ensure state is None if video_path is gone


     # Re-render the points_map with no points drawn (just the first frame)
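The re-initialization flow added here (reset the state, then rebuild it only if the video path is still known) can be read in isolation as roughly the following sketch; clear_points_state and predictor_like are illustrative names, not the app's API:

    def clear_points_state(state, predictor_like):
        state["input_points"], state["input_labels"] = [], []
        if state.get("inference_state") is not None:
            predictor_like.reset_state(state["inference_state"])
            if state.get("video_path") is not None:
                # fresh state for the same video, ready for new clicks
                state["inference_state"] = predictor_like.init_state(video_path=state["video_path"])
            else:
                state["inference_state"] = None
        return state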
@@ -324,6 +331,7 @@ def segment_with_points(
     points = np.array(session_state["input_points"], dtype=np.float32)
     labels = np.array(session_state["input_labels"], np.int32)

+    # Ensure tensors are on CPU
    points_tensor = torch.tensor(points, dtype=torch.float32, device="cpu").unsqueeze(0) # Add batch dim
    labels_tensor = torch.tensor(labels, dtype=torch.int32, device="cpu").unsqueeze(0) # Add batch dim

@@ -340,6 +348,7 @@ def segment_with_points(

     # Process logits: detach from graph, move to CPU, apply threshold
     # out_mask_logits is [batch_size, H, W] (batch_size=1 here)
+    # out_mask_logits[0] is the tensor for obj_id=OBJ_ID
     mask_tensor = (out_mask_logits[0][0].detach().cpu() > 0.0) # Apply threshold and get the single mask tensor [H, W]
     mask_numpy = mask_tensor.numpy() # Convert to numpy

@@ -363,7 +372,8 @@ def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
     # Ensure mask is a numpy array (and boolean)
     if isinstance(mask, torch.Tensor):
         mask = mask.detach().cpu().numpy() # Ensure it's on CPU and converted to numpy
-
+        # Convert potential float/int mask to boolean mask
+        mask = mask.astype(bool)

     if random_color:
         color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) # RGBA with 0.6 alpha
@@ -374,15 +384,15 @@ def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):

     # Ensure mask has H, W dimensions
     if mask.ndim == 3:
-        mask = mask.squeeze() # Remove singular dimensions
+        mask = mask.squeeze() # Remove singular dimensions like (H, W, 1)
     if mask.ndim != 2:
         print(f"Warning: show_mask received mask with shape {mask.shape}. Expected 2D.")
         # Create an empty transparent image if mask shape is unexpected
+        h, w = mask.shape[:2] if mask.ndim >= 2 else (100, 100) # Use actual shape if possible, otherwise default
         if convert_to_image:
-            return Image.fromarray(np.zeros((
+            return Image.fromarray(np.zeros((h, w, 4), dtype=np.uint8), "RGBA")
         else:
-            return np.zeros((
+            return np.zeros((h, w, 4), dtype=np.uint8)
-

     h, w = mask.shape
     # Create an RGBA image from the mask and color
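Both show_mask hunks reduce to thresholding logits into a boolean mask and turning it into an RGBA overlay. A self-contained example with fake 64x64 logits (sizes and color are arbitrary, not the app's values):

    import numpy as np
    import torch
    from PIL import Image

    logits = torch.randn(1, 1, 64, 64)                  # stand-in for out_mask_logits
    mask = (logits[0][0].detach().cpu() > 0.0).numpy()  # boolean [H, W]
    mask = mask.astype(bool)                            # the explicit cast added above
    color = np.array([1.0, 0.0, 0.0, 0.6])              # red, 60% alpha
    h, w = mask.shape
    overlay = (mask.reshape(h, w, 1) * color.reshape(1, 1, 4) * 255).astype(np.uint8)
    print(Image.fromarray(overlay, "RGBA").size)        # (64, 64)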
@@ -403,7 +413,9 @@ def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):

 # Removed @spaces.GPU decorator
 def propagate_to_all(
-
+    # We don't strictly need video_in path here anymore as it's in session_state,
+    # but keeping it is fine. Accessing session_state["video_path"] is more robust.
+    video_in,
     session_state,
 ):
     """Runs mask propagation through the video and generates the output video."""
@@ -413,6 +425,7 @@ def propagate_to_all(
         len(session_state["input_points"]) == 0 # Need at least one point
         or session_state["all_frames"] is None
         or session_state["inference_state"] is None
+        or session_state["video_path"] is None # Ensure we have the original video path
     ):
         print("Error: Cannot propagate. No points selected, video not loaded, or inference state missing.")
         return (
@@ -424,13 +437,16 @@ def propagate_to_all(
     # The generator yields (frame_idx, obj_ids, mask_logits)
     video_segments = {}
     try:
+        # This loop performs the core tracking prediction frame by frame
         for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
             session_state["inference_state"]
         ):
             # Process logits: detach from graph, move to CPU, convert to numpy boolean mask
             # Ensure tensor is on CPU before converting to numpy
             video_segments[out_frame_idx] = {
-
+                # out_mask_logits is a list of tensors (one per object tracked in this frame)
+                # Each tensor is [batch_size, H, W]. Batch size is 1 here.
+                out_obj_id: (out_mask_logits[i][0].detach().cpu() > 0.0).numpy()
                 for i, out_obj_id in enumerate(out_obj_ids)
             }
             # Optional: print progress
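The comprehension added in this hunk collects one boolean mask per tracked object for each frame. With fake tensors it behaves like this:

    import torch

    out_obj_ids = [0]                                  # single tracked object, like OBJ_ID
    out_mask_logits = [torch.randn(1, 48, 48)]         # one [1, H, W] logits tensor per object
    frame_masks = {
        obj_id: (out_mask_logits[i][0].detach().cpu() > 0.0).numpy()
        for i, obj_id in enumerate(out_obj_ids)
    }
    print({k: (v.shape, v.dtype) for k, v in frame_masks.items()})  # {0: ((48, 48), dtype('bool'))}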
@@ -447,7 +463,8 @@ def propagate_to_all(

     output_frames = []
     # Iterate through all original frames to generate output video
-
+    total_frames = len(session_state["all_frames"])
+    for out_frame_idx in range(total_frames):
         original_frame_rgb = session_state["all_frames"][out_frame_idx]
         # Convert original frame to RGBA for compositing
         transparent_background = Image.fromarray(original_frame_rgb).convert("RGBA")
@@ -471,16 +488,17 @@ def propagate_to_all(


     # Define output path in a temporary directory
+    # Use os.path.join for cross-platform compatibility
     unique_id = datetime.now().strftime("%Y%m%d%H%M%S%f") # Use microseconds for more uniqueness
     final_vid_filename = f"output_video_{unique_id}.mp4"
-    # Use os.path.join for cross-platform compatibility
     final_vid_output_path = os.path.join(tempfile.gettempdir(), final_vid_filename)
     print(f"Output video path: {final_vid_output_path}")


     # Create a video clip from the image sequence
     # Get original FPS or default
-
+    # Get FPS from the stored video path in session state
+    original_fps = get_video_fps(session_state["video_path"])
     fps = original_fps if original_fps is not None and original_fps > 0 else 30 # Default to 30 if detection fails or is zero
     print(f"Creating output video with FPS: {fps}")

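The path and FPS handling in this hunk is plain standard-library logic; a runnable reduction (get_video_fps is defined elsewhere in app.py, so a failed detection is simulated with None):

    import os
    import tempfile
    from datetime import datetime

    unique_id = datetime.now().strftime("%Y%m%d%H%M%S%f")  # microsecond-level uniqueness
    final_vid_output_path = os.path.join(tempfile.gettempdir(), f"output_video_{unique_id}.mp4")

    original_fps = None                                     # e.g. FPS detection failed
    fps = original_fps if original_fps is not None and original_fps > 0 else 30
    print(final_vid_output_path, fps)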
@@ -512,7 +530,7 @@ def propagate_to_all(
         final_vid_output_path,
         codec="libx264",
         fps=fps, # Ensure correct FPS is used during writing
-        preset="medium", # CPU optimization: 'fast', 'faster', 'veryfast' are options for speed
+        preset="medium", # CPU optimization: 'fast', 'faster', 'veryfast' are options for speed vs size
         threads="auto", # CPU optimization: Use multiple cores
         logger=None # Suppress moviepy output
     )
@@ -714,8 +732,8 @@ with gr.Blocks() as demo:
     ).then( # Then, run the propagation function
         fn=propagate_to_all,
         inputs=[
-            video_in, # Get the input video path
-            session_state, # Pass session state (contains frames, points, inference_state)
+            video_in, # Get the input video path (can also get from session_state["video_path"])
+            session_state, # Pass session state (contains frames, points, inference_state, video_path)
         ],
         outputs=[
             output_video, # Update output video player with result